Commit 93c467b2 authored by Peter Eastman's avatar Peter Eastman
Browse files

Merged 5.1Optimizations branch back to trunk

parent f6d4557d
...@@ -61,7 +61,7 @@ using namespace OpenMM; ...@@ -61,7 +61,7 @@ using namespace OpenMM;
using namespace std; using namespace std;
const int CudaContext::ThreadBlockSize = 64; const int CudaContext::ThreadBlockSize = 64;
const int CudaContext::TileSize = 32; const int CudaContext::TileSize = sizeof(tileflags)*8;
bool CudaContext::hasInitializedCuda = false; bool CudaContext::hasInitializedCuda = false;
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler, CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
...@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
src << "typedef float3 mixed3;\n"; src << "typedef float3 mixed3;\n";
src << "typedef float4 mixed4;\n"; src << "typedef float4 mixed4;\n";
} }
src << "typedef unsigned int tileflags;\n";
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) { for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
src << "#define " << iter->first; src << "#define " << iter->first;
if (!iter->second.empty()) if (!iter->second.empty())
......
...@@ -42,6 +42,8 @@ ...@@ -42,6 +42,8 @@
#include "windowsExportCuda.h" #include "windowsExportCuda.h"
#include "CudaPlatform.h" #include "CudaPlatform.h"
typedef unsigned int tileflags;
namespace OpenMM { namespace OpenMM {
class CudaArray; class CudaArray;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. * * Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL), posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL), random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL), ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConvergedMemory(NULL), ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL), vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) { vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
// Create workspace arrays. // Create workspace arrays.
...@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms"); ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms");
ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints"); ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex"); ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, 2*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn"); ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged = CudaArray::create<int>(context, 2, "ccmaConverged");
vector<int2> atomsVec(ccmaAtoms->getSize()); vector<int2> atomsVec(ccmaAtoms->getSize());
vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize()); vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize()); vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
...@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() { ...@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete ccmaDelta1; delete ccmaDelta1;
if (ccmaDelta2 != NULL) if (ccmaDelta2 != NULL)
delete ccmaDelta2; delete ccmaDelta2;
if (ccmaConvergedMemory != NULL) if (ccmaConverged != NULL)
cuMemFreeHost(ccmaConvergedMemory); delete ccmaConverged;
if (vsite2AvgAtoms != NULL) if (vsite2AvgAtoms != NULL)
delete vsite2AvgAtoms; delete vsite2AvgAtoms;
if (vsite2AvgWeights != NULL) if (vsite2AvgWeights != NULL)
...@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double ...@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
context.executeKernel(shakeKernel, args, shakeAtoms->getSize()); context.executeKernel(shakeKernel, args, shakeAtoms->getSize());
} }
if (ccmaAtoms != NULL) { if (ccmaAtoms != NULL) {
void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection}; void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged->getDevicePointer()};
context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize()); context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize());
int i; int i;
void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(), constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
&ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConvergedDeviceMemory, &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(),
tolPointer, &i}; tolPointer, &i};
void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(), void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
&ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConvergedDeviceMemory, &i}; &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i};
void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(), void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(),
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(), constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
&context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(), &context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
&ccmaConvergedDeviceMemory, &i}; &ccmaConverged->getDevicePointer(), &i};
const int checkInterval = 4; const int checkInterval = 4;
int* converged = (int*) context.getPinnedBuffer();
for (i = 0; i < 150; i++) { for (i = 0; i < 150; i++) {
if (i == 0) {
ccmaConvergedMemory[0] = 1;
ccmaConvergedMemory[1] = 0;
}
context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize()); context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize());
if ((i+1)%checkInterval == 0) if ((i+1)%checkInterval == 0) {
ccmaConverged->download(converged, false);
CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA"); CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
}
context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize()); context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize());
context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms()); context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
if ((i+1)%checkInterval == 0) { if ((i+1)%checkInterval == 0) {
CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA"); CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
if (ccmaConvergedMemory[i%2]) if (converged[i%2])
break; break;
} }
} }
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. * * Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -140,8 +140,7 @@ private: ...@@ -140,8 +140,7 @@ private:
CudaArray* ccmaConstraintMatrixValue; CudaArray* ccmaConstraintMatrixValue;
CudaArray* ccmaDelta1; CudaArray* ccmaDelta1;
CudaArray* ccmaDelta2; CudaArray* ccmaDelta2;
int* ccmaConvergedMemory; CudaArray* ccmaConverged;
CUdeviceptr ccmaConvergedDeviceMemory;
CUevent ccmaEvent; CUevent ccmaEvent;
CudaArray* vsite2AvgAtoms; CudaArray* vsite2AvgAtoms;
CudaArray* vsite2AvgWeights; CudaArray* vsite2AvgWeights;
......
This diff is collapsed.
...@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel { ...@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public: public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform), CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL), cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
} }
~CudaCalcNonbondedForceKernel(); ~CudaCalcNonbondedForceKernel();
/** /**
...@@ -607,8 +606,6 @@ private: ...@@ -607,8 +606,6 @@ private:
CudaArray* pmeBsplineModuliX; CudaArray* pmeBsplineModuliX;
CudaArray* pmeBsplineModuliY; CudaArray* pmeBsplineModuliY;
CudaArray* pmeBsplineModuliZ; CudaArray* pmeBsplineModuliZ;
CudaArray* pmeBsplineTheta;
CudaArray* pmeBsplineDTheta;
CudaArray* pmeAtomRange; CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex; CudaArray* pmeAtomGridIndex;
CudaSort* sort; CudaSort* sort;
...@@ -617,9 +614,6 @@ private: ...@@ -617,9 +614,6 @@ private:
CUfunction ewaldSumsKernel; CUfunction ewaldSumsKernel;
CUfunction ewaldForcesKernel; CUfunction ewaldForcesKernel;
CUfunction pmeGridIndexKernel; CUfunction pmeGridIndexKernel;
CUfunction pmeAtomRangeKernel;
CUfunction pmeZIndexKernel;
CUfunction pmeUpdateBsplinesKernel;
CUfunction pmeSpreadChargeKernel; CUfunction pmeSpreadChargeKernel;
CUfunction pmeFinishSpreadChargeKernel; CUfunction pmeFinishSpreadChargeKernel;
CUfunction pmeEvalEnergyKernel; CUfunction pmeEvalEnergyKernel;
...@@ -776,6 +770,8 @@ private: ...@@ -776,6 +770,8 @@ private:
System& system; System& system;
CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel; CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs; std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs;
std::string pairValueSrc, pairEnergySrc;
std::map<std::string, std::string> pairValueDefines, pairEnergyDefines;
}; };
/** /**
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. * * Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -36,6 +36,8 @@ ...@@ -36,6 +36,8 @@
namespace OpenMM { namespace OpenMM {
class CudaSort;
/** /**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two * This class provides a generic interface for calculating nonbonded interactions. It does this in two
* ways. First, it can be used to create kernels that evaluate nonbonded interactions. Clients * ways. First, it can be used to create kernels that evaluate nonbonded interactions. Clients
...@@ -181,10 +183,10 @@ public: ...@@ -181,10 +183,10 @@ public:
return *interactingTiles; return *interactingTiles;
} }
/** /**
* Get the array containing flags for tiles with interactions. * Get the array containing the atoms in each tile with interactions.
*/ */
CudaArray& getInteractionFlags() { CudaArray& getInteractingAtoms() {
return *interactionFlags; return *interactingAtoms;
} }
/** /**
* Get the array containing exclusion flags. * Get the array containing exclusion flags.
...@@ -192,6 +194,12 @@ public: ...@@ -192,6 +194,12 @@ public:
CudaArray& getExclusions() { CudaArray& getExclusions() {
return *exclusions; return *exclusions;
} }
/**
* Get the array containing tiles with exclusions.
*/
CudaArray& getExclusionTiles() {
return *exclusionTiles;
}
/** /**
* Get the array containing the index into the exclusion array for each tile. * Get the array containing the index into the exclusion array for each tile.
*/ */
...@@ -217,9 +225,17 @@ public: ...@@ -217,9 +225,17 @@ public:
return numTiles; return numTiles;
} }
/** /**
* Set the range of tiles that should be processed by this context. * Set whether to add padding to the cutoff distance when building the neighbor list.
* This increases the size of the neighbor list (and thus the cost of computing interactions),
* but also means we don't need to rebuild it every time step. The default value is true,
* since usually this improves performance. For very expensive interactions, however,
* it may be better to set this to false.
*/
void setUsePadding(bool padding);
/**
* Set the range of atom blocks and tiles that should be processed by this context.
*/ */
void setTileRange(int startTileIndex, int numTiles); void setAtomBlockRange(double startFraction, double endFraction);
/** /**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions * Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use * are assumed to be the same as those for the default interaction Kernel, since this kernel will use
...@@ -232,42 +248,38 @@ public: ...@@ -232,42 +248,38 @@ public:
* @param isSymmetric specifies whether the interaction is symmetric * @param isSymmetric specifies whether the interaction is symmetric
*/ */
CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric); CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
/**
* This is a utility routine for locating data in the exclusions array. It takes the (x,y) indices of a tile,
* and returns the location in the array where the data for that tile begins.
*
* This routine requires that x >= y. If not, it will throw an exception.
*
* @param x the x index of the tile
* @param y the y index of the tile
* @param exclusionIndices the content of the exclusionIndices array
* @param exclusionRowIndices the content of the exclusionRowIndices array
* @return the index in the exclusions array at which the data for that tile begins
*/
static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
private: private:
class BlockSortTrait;
CudaContext& context; CudaContext& context;
CUfunction forceKernel; CUfunction forceKernel;
CUfunction findBlockBoundsKernel; CUfunction findBlockBoundsKernel;
CUfunction sortBoxDataKernel;
CUfunction findInteractingBlocksKernel; CUfunction findInteractingBlocksKernel;
CUfunction findInteractionsWithinBlocksKernel; CUfunction findInteractionsWithinBlocksKernel;
CudaArray* exclusionTiles;
CudaArray* exclusions; CudaArray* exclusions;
CudaArray* exclusionIndices; CudaArray* exclusionIndices;
CudaArray* exclusionRowIndices; CudaArray* exclusionRowIndices;
CudaArray* interactingTiles; CudaArray* interactingTiles;
CudaArray* interactionFlags; CudaArray* interactingAtoms;
CudaArray* interactionCount; CudaArray* interactionCount;
CudaArray* blockCenter; CudaArray* blockCenter;
CudaArray* blockBoundingBox; CudaArray* blockBoundingBox;
std::vector<void*> forceArgs, findBlockBoundsArgs, findInteractingBlocksArgs, findInteractionsWithinBlocksArgs; CudaArray* sortedBlocks;
CudaArray* sortedBlockCenter;
CudaArray* sortedBlockBoundingBox;
CudaArray* oldPositions;
CudaArray* rebuildNeighborList;
CudaSort* blockSorter;
std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs;
std::vector<std::vector<int> > atomExclusions; std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters; std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments; std::vector<ParameterInfo> arguments;
std::string kernelSource; std::string kernelSource;
std::map<std::string, std::string> kernelDefines; std::map<std::string, std::string> kernelDefines;
double cutoff; double cutoff;
bool useCutoff, usePeriodic, anyExclusions; bool useCutoff, usePeriodic, anyExclusions, usePadding;
int startTileIndex, numTiles, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms; int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
}; };
/** /**
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. * * Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -118,7 +118,7 @@ private: ...@@ -118,7 +118,7 @@ private:
}; };
CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) : CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()), contextForces(NULL), CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), contextForces(NULL),
pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) { pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++) for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i]))); kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
...@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
sumKernel = cu.getKernel(module, "sumForces"); sumKernel = cu.getKernel(module, "sumForces");
for (int i = 0; i < (int) kernels.size(); i++) for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system); getKernel(i).initialize(system);
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
} }
void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) { void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
...@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con ...@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
void* args[] = {&cu.getForce().getDevicePointer(), &contextForces->getDevicePointer(), &bufferSize, &numBuffers}; void* args[] = {&cu.getForce().getDevicePointer(), &contextForces->getDevicePointer(), &bufferSize, &numBuffers};
cu.executeKernel(sumKernel, args, bufferSize); cu.executeKernel(sumKernel, args, bufferSize);
// Balance work between the contexts by transferring a few nonbonded tiles from the context that // Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first. // finished last to the one that finished first.
int firstIndex = 0, lastIndex = 0; int firstIndex = 0, lastIndex = 0;
int totalTiles = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) { for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex]) if (completionTimes[i] < completionTimes[firstIndex])
firstIndex = i; firstIndex = i;
if (completionTimes[i] > completionTimes[lastIndex]) if (completionTimes[i] > completionTimes[lastIndex])
lastIndex = i; lastIndex = i;
contextTiles[i] = data.contexts[i]->getNonbondedUtilities().getNumTiles(); }
totalTiles += contextTiles[i]; double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
} contextNonbondedFractions[firstIndex] += fractionToTransfer;
int tilesToTransfer = totalTiles/1000; contextNonbondedFractions[lastIndex] -= fractionToTransfer;
if (tilesToTransfer < 1) double startFraction = 0.0;
tilesToTransfer = 1; for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
if (tilesToTransfer > contextTiles[lastIndex]) double endFraction = startFraction+contextNonbondedFractions[i];
tilesToTransfer = contextTiles[lastIndex]; if (i == contextNonbondedFractions.size()-1)
contextTiles[firstIndex] += tilesToTransfer; endFraction = 1.0; // Avoid roundoff error
contextTiles[lastIndex] -= tilesToTransfer; data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
int startIndex = 0; startFraction = endFraction;
for (int i = 0; i < (int) contextTiles.size(); i++) {
data.contexts[i]->getNonbondedUtilities().setTileRange(startIndex, contextTiles[i]);
startIndex += contextTiles[i];
} }
} }
return energy; return energy;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. * * Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -80,7 +80,7 @@ private: ...@@ -80,7 +80,7 @@ private:
CudaPlatform::PlatformData& data; CudaPlatform::PlatformData& data;
std::vector<Kernel> kernels; std::vector<Kernel> kernels;
std::vector<long long> completionTimes; std::vector<long long> completionTimes;
std::vector<int> contextTiles; std::vector<double> contextNonbondedFractions;
CudaArray* contextForces; CudaArray* contextForces;
void* pinnedPositionBuffer; void* pinnedPositionBuffer;
long long* pinnedForceBuffer; long long* pinnedForceBuffer;
......
...@@ -32,7 +32,7 @@ using namespace OpenMM; ...@@ -32,7 +32,7 @@ using namespace OpenMM;
using namespace std; using namespace std;
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) { dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
// Create kernels. // Create kernels.
map<string, string> replacements; map<string, string> replacements;
...@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) ...@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements["MAX_KEY"] = trait->getMaxKey(); replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue(); replacements["MAX_VALUE"] = trait->getMaxValue();
CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements)); CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
shortListKernel = context.getKernel(module, "sortShortList");
computeRangeKernel = context.getKernel(module, "computeRange"); computeRangeKernel = context.getKernel(module, "computeRange");
assignElementsKernel = context.getKernel(module, "assignElementsToBuckets"); assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions"); computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
...@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) ...@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int maxBlockSize; int maxBlockSize;
cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice()); cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
int maxSharedMem;
cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
isShortList = (length <= maxLocalBuffer);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2) for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
; ;
positionsKernelSize = rangeKernelSize; positionsKernelSize = rangeKernelSize;
sortKernelSize = rangeKernelSize/2; sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
if (rangeKernelSize > length) if (rangeKernelSize > length)
rangeKernelSize = length; rangeKernelSize = length;
int maxSharedMem;
cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
if (sortKernelSize > maxLocalBuffer) if (sortKernelSize > maxLocalBuffer)
sortKernelSize = maxLocalBuffer; sortKernelSize = maxLocalBuffer;
unsigned int targetBucketSize = sortKernelSize/2; unsigned int targetBucketSize = sortKernelSize/2;
...@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) ...@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
// Create workspace arrays. // Create workspace arrays.
if (!isShortList) {
dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange"); dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange");
bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset"); bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset");
bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement"); bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement");
offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket"); offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket");
buckets = new CudaArray(context, length, trait->getDataSize(), "buckets"); buckets = new CudaArray(context, length, trait->getDataSize(), "buckets");
}
} }
CudaSort::~CudaSort() { CudaSort::~CudaSort() {
...@@ -95,22 +99,27 @@ CudaSort::~CudaSort() { ...@@ -95,22 +99,27 @@ CudaSort::~CudaSort() {
} }
void CudaSort::sort(CudaArray& data) { void CudaSort::sort(CudaArray& data) {
if (data.getSize() != bucketOfElement->getSize() || data.getElementSize() != trait->getDataSize()) if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
throw OpenMMException("CudaSort called with different data size"); throw OpenMMException("CudaSort called with different data size");
if (data.getSize() == 0) if (data.getSize() == 0)
return; return;
if (isShortList) {
// We can use a simpler sort kernel that does the entire operation at once in local memory.
void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
}
else {
// Compute the range of data values. // Compute the range of data values.
unsigned int dataSize = data.getSize(); void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer()};
void* rangeArgs[] = {&data.getDevicePointer(), &dataSize, &dataRange->getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize()); context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets. // Assign array elements to buckets.
unsigned int numBuckets = bucketOffset->getSize(); unsigned int numBuckets = bucketOffset->getSize();
context.clearBuffer(*bucketOffset); context.clearBuffer(*bucketOffset);
void* elementsArgs[] = {&data.getDevicePointer(), &dataSize, &numBuckets, &dataRange->getDevicePointer(), void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(),
&bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()}; &bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize()); context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());
...@@ -121,7 +130,7 @@ void CudaSort::sort(CudaArray& data) { ...@@ -121,7 +130,7 @@ void CudaSort::sort(CudaArray& data) {
// Copy the data into the buckets. // Copy the data into the buckets.
void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataSize, &bucketOffset->getDevicePointer(), void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataLength, &bucketOffset->getDevicePointer(),
&bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()}; &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize()); context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
...@@ -129,4 +138,5 @@ void CudaSort::sort(CudaArray& data) { ...@@ -129,4 +138,5 @@ void CudaSort::sort(CudaArray& data) {
void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()}; void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize()); context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
} }
...@@ -92,8 +92,9 @@ private: ...@@ -92,8 +92,9 @@ private:
CudaArray* offsetInBucket; CudaArray* offsetInBucket;
CudaArray* bucketOffset; CudaArray* bucketOffset;
CudaArray* buckets; CudaArray* buckets;
CUfunction computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel; CUfunction shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize; unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList;
}; };
/** /**
......
#if USE_EWALD #if USE_EWALD
bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS; bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if (!isExcluded || needCorrection) { if (!isExcluded || needCorrection) {
real tempForce = 0.0f;
if (r2 < CUTOFF_SQUARED || needCorrection) { if (r2 < CUTOFF_SQUARED || needCorrection) {
const real alphaR = EWALD_ALPHA*r; const real alphaR = EWALD_ALPHA*r;
const real expAlphaRSqr = EXP(-alphaR*alphaR); const real expAlphaRSqr = EXP(-alphaR*alphaR);
...@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) { ...@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
t *= t; t *= t;
t *= t; t *= t;
const real erfcAlphaR = RECIP(t*t); const real erfcAlphaR = RECIP(t*t);
real tempForce = 0.0f;
if (needCorrection) { if (needCorrection) {
// Subtract off the part of this interaction that was included in the reciprocal space contribution. // Subtract off the part of this interaction that was included in the reciprocal space contribution.
...@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) { ...@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
tempEnergy += prefactor*erfcAlphaR; tempEnergy += prefactor*erfcAlphaR;
#endif #endif
} }
}
dEdR += tempForce*invR*invR; dEdR += tempForce*invR*invR;
}
} }
#else #else
{ {
......
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000))); #define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000))); #define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
#define TILE_SIZE 32
typedef struct { typedef struct {
real4 posq; real4 posq;
...@@ -15,88 +14,43 @@ typedef struct { ...@@ -15,88 +14,43 @@ typedef struct {
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices, const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
const unsigned int* __restrict__ exclusionRowIndices,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
unsigned int numTiles = interactionCount[0]; const unsigned int tbx = threadIdx.x - tgx;
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
real energy = 0; real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData localData[THREAD_BLOCK_SIZE]; __shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
do { // First loop: process tiles that contain exclusions.
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int tbx = threadIdx.x - tgx; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
unsigned int x, y; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real3 force = make_real3(0); real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (tgx < 2) unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif #endif
if (pos >= end) if (x == y) {
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x; const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1; localData[localAtomIndex].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j; int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq; real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
...@@ -115,6 +69,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -115,6 +69,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
...@@ -136,32 +93,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -136,32 +93,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// This is an off-diagonal tile. // This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x; const unsigned int localAtomIndex = threadIdx.x;
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
localData[localAtomIndex].posq = posq[j]; localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[localAtomIndex].force = make_real3(0); localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) {
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq; real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
...@@ -180,6 +121,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -180,6 +121,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+tj;
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
...@@ -203,27 +147,216 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -203,27 +147,216 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
}
}
lasty = y;
// Write results. // Write results.
if (pos < end) { unsigned int offset = x*TILE_SIZE + tgx;
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
} if (x != y) {
if (pos < end && x != y) { offset = y*TILE_SIZE + tgx;
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
} }
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
int pos = warp*numTiles/totalWarps;
int end = (warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
const bool isExcluded = false;
real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj;
localData[atom2].force.x += delta.x;
localData[atom2].force.y += delta.y;
localData[atom2].force.z += delta.z;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj;
localData[atom2].force.x += delta.x;
localData[atom2].force.y += delta.y;
localData[atom2].force.z += delta.z;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
unsigned int offset = atom1;
STORE_DERIVATIVES_1
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
offset = atom2;
STORE_DERIVATIVES_2
}
}
pos++; pos++;
} while (pos < end); }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
} }
#define TILE_SIZE 32
typedef struct { typedef struct {
real4 posq; real4 posq;
real value, temp; real value, temp;
...@@ -13,86 +11,41 @@ typedef struct { ...@@ -13,86 +11,41 @@ typedef struct {
* Compute a value based on pair interactions. * Compute a value based on pair interactions.
*/ */
extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, unsigned long long* __restrict__ global_value, const ushort2* __restrict__ exclusionTiles, unsigned long long* __restrict__ global_value,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx; const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; __shared__ AtomData localData[THREAD_BLOCK_SIZE];
unsigned int x, y;
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real value = 0; real value = 0;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (tgx < 2) unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif #endif
if (pos >= end) if (x == y) {
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x; const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1; localData[localAtomIndex].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j; int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq; real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
...@@ -112,7 +65,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -112,7 +65,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
real tempValue1 = 0; real tempValue1 = 0;
real tempValue2 = 0; real tempValue2 = 0;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded && atom1 != atom2) {
#else #else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif #endif
...@@ -130,25 +84,17 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -130,25 +84,17 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
localData[threadIdx.x].posq = posq[j];
const unsigned int localAtomIndex = threadIdx.x; const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} localData[localAtomIndex].value = 0;
localData[threadIdx.x].value = 0; #ifdef USE_EXCLUSIONS
#ifdef USE_CUTOFF excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); #endif
if (!hasExclusions && flags != 0xFFFFFFFF) { unsigned int tj = tgx;
if (flags == 0) { for (j = 0; j < TILE_SIZE; j++) {
// No interactions in this tile. int atom2 = tbx+tj;
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq; real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -157,44 +103,162 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -157,44 +103,162 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real tempValue1 = 0; #ifdef USE_CUTOFF
real tempValue2 = 0;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+tj;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
COMPUTE_VALUE COMPUTE_VALUE
} }
value += tempValue1; value += tempValue1;
localData[tbx+tj].value += tempValue2;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
} }
localData[threadIdx.x].temp = tempValue2;
// Sum the forces on atom2. // Write results.
if (tgx % 4 == 0) unsigned int offset = x*TILE_SIZE + tgx;
localData[threadIdx.x].temp += localData[threadIdx.x+1].temp+localData[threadIdx.x+2].temp+localData[threadIdx.x+3].temp; atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000)));
if (tgx == 0) if (x != y) {
localData[tbx+j].value += localData[threadIdx.x].temp+localData[threadIdx.x+4].temp+localData[threadIdx.x+8].temp+localData[threadIdx.x+12].temp+localData[threadIdx.x+16].temp+localData[threadIdx.x+20].temp+localData[threadIdx.x+24].temp+localData[threadIdx.x+28].temp; offset = y*TILE_SIZE + tgx;
} atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
} }
} }
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
int pos = warp*numTiles/totalWarps;
int end = (warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
real value = 0;
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
} }
else else
#endif #endif
{ {
// Compute the full set of interactions in this tile. y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#ifdef USE_EXCLUSIONS // Skip over tiles that have exclusions, since they were already processed.
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].value = 0;
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS int atom2 = tbx+tj;
bool isExcluded = !(excl & 0x1); real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real tempValue1 = 0;
real tempValue2 = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
}
value += tempValue1;
localData[tbx+tj].value += tempValue2;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
else
#endif #endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq; real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
...@@ -210,41 +274,32 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -210,41 +274,32 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj; atom2 = atomIndices[tbx+tj];
real tempValue1 = 0; real tempValue1 = 0;
real tempValue2 = 0; real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
COMPUTE_VALUE COMPUTE_VALUE
} }
value += tempValue1; value += tempValue1;
localData[tbx+tj].value += tempValue2; localData[tbx+tj].value += tempValue2;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif #endif
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
}
}
// Write results. // Write results.
if (pos < end) { atomicAdd(&global_value[atom1], static_cast<unsigned long long>((long long) (value*0x100000000)));
const unsigned int offset = x*TILE_SIZE + tgx; #ifdef USE_CUTOFF
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000))); unsigned int atom2 = atomIndices[threadIdx.x];
} #else
if (pos < end && x != y) { unsigned int atom2 = y*TILE_SIZE + tgx;
const unsigned int offset = y*TILE_SIZE + tgx; #endif
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000))); if (atom2 < PADDED_NUM_ATOMS)
atomicAdd(&global_value[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
} }
lasty = y;
pos++; pos++;
} while (pos < end); }
} }
...@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) { ...@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {
real3 crossProduct = cross(vec1, vec2); real3 crossProduct = cross(vec1, vec2);
real scale = vec1.w*vec2.w; real scale = vec1.w*vec2.w;
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale)); angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0.0f) if (cosine < 0.0f)
angle = M_PI-angle; angle = M_PI-angle;
} }
else else
angle = acos(cosine); angle = ACOS(cosine);
return angle; return angle;
} }
......
...@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf ...@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
for (int atom = 0; atom < NUM_ATOMS; atom++) { for (int atom = 0; atom < NUM_ATOMS; atom++) {
real4 apos = posq[atom]; real4 apos = posq[atom];
real phase = apos.x*kx; real phase = apos.x*kx;
real2 structureFactor = make_real2(cos(phase), sin(phase)); real2 structureFactor = make_real2(COS(phase), SIN(phase));
phase = apos.y*ky; phase = apos.y*ky;
structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase))); structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
phase = apos.z*kz; phase = apos.z*kz;
structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase))); structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
sum += apos.w*structureFactor; sum += apos.w*structureFactor;
} }
cosSinSum[index] = sum; cosSinSum[index] = sum;
...@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__ ...@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
for (int ry = lowry; ry < KMAX_Y; ry++) { for (int ry = lowry; ry < KMAX_Y; ry++) {
real ky = ry*reciprocalBoxSize.y; real ky = ry*reciprocalBoxSize.y;
real phase = apos.x*kx; real phase = apos.x*kx;
real2 tab_xy = make_real2(cos(phase), sin(phase)); real2 tab_xy = make_real2(COS(phase), SIN(phase));
phase = apos.y*ky; phase = apos.y*ky;
tab_xy = multofReal2(tab_xy, make_real2(cos(phase), sin(phase))); tab_xy = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
for (int rz = lowrz; rz < KMAX_Z; rz++) { for (int rz = lowrz; rz < KMAX_Z; rz++) {
real kz = rz*reciprocalBoxSize.z; real kz = rz*reciprocalBoxSize.z;
...@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__ ...@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
real k2 = kx*kx + ky*ky + kz*kz; real k2 = kx*kx + ky*ky + kz*kz;
real ak = EXP(k2*EXP_COEFFICIENT)/k2; real ak = EXP(k2*EXP_COEFFICIENT)/k2;
phase = apos.z*kz; phase = apos.z*kz;
real2 structureFactor = multofReal2(tab_xy, make_real2(cos(phase), sin(phase))); real2 structureFactor = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
real2 sum = cosSinSum[index]; real2 sum = cosSinSum[index];
real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x); real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
force.x += dEdR*kx; force.x += dEdR*kx;
......
This diff is collapsed.
...@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri ...@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x1 = sqrt(-2.0f * log(x1)); x1 = SQRT(-2.0f * LOG(x1));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
state.w = m; state.w = m;
carry = k >> 30; carry = k >> 30;
float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff; float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.x = x1 * cos(2.0f * 3.14159265f * x2); value.x = x1 * COS(2.0f * 3.14159265f * x2);
// Generate second value. // Generate second value.
...@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri ...@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x3 = sqrt(-2.0f * log(x3)); x3 = SQRT(-2.0f * LOG(x3));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
state.w = m; state.w = m;
carry = k >> 30; carry = k >> 30;
float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff; float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.y = x3 * cos(2.0f * 3.14159265f * x4); value.y = x3 * COS(2.0f * 3.14159265f * x4);
// Generate third value. // Generate third value.
...@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri ...@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x5 = sqrt(-2.0f * log(x5)); x5 = SQRT(-2.0f * LOG(x5));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
state.w = m; state.w = m;
carry = k >> 30; carry = k >> 30;
float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff; float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.z = x5 * cos(2.0f * 3.14159265f * x6); value.z = x5 * COS(2.0f * 3.14159265f * x6);
// Generate fourth value. // Generate fourth value.
...@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri ...@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x7 = sqrt(-2.0f * log(x7)); x7 = SQRT(-2.0f * LOG(x7));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
state.w = m; state.w = m;
carry = k >> 30; carry = k >> 30;
float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff; float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.w = x7 * cos(2.0f * 3.14159265f * x8); value.w = x7 * COS(2.0f * 3.14159265f * x8);
// Record the values. // Record the values.
...@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd; mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd; mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd); mixed axlng = SQRT(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd); mixed aylng = SQRT(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd); mixed azlng = SQRT(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
mixed trns11 = xaksXd / axlng; mixed trns11 = xaksXd / axlng;
mixed trns21 = yaksXd / axlng; mixed trns21 = yaksXd / axlng;
mixed trns31 = zaksXd / axlng; mixed trns31 = zaksXd / axlng;
...@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
// --- Step2 A2' --- // --- Step2 A2' ---
float rc = 0.5f*params.y; float rc = 0.5f*params.y;
mixed rb = sqrt(params.x*params.x-rc*rc); mixed rb = SQRT(params.x*params.x-rc*rc);
mixed ra = rb*(m1+m2)*invTotalMass; mixed ra = rb*(m1+m2)*invTotalMass;
rb -= ra; rb -= ra;
mixed sinphi = za1d/ra; mixed sinphi = za1d/ra;
mixed cosphi = sqrt(1-sinphi*sinphi); mixed cosphi = SQRT(1-sinphi*sinphi);
mixed sinpsi = (zb1d-zc1d) / (2*rc*cosphi); mixed sinpsi = (zb1d-zc1d) / (2*rc*cosphi);
mixed cospsi = sqrt(1-sinpsi*sinpsi); mixed cospsi = SQRT(1-sinpsi*sinpsi);
mixed ya2d = ra*cosphi; mixed ya2d = ra*cosphi;
mixed xb2d = - rc*cospsi; mixed xb2d = - rc*cospsi;
...@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi; mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
mixed xb2d2 = xb2d*xb2d; mixed xb2d2 = xb2d*xb2d;
mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d); mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y); mixed deltx = 2.0f*xb2d + SQRT(4.0f*xb2d2 - hh2 + params.y*params.y);
xb2d -= deltx*0.5f; xb2d -= deltx*0.5f;
// --- Step3 al,be,ga --- // --- Step3 al,be,ga ---
...@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d; mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
mixed al2be2 = alpha*alpha + beta*beta; mixed al2be2 = alpha*alpha + beta*beta;
mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2; mixed sintheta = (alpha*gamma - beta*SQRT(al2be2 - gamma*gamma)) / al2be2;
// --- Step4 A3' --- // --- Step4 A3' ---
mixed costheta = sqrt(1-sintheta*sintheta); mixed costheta = SQRT(1-sintheta*sintheta);
mixed xa3d = - ya2d*sintheta; mixed xa3d = - ya2d*sintheta;
mixed ya3d = ya2d*costheta; mixed ya3d = ya2d*costheta;
mixed za3d = za1d; mixed za3d = za1d;
...@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c ...@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
mixed3 eAB = make_mixed3(apos1.x-apos0.x, apos1.y-apos0.y, apos1.z-apos0.z); mixed3 eAB = make_mixed3(apos1.x-apos0.x, apos1.y-apos0.y, apos1.z-apos0.z);
mixed3 eBC = make_mixed3(apos2.x-apos1.x, apos2.y-apos1.y, apos2.z-apos1.z); mixed3 eBC = make_mixed3(apos2.x-apos1.x, apos2.y-apos1.y, apos2.z-apos1.z);
mixed3 eCA = make_mixed3(apos0.x-apos2.x, apos0.y-apos2.y, apos0.z-apos2.z); mixed3 eCA = make_mixed3(apos0.x-apos2.x, apos0.y-apos2.y, apos0.z-apos2.z);
eAB *= rsqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z); eAB *= RSQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC *= rsqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z); eBC *= RSQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA *= rsqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z); eCA *= RSQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z; mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z; mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z; mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
...@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c ...@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
/** /**
* Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation. * Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/ */
extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance, const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection) { extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance,
const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection, int* __restrict__ converged) {
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
// Compute the direction for this constraint. // Compute the direction for this constraint.
...@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric ...@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
dir.z = oldPos1.z-oldPos2.z; dir.z = oldPos1.z-oldPos2.z;
constraintDistance[index] = dir; constraintDistance[index] = dir;
} }
if (threadIdx.x == 0 && blockIdx.x == 0) {
converged[0] = 1;
converged[1] = 0;
}
} }
/** /**
...@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest ...@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
__syncthreads(); __syncthreads();
mixed lowerTol = 1-2*tol+tol*tol; mixed lowerTol = 1-2*tol+tol*tol;
mixed upperTol = 1+2*tol+tol*tol; mixed upperTol = 1+2*tol+tol*tol;
bool threadConverged = true;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
// Compute the force due to this constraint. // Compute the force due to this constraint.
...@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest ...@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
mixed dist2 = dir.w*dir.w; mixed dist2 = dir.w*dir.w;
mixed diff = dist2 - rp2; mixed diff = dist2 - rp2;
delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f); delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
threadConverged &= (rp2 > lowerTol*dist2 && rp2 < upperTol*dist2);
// See whether it has converged. }
if (groupConverged && !threadConverged)
if (groupConverged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2)) {
groupConverged = 0; groupConverged = 0;
__syncthreads();
if (threadIdx.x == 0 && !groupConverged)
converged[iteration%2] = 0; converged[iteration%2] = 0;
}
}
} }
/** /**
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment