Commit b33ee3b0 authored by peastman's avatar peastman
Browse files

More conversion of OpenCLArrays

parent d59b0373
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2016 Stanford University and the Authors. *
* Portions copyright (c) 2011-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -81,7 +81,6 @@ namespace OpenMM {
class OPENMM_EXPORT_OPENCL OpenCLBondedUtilities {
public:
OpenCLBondedUtilities(OpenCLContext& context);
~OpenCLBondedUtilities();
/**
* Add a bonded interaction.
*
......@@ -143,8 +142,8 @@ private:
std::vector<std::vector<int> > forceSets;
std::vector<cl::Memory*> arguments;
std::vector<std::string> argTypes;
std::vector<OpenCLArray*> atomIndices;
std::vector<OpenCLArray*> bufferIndices;
std::vector<OpenCLArray> atomIndices;
std::vector<OpenCLArray> bufferIndices;
std::vector<std::string> prefixCode;
std::vector<std::string> energyParameterDerivatives;
int numForceBuffers, maxBonds, allGroups;
......
......@@ -33,11 +33,10 @@ namespace OpenMM {
class OPENMM_EXPORT_OPENCL OpenCLCompact {
public:
OpenCLCompact(OpenCLContext& context);
~OpenCLCompact();
void compactStream(OpenCLArray& dOut, OpenCLArray& dIn, OpenCLArray& dValid, OpenCLArray& numValid);
private:
OpenCLContext& context;
OpenCLArray* dgBlockCounts;
OpenCLArray dgBlockCounts;
cl::Kernel countKernel;
cl::Kernel moveValidKernel;
};
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2017 Stanford University and the Authors. *
* Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -42,25 +42,24 @@ namespace OpenMM {
class OPENMM_EXPORT_OPENCL OpenCLIntegrationUtilities {
public:
OpenCLIntegrationUtilities(OpenCLContext& context, const System& system);
~OpenCLIntegrationUtilities();
/**
* Get the array which contains position deltas.
*/
OpenCLArray& getPosDelta() {
return *posDelta;
return posDelta;
}
/**
* Get the array which contains random values. Each element is a float4, whose components
* are independent, normally distributed random numbers with mean 0 and variance 1.
*/
OpenCLArray& getRandom() {
return *random;
return random;
}
/**
* Get the array which contains the current step size.
*/
OpenCLArray& getStepSize() {
return *stepSize;
return stepSize;
}
/**
* Set the size to use for the next step.
......@@ -131,36 +130,36 @@ private:
cl::Kernel ccmaPosUpdateKernel, ccmaVelUpdateKernel;
cl::Kernel vsitePositionKernel, vsiteForceKernel, vsiteAddForcesKernel;
cl::Kernel randomKernel, timeShiftKernel;
OpenCLArray* posDelta;
OpenCLArray* settleAtoms;
OpenCLArray* settleParams;
OpenCLArray* shakeAtoms;
OpenCLArray* shakeParams;
OpenCLArray* random;
OpenCLArray* randomSeed;
OpenCLArray* stepSize;
OpenCLArray* ccmaAtoms;
OpenCLArray* ccmaDistance;
OpenCLArray* ccmaReducedMass;
OpenCLArray* ccmaAtomConstraints;
OpenCLArray* ccmaNumAtomConstraints;
OpenCLArray* ccmaConstraintMatrixColumn;
OpenCLArray* ccmaConstraintMatrixValue;
OpenCLArray* ccmaDelta1;
OpenCLArray* ccmaDelta2;
OpenCLArray* ccmaConverged;
OpenCLArray* ccmaConvergedHostBuffer;
OpenCLArray* vsite2AvgAtoms;
OpenCLArray* vsite2AvgWeights;
OpenCLArray* vsite3AvgAtoms;
OpenCLArray* vsite3AvgWeights;
OpenCLArray* vsiteOutOfPlaneAtoms;
OpenCLArray* vsiteOutOfPlaneWeights;
OpenCLArray* vsiteLocalCoordsIndex;
OpenCLArray* vsiteLocalCoordsAtoms;
OpenCLArray* vsiteLocalCoordsWeights;
OpenCLArray* vsiteLocalCoordsPos;
OpenCLArray* vsiteLocalCoordsStartIndex;
OpenCLArray posDelta;
OpenCLArray settleAtoms;
OpenCLArray settleParams;
OpenCLArray shakeAtoms;
OpenCLArray shakeParams;
OpenCLArray random;
OpenCLArray randomSeed;
OpenCLArray stepSize;
OpenCLArray ccmaAtoms;
OpenCLArray ccmaDistance;
OpenCLArray ccmaReducedMass;
OpenCLArray ccmaAtomConstraints;
OpenCLArray ccmaNumAtomConstraints;
OpenCLArray ccmaConstraintMatrixColumn;
OpenCLArray ccmaConstraintMatrixValue;
OpenCLArray ccmaDelta1;
OpenCLArray ccmaDelta2;
OpenCLArray ccmaConverged;
OpenCLArray ccmaConvergedHostBuffer;
OpenCLArray vsite2AvgAtoms;
OpenCLArray vsite2AvgWeights;
OpenCLArray vsite3AvgAtoms;
OpenCLArray vsite3AvgWeights;
OpenCLArray vsiteOutOfPlaneAtoms;
OpenCLArray vsiteOutOfPlaneWeights;
OpenCLArray vsiteLocalCoordsIndex;
OpenCLArray vsiteLocalCoordsAtoms;
OpenCLArray vsiteLocalCoordsWeights;
OpenCLArray vsiteLocalCoordsPos;
OpenCLArray vsiteLocalCoordsStartIndex;
int randomPos;
int lastSeed, numVsites;
bool hasInitializedPosConstraintKernels, hasInitializedVelConstraintKernels, ccmaUseDirectBuffer, hasOverlappingVsites;
......
......@@ -1132,12 +1132,8 @@ private:
class OpenCLCalcGayBerneForceKernel : public CalcGayBerneForceKernel {
public:
OpenCLCalcGayBerneForceKernel(std::string name, const Platform& platform, OpenCLContext& cl) : CalcGayBerneForceKernel(name, platform), cl(cl),
hasInitializedKernels(false), sortedParticles(NULL), axisParticleIndices(NULL), sigParams(NULL), epsParams(NULL), scale(NULL), exceptionParticles(NULL),
exceptionParams(NULL), aMatrix(NULL),
bMatrix(NULL), gMatrix(NULL), exclusions(NULL), exclusionStartIndex(NULL), blockCenter(NULL), blockBoundingBox(NULL), neighbors(NULL),
neighborIndex(NULL), neighborBlockCount(NULL), sortedPos(NULL), torque(NULL) {
hasInitializedKernels(false) {
}
~OpenCLCalcGayBerneForceKernel();
/**
* Initialize the kernel.
*
......@@ -1169,25 +1165,25 @@ private:
bool hasInitializedKernels;
int numRealParticles, maxNeighborBlocks;
GayBerneForce::NonbondedMethod nonbondedMethod;
OpenCLArray* sortedParticles;
OpenCLArray* axisParticleIndices;
OpenCLArray* sigParams;
OpenCLArray* epsParams;
OpenCLArray* scale;
OpenCLArray* exceptionParticles;
OpenCLArray* exceptionParams;
OpenCLArray* aMatrix;
OpenCLArray* bMatrix;
OpenCLArray* gMatrix;
OpenCLArray* exclusions;
OpenCLArray* exclusionStartIndex;
OpenCLArray* blockCenter;
OpenCLArray* blockBoundingBox;
OpenCLArray* neighbors;
OpenCLArray* neighborIndex;
OpenCLArray* neighborBlockCount;
OpenCLArray* sortedPos;
OpenCLArray* torque;
OpenCLArray sortedParticles;
OpenCLArray axisParticleIndices;
OpenCLArray sigParams;
OpenCLArray epsParams;
OpenCLArray scale;
OpenCLArray exceptionParticles;
OpenCLArray exceptionParams;
OpenCLArray aMatrix;
OpenCLArray bMatrix;
OpenCLArray gMatrix;
OpenCLArray exclusions;
OpenCLArray exclusionStartIndex;
OpenCLArray blockCenter;
OpenCLArray blockBoundingBox;
OpenCLArray neighbors;
OpenCLArray neighborIndex;
OpenCLArray neighborBlockCount;
OpenCLArray sortedPos;
OpenCLArray torque;
std::vector<bool> isRealParticle;
std::vector<std::pair<int, int> > exceptionAtoms;
std::vector<std::pair<int, int> > excludedPairs;
......@@ -1200,9 +1196,8 @@ private:
class OpenCLCalcCustomCVForceKernel : public CalcCustomCVForceKernel {
public:
OpenCLCalcCustomCVForceKernel(std::string name, const Platform& platform, OpenCLContext& cl) : CalcCustomCVForceKernel(name, platform),
cl(cl), hasInitializedKernels(false), invAtomOrder(NULL), innerInvAtomOrder(NULL) {
cl(cl), hasInitializedKernels(false) {
}
~OpenCLCalcCustomCVForceKernel();
/**
* Initialize the kernel.
*
......@@ -1236,9 +1231,9 @@ private:
std::vector<std::string> variableNames, paramDerivNames, globalParameterNames;
std::vector<Lepton::ExpressionProgram> variableDerivExpressions;
std::vector<Lepton::ExpressionProgram> paramDerivExpressions;
std::vector<OpenCLArray*> cvForces;
OpenCLArray* invAtomOrder;
OpenCLArray* innerInvAtomOrder;
std::vector<OpenCLArray> cvForces;
OpenCLArray invAtomOrder;
OpenCLArray innerInvAtomOrder;
cl::Kernel copyStateKernel, copyForcesKernel, addForcesKernel;
};
......@@ -1247,10 +1242,8 @@ private:
*/
class OpenCLCalcRMSDForceKernel : public CalcRMSDForceKernel {
public:
OpenCLCalcRMSDForceKernel(std::string name, const Platform& platform, OpenCLContext& cl) : CalcRMSDForceKernel(name, platform),
cl(cl), referencePos(NULL), particles(NULL), buffer(NULL) {
OpenCLCalcRMSDForceKernel(std::string name, const Platform& platform, OpenCLContext& cl) : CalcRMSDForceKernel(name, platform), cl(cl) {
}
~OpenCLCalcRMSDForceKernel();
/**
* Initialize the kernel.
*
......@@ -1289,9 +1282,9 @@ private:
OpenCLContext& cl;
ForceInfo* info;
double sumNormRef;
OpenCLArray* referencePos;
OpenCLArray* particles;
OpenCLArray* buffer;
OpenCLArray referencePos;
OpenCLArray particles;
OpenCLArray buffer;
cl::Kernel kernel1, kernel2;
};
......@@ -1337,9 +1330,8 @@ private:
class OpenCLIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
public:
OpenCLIntegrateLangevinStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) : IntegrateLangevinStepKernel(name, platform), cl(cl),
hasInitializedKernels(false), params(NULL) {
hasInitializedKernels(false) {
}
~OpenCLIntegrateLangevinStepKernel();
/**
* Initialize the kernel, setting up the particle masses.
*
......@@ -1365,7 +1357,7 @@ private:
OpenCLContext& cl;
double prevTemp, prevFriction, prevStepSize;
bool hasInitializedKernels;
OpenCLArray* params;
OpenCLArray params;
cl::Kernel kernel1, kernel2;
};
......@@ -1451,9 +1443,8 @@ private:
class OpenCLIntegrateVariableLangevinStepKernel : public IntegrateVariableLangevinStepKernel {
public:
OpenCLIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) : IntegrateVariableLangevinStepKernel(name, platform), cl(cl),
hasInitializedKernels(false), params(NULL) {
hasInitializedKernels(false) {
}
~OpenCLIntegrateVariableLangevinStepKernel();
/**
* Initialize the kernel, setting up the particle masses.
*
......@@ -1481,7 +1472,7 @@ private:
OpenCLContext& cl;
bool hasInitializedKernels;
int blockSize;
OpenCLArray* params;
OpenCLArray params;
cl::Kernel kernel1, kernel2, selectSizeKernel;
double prevTemp, prevFriction, prevErrorTol;
};
......@@ -1493,8 +1484,7 @@ class OpenCLIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
public:
enum GlobalTargetType {DT, VARIABLE, PARAMETER};
OpenCLIntegrateCustomStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) : IntegrateCustomStepKernel(name, platform), cl(cl),
hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), sumBuffer(NULL), summedValue(NULL), uniformRandoms(NULL),
randomSeed(NULL), perDofEnergyParamDerivs(NULL), perDofValues(NULL), needsEnergyParamDerivs(false) {
hasInitializedKernels(false), localValuesAreCurrent(false), perDofValues(NULL), needsEnergyParamDerivs(false) {
}
~OpenCLIntegrateCustomStepKernel();
/**
......@@ -1575,15 +1565,15 @@ private:
int numGlobalVariables, sumWorkGroupSize;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
mutable bool localValuesAreCurrent;
OpenCLArray* globalValues;
OpenCLArray* sumBuffer;
OpenCLArray* summedValue;
OpenCLArray* uniformRandoms;
OpenCLArray* randomSeed;
OpenCLArray* perDofEnergyParamDerivs;
std::vector<OpenCLArray*> tabulatedFunctions;
OpenCLArray globalValues;
OpenCLArray sumBuffer;
OpenCLArray summedValue;
OpenCLArray uniformRandoms;
OpenCLArray randomSeed;
OpenCLArray perDofEnergyParamDerivs;
std::vector<OpenCLArray> tabulatedFunctions;
std::map<int, double> savedEnergy;
std::map<int, OpenCLArray*> savedForces;
std::map<int, OpenCLArray> savedForces;
std::set<int> validSavedForces;
OpenCLParameterSet* perDofValues;
mutable std::vector<std::vector<cl_float> > localPerDofValuesFloat;
......@@ -1635,9 +1625,8 @@ public:
class OpenCLApplyAndersenThermostatKernel : public ApplyAndersenThermostatKernel {
public:
OpenCLApplyAndersenThermostatKernel(std::string name, const Platform& platform, OpenCLContext& cl) : ApplyAndersenThermostatKernel(name, platform), cl(cl),
hasInitializedKernels(false), atomGroups(NULL) {
hasInitializedKernels(false) {
}
~OpenCLApplyAndersenThermostatKernel();
/**
* Initialize the kernel.
*
......@@ -1655,7 +1644,7 @@ private:
OpenCLContext& cl;
bool hasInitializedKernels;
int randomSeed;
OpenCLArray* atomGroups;
OpenCLArray atomGroups;
cl::Kernel kernel;
};
......@@ -1665,9 +1654,8 @@ private:
class OpenCLApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
public:
OpenCLApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, OpenCLContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl),
hasInitializedKernels(false), savedPositions(NULL), savedForces(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
hasInitializedKernels(false) {
}
~OpenCLApplyMonteCarloBarostatKernel();
/**
* Initialize the kernel.
*
......@@ -1699,10 +1687,10 @@ private:
OpenCLContext& cl;
bool hasInitializedKernels;
int numMolecules;
OpenCLArray* savedPositions;
OpenCLArray* savedForces;
OpenCLArray* moleculeAtoms;
OpenCLArray* moleculeStartIndex;
OpenCLArray savedPositions;
OpenCLArray savedForces;
OpenCLArray moleculeAtoms;
OpenCLArray moleculeStartIndex;
cl::Kernel kernel;
std::vector<int> lastAtomOrder;
};
......@@ -1712,9 +1700,8 @@ private:
*/
class OpenCLRemoveCMMotionKernel : public RemoveCMMotionKernel {
public:
OpenCLRemoveCMMotionKernel(std::string name, const Platform& platform, OpenCLContext& cl) : RemoveCMMotionKernel(name, platform), cl(cl), cmMomentum(NULL) {
OpenCLRemoveCMMotionKernel(std::string name, const Platform& platform, OpenCLContext& cl) : RemoveCMMotionKernel(name, platform), cl(cl) {
}
~OpenCLRemoveCMMotionKernel();
/**
* Initialize the kernel, setting up the particle masses.
*
......@@ -1731,7 +1718,7 @@ public:
private:
OpenCLContext& cl;
int frequency;
OpenCLArray* cmMomentum;
OpenCLArray cmMomentum;
cl::Kernel kernel1, kernel2;
};
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2016 Stanford University and the Authors. *
* Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -175,55 +175,55 @@ public:
* Get the array containing the center of each atom block.
*/
OpenCLArray& getBlockCenters() {
return *blockCenter;
return blockCenter;
}
/**
* Get the array containing the dimensions of each atom block.
*/
OpenCLArray& getBlockBoundingBoxes() {
return *blockBoundingBox;
return blockBoundingBox;
}
/**
* Get the array whose first element contains the number of tiles with interactions.
*/
OpenCLArray& getInteractionCount() {
return *interactionCount;
return interactionCount;
}
/**
* Get the array containing tiles with interactions.
*/
OpenCLArray& getInteractingTiles() {
return *interactingTiles;
return interactingTiles;
}
/**
* Get the array containing the atoms in each tile with interactions.
*/
OpenCLArray& getInteractingAtoms() {
return *interactingAtoms;
return interactingAtoms;
}
/**
* Get the array containing exclusion flags.
*/
OpenCLArray& getExclusions() {
return *exclusions;
return exclusions;
}
/**
* Get the array containing tiles with exclusions.
*/
OpenCLArray& getExclusionTiles() {
return *exclusionTiles;
return exclusionTiles;
}
/**
* Get the array containing the index into the exclusion array for each tile.
*/
OpenCLArray& getExclusionIndices() {
return *exclusionIndices;
return exclusionIndices;
}
/**
* Get the array listing where the exclusion data starts for each row.
*/
OpenCLArray& getExclusionRowIndices() {
return *exclusionRowIndices;
return exclusionRowIndices;
}
/**
* Get the index of the first tile this context is responsible for processing.
......@@ -275,20 +275,20 @@ private:
class BlockSortTrait;
OpenCLContext& context;
std::map<int, KernelSet> groupKernels;
OpenCLArray* exclusionTiles;
OpenCLArray* exclusions;
OpenCLArray* exclusionIndices;
OpenCLArray* exclusionRowIndices;
OpenCLArray* interactingTiles;
OpenCLArray* interactingAtoms;
OpenCLArray* interactionCount;
OpenCLArray* blockCenter;
OpenCLArray* blockBoundingBox;
OpenCLArray* sortedBlocks;
OpenCLArray* sortedBlockCenter;
OpenCLArray* sortedBlockBoundingBox;
OpenCLArray* oldPositions;
OpenCLArray* rebuildNeighborList;
OpenCLArray exclusionTiles;
OpenCLArray exclusions;
OpenCLArray exclusionIndices;
OpenCLArray exclusionRowIndices;
OpenCLArray interactingTiles;
OpenCLArray interactingAtoms;
OpenCLArray interactionCount;
OpenCLArray blockCenter;
OpenCLArray blockBoundingBox;
OpenCLArray sortedBlocks;
OpenCLArray sortedBlockCenter;
OpenCLArray sortedBlockBoundingBox;
OpenCLArray oldPositions;
OpenCLArray rebuildNeighborList;
OpenCLSort* blockSorter;
cl::Event downloadCountEvent;
cl::Buffer* pinnedCountBuffer;
......
......@@ -84,7 +84,7 @@ private:
std::vector<long long> completionTimes;
std::vector<double> contextNonbondedFractions;
std::vector<int> tileCounts;
OpenCLArray* contextForces;
OpenCLArray contextForces;
cl::Buffer* pinnedPositionBuffer;
cl::Buffer* pinnedForceBuffer;
void* pinnedPositionMemory;
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2013 Stanford University and the Authors. *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -87,11 +87,11 @@ public:
private:
OpenCLContext& context;
SortTrait* trait;
OpenCLArray* dataRange;
OpenCLArray* bucketOfElement;
OpenCLArray* offsetInBucket;
OpenCLArray* bucketOffset;
OpenCLArray* buckets;
OpenCLArray dataRange;
OpenCLArray bucketOfElement;
OpenCLArray offsetInBucket;
OpenCLArray bucketOffset;
OpenCLArray buckets;
cl::Kernel shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2016 Stanford University and the Authors. *
* Portions copyright (c) 2011-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -36,13 +36,6 @@ using namespace std;
OpenCLBondedUtilities::OpenCLBondedUtilities(OpenCLContext& context) : context(context), numForceBuffers(0), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
}
OpenCLBondedUtilities::~OpenCLBondedUtilities() {
for (int i = 0; i < (int) atomIndices.size(); i++)
delete atomIndices[i];
for (int i = 0; i < (int) bufferIndices.size(); i++)
delete bufferIndices[i];
}
void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
if (atoms.size() > 0) {
forceAtoms.push_back(atoms);
......@@ -92,6 +85,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
vector<vector<cl_uint> > bufferVec(numForces);
vector<vector<int> > bufferCounter(numForces, vector<int>(system.getNumParticles(), 0));
vector<int> numBuffers(numForces, 0);
atomIndices.resize(numForces);
for (int i = 0; i < numForces; i++) {
int numBonds = forceAtoms[i].size();
int numAtoms = forceAtoms[i][0].size();
......@@ -101,9 +95,8 @@ void OpenCLBondedUtilities::initialize(const System& system) {
for (int atom = 0; atom < numAtoms; atom++)
indexVec[bond*width+atom] = forceAtoms[i][bond][atom];
}
OpenCLArray* indices = OpenCLArray::create<cl_uint>(context, indexVec.size(), "bondedIndices");
indices->upload(indexVec);
atomIndices.push_back(indices);
atomIndices[i].initialize<cl_uint>(context, indexVec.size(), "bondedIndices");
atomIndices[i].upload(indexVec);
bufferVec[i].resize(width*numBonds, 0);
for (int bond = 0; bond < numBonds; bond++) {
for (int atom = 0; atom < numAtoms; atom++)
......@@ -177,9 +170,8 @@ void OpenCLBondedUtilities::initialize(const System& system) {
for (int bond = 0; bond < numBonds; bond++)
for (int atom = 0; atom < numAtoms; atom++)
bufferVec[force][bond*width+atom] += bufferCounter[forceSets[i][k]][forceAtoms[force][bond][atom]];
OpenCLArray* buffers = OpenCLArray::create<cl_uint>(context, bufferVec[force].size(), "bondedBufferIndices");
buffers->upload(bufferVec[force]);
bufferIndices[force] = buffers;
bufferIndices[force].initialize<cl_uint>(context, bufferVec[force].size(), "bondedBufferIndices");
bufferIndices[force].upload(bufferVec[force]);
}
// Create the kernels.
......@@ -291,8 +283,8 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
index += 6;
for (int j = 0; j < (int) forceSets[i].size(); j++) {
kernel.setArg<cl::Buffer>(index++, atomIndices[forceSets[i][j]]->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, bufferIndices[forceSets[i][j]]->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, atomIndices[forceSets[i][j]].getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, bufferIndices[forceSets[i][j]].getDeviceBuffer());
}
for (int j = 0; j < (int) arguments.size(); j++)
kernel.setArg<cl::Memory>(index++, *arguments[j]);
......
......@@ -29,18 +29,13 @@
using namespace OpenMM;
OpenCLCompact::OpenCLCompact(OpenCLContext& context) : context(context), dgBlockCounts(NULL) {
dgBlockCounts = OpenCLArray::create<cl_uint>(context, context.getNumThreadBlocks(), "dgBlockCounts");
OpenCLCompact::OpenCLCompact(OpenCLContext& context) : context(context) {
dgBlockCounts.initialize<cl_uint>(context, context.getNumThreadBlocks(), "dgBlockCounts");
cl::Program program = context.createProgram(OpenCLKernelSources::compact);
countKernel = cl::Kernel(program, "countElts");
moveValidKernel = cl::Kernel(program, "moveValidElementsStaged");
}
OpenCLCompact::~OpenCLCompact() {
if (dgBlockCounts != NULL)
delete dgBlockCounts;
}
void OpenCLCompact::compactStream(OpenCLArray& dOut, OpenCLArray& dIn, OpenCLArray& dValid, OpenCLArray& numValid) {
// Figure out # elements per block
unsigned int len = dIn.getSize();
......@@ -51,7 +46,7 @@ void OpenCLCompact::compactStream(OpenCLArray& dOut, OpenCLArray& dIn, OpenCLArr
// TODO: implement loop over blocks of 10M
// Phase 1: Calculate number of valid elements per thread block
countKernel.setArg<cl::Buffer>(0, dgBlockCounts->getDeviceBuffer());
countKernel.setArg<cl::Buffer>(0, dgBlockCounts.getDeviceBuffer());
countKernel.setArg<cl::Buffer>(1, dValid.getDeviceBuffer());
countKernel.setArg<cl_uint>(2, len);
countKernel.setArg(3, 128*sizeof(cl_uint), NULL);
......@@ -61,7 +56,7 @@ void OpenCLCompact::compactStream(OpenCLArray& dOut, OpenCLArray& dIn, OpenCLArr
moveValidKernel.setArg<cl::Buffer>(0, dIn.getDeviceBuffer());
moveValidKernel.setArg<cl::Buffer>(1, dOut.getDeviceBuffer());
moveValidKernel.setArg<cl::Buffer>(2, dValid.getDeviceBuffer());
moveValidKernel.setArg<cl::Buffer>(3, dgBlockCounts->getDeviceBuffer());
moveValidKernel.setArg<cl::Buffer>(3, dgBlockCounts.getDeviceBuffer());
moveValidKernel.setArg<cl_uint>(4, len);
moveValidKernel.setArg<cl::Buffer>(5, numValid.getDeviceBuffer());
moveValidKernel.setArg(6, 128*sizeof(cl_uint), NULL);
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2017 Stanford University and the Authors. *
* Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -25,7 +25,6 @@
* -------------------------------------------------------------------------- */
#include "OpenCLIntegrationUtilities.h"
#include "OpenCLArray.h"
#include "OpenCLKernelSources.h"
#include "openmm/internal/OSRngSeed.h"
#include "openmm/HarmonicAngleForce.h"
......@@ -97,31 +96,24 @@ static void setPosqCorrectionArg(OpenCLContext& cl, cl::Kernel& kernel, int inde
}
OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, const System& system) : context(context),
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedHostBuffer(NULL),
vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsIndex(NULL), vsiteLocalCoordsAtoms(NULL),
vsiteLocalCoordsWeights(NULL), vsiteLocalCoordsPos(NULL), vsiteLocalCoordsStartIndex(NULL),
hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false), hasOverlappingVsites(false) {
randomPos(0), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false), hasOverlappingVsites(false) {
// Create workspace arrays.
lastStepSize = mm_double2(0.0, 0.0);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
posDelta = OpenCLArray::create<mm_double4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<mm_double4> deltas(posDelta->getSize(), mm_double4(0.0, 0.0, 0.0, 0.0));
posDelta->upload(deltas);
stepSize = OpenCLArray::create<mm_double2>(context, 1, "stepSize");
stepSize->upload(&lastStepSize);
posDelta.initialize<mm_double4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<mm_double4> deltas(posDelta.getSize(), mm_double4(0.0, 0.0, 0.0, 0.0));
posDelta.upload(deltas);
stepSize.initialize<mm_double2>(context, 1, "stepSize");
stepSize.upload(&lastStepSize);
}
else {
posDelta = OpenCLArray::create<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<mm_float4> deltas(posDelta->getSize(), mm_float4(0.0f, 0.0f, 0.0f, 0.0f));
posDelta->upload(deltas);
stepSize = OpenCLArray::create<mm_float2>(context, 1, "stepSize");
posDelta.initialize<mm_float4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<mm_float4> deltas(posDelta.getSize(), mm_float4(0.0f, 0.0f, 0.0f, 0.0f));
posDelta.upload(deltas);
stepSize.initialize<mm_float2>(context, 1, "stepSize");
mm_float2 lastStepSizeFloat = mm_float2(0.0f, 0.0f);
stepSize->upload(&lastStepSizeFloat);
stepSize.upload(&lastStepSizeFloat);
}
// Create the time shift kernel for calculating kinetic energy.
......@@ -227,10 +219,10 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
isShakeAtom[atom3] = true;
}
if (atoms.size() > 0) {
settleAtoms = OpenCLArray::create<mm_int4>(context, atoms.size(), "settleAtoms");
settleParams = OpenCLArray::create<mm_float2>(context, params.size(), "settleParams");
settleAtoms->upload(atoms);
settleParams->upload(params);
settleAtoms.initialize<mm_int4>(context, atoms.size(), "settleAtoms");
settleParams.initialize<mm_float2>(context, params.size(), "settleParams");
settleAtoms.upload(atoms);
settleParams.upload(params);
}
}
......@@ -310,10 +302,10 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
isShakeAtom[cluster.peripheralID[2]] = true;
++index;
}
shakeAtoms = OpenCLArray::create<mm_int4>(context, atoms.size(), "shakeAtoms");
shakeParams = OpenCLArray::create<mm_float4>(context, params.size(), "shakeParams");
shakeAtoms->upload(atoms);
shakeParams->upload(params);
shakeAtoms.initialize<mm_int4>(context, atoms.size(), "shakeAtoms");
shakeParams.initialize<mm_float4>(context, params.size(), "shakeParams");
shakeAtoms.upload(atoms);
shakeParams.upload(params);
}
// Find connected constraints for CCMA.
......@@ -390,28 +382,28 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
// Record the CCMA data structures.
ccmaAtoms = OpenCLArray::create<mm_int2>(context, numCCMA, "CcmaAtoms");
ccmaAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
ccmaConvergedHostBuffer = OpenCLArray::create<cl_int>(context, 1, "CcmaConvergedHostBuffer", CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
ccmaAtoms.initialize<mm_int2>(context, numCCMA, "CcmaAtoms");
ccmaAtomConstraints.initialize<cl_int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints.initialize<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaConstraintMatrixColumn.initialize<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged.initialize<cl_int>(context, 2, "CcmaConverged");
ccmaConvergedHostBuffer.initialize<cl_int>(context, 1, "CcmaConvergedHostBuffer", CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR);
// Different communication mechanisms give optimal performance on AMD and on NVIDIA.
string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
ccmaUseDirectBuffer = (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.");
vector<mm_int2> atomsVec(ccmaAtoms->getSize());
vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
vector<cl_int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize());
vector<mm_int2> atomsVec(ccmaAtoms.getSize());
vector<cl_int> atomConstraintsVec(ccmaAtomConstraints.getSize());
vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints.getSize());
vector<cl_int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn.getSize());
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
ccmaDistance = OpenCLArray::create<mm_double4>(context, numCCMA, "CcmaDistance");
ccmaDelta1 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaDelta2");
ccmaReducedMass = OpenCLArray::create<cl_double>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_double4> distanceVec(ccmaDistance->getSize());
vector<cl_double> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_double> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
ccmaDistance.initialize<mm_double4>(context, numCCMA, "CcmaDistance");
ccmaDelta1.initialize<cl_double>(context, numCCMA, "CcmaDelta1");
ccmaDelta2.initialize<cl_double>(context, numCCMA, "CcmaDelta2");
ccmaReducedMass.initialize<cl_double>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue.initialize<cl_double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_double4> distanceVec(ccmaDistance.getSize());
vector<cl_double> reducedMassVec(ccmaReducedMass.getSize());
vector<cl_double> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
int c = ccmaConstraints[index];
......@@ -432,19 +424,19 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
}
ccmaDistance->upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
ccmaDistance.upload(distanceVec);
ccmaReducedMass.upload(reducedMassVec);
ccmaConstraintMatrixValue.upload(constraintMatrixValueVec);
}
else {
ccmaDistance = OpenCLArray::create<mm_float4>(context, numCCMA, "CcmaDistance");
ccmaDelta1 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaDelta2");
ccmaReducedMass = OpenCLArray::create<cl_float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue = OpenCLArray::create<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_float4> distanceVec(ccmaDistance->getSize());
vector<cl_float> reducedMassVec(ccmaReducedMass->getSize());
vector<cl_float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize());
ccmaDistance.initialize<mm_float4>(context, numCCMA, "CcmaDistance");
ccmaDelta1.initialize<cl_float>(context, numCCMA, "CcmaDelta1");
ccmaDelta2.initialize<cl_float>(context, numCCMA, "CcmaDelta2");
ccmaReducedMass.initialize<cl_float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue.initialize<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<mm_float4> distanceVec(ccmaDistance.getSize());
vector<cl_float> reducedMassVec(ccmaReducedMass.getSize());
vector<cl_float> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
int c = ccmaConstraints[index];
......@@ -465,14 +457,14 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
}
ccmaDistance->upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec);
ccmaDistance.upload(distanceVec);
ccmaReducedMass.upload(reducedMassVec);
ccmaConstraintMatrixValue.upload(constraintMatrixValueVec);
}
ccmaAtoms->upload(atomsVec);
ccmaAtomConstraints->upload(atomConstraintsVec);
ccmaNumAtomConstraints->upload(numAtomConstraintsVec);
ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec);
ccmaAtoms.upload(atomsVec);
ccmaAtomConstraints.upload(atomConstraintsVec);
ccmaNumAtomConstraints.upload(numAtomConstraintsVec);
ccmaConstraintMatrixColumn.upload(constraintMatrixColumnVec);
// Create the CCMA kernels.
......@@ -554,73 +546,73 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
int numLocalCoords = vsiteLocalCoordsPosVec.size();
numVsites = num2Avg+num3Avg+numOutOfPlane+numLocalCoords;
vsite2AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
vsite3AvgAtoms = OpenCLArray::create<mm_int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
vsiteOutOfPlaneAtoms = OpenCLArray::create<mm_int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
vsiteLocalCoordsIndex = OpenCLArray::create<cl_int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
vsiteLocalCoordsAtoms = OpenCLArray::create<cl_int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
vsiteLocalCoordsStartIndex = OpenCLArray::create<cl_int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
vsite2AvgAtoms.initialize<mm_int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
vsite3AvgAtoms.initialize<mm_int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
vsiteOutOfPlaneAtoms.initialize<mm_int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
vsiteLocalCoordsIndex.initialize<cl_int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
vsiteLocalCoordsAtoms.initialize<cl_int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
vsiteLocalCoordsStartIndex.initialize<cl_int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
if (num2Avg > 0)
vsite2AvgAtoms->upload(vsite2AvgAtomVec);
vsite2AvgAtoms.upload(vsite2AvgAtomVec);
if (num3Avg > 0)
vsite3AvgAtoms->upload(vsite3AvgAtomVec);
vsite3AvgAtoms.upload(vsite3AvgAtomVec);
if (numOutOfPlane > 0)
vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec);
vsiteOutOfPlaneAtoms.upload(vsiteOutOfPlaneAtomVec);
if (numLocalCoords > 0) {
vsiteLocalCoordsIndex->upload(vsiteLocalCoordsIndexVec);
vsiteLocalCoordsAtoms->upload(vsiteLocalCoordsAtomVec);
vsiteLocalCoordsStartIndex->upload(vsiteLocalCoordsStartVec);
vsiteLocalCoordsIndex.upload(vsiteLocalCoordsIndexVec);
vsiteLocalCoordsAtoms.upload(vsiteLocalCoordsAtomVec);
vsiteLocalCoordsStartIndex.upload(vsiteLocalCoordsStartVec);
}
if (context.getUseDoublePrecision()) {
vsite2AvgWeights = OpenCLArray::create<mm_double2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = OpenCLArray::create<mm_double4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = OpenCLArray::create<mm_double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
vsiteLocalCoordsWeights = OpenCLArray::create<cl_double>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
vsiteLocalCoordsPos = OpenCLArray::create<mm_double4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
vsite2AvgWeights.initialize<mm_double2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights.initialize<mm_double4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights.initialize<mm_double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
vsiteLocalCoordsWeights.initialize<cl_double>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
vsiteLocalCoordsPos.initialize<mm_double4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
if (num2Avg > 0)
vsite2AvgWeights->upload(vsite2AvgWeightVec);
vsite2AvgWeights.upload(vsite2AvgWeightVec);
if (num3Avg > 0)
vsite3AvgWeights->upload(vsite3AvgWeightVec);
vsite3AvgWeights.upload(vsite3AvgWeightVec);
if (numOutOfPlane > 0)
vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec);
vsiteOutOfPlaneWeights.upload(vsiteOutOfPlaneWeightVec);
if (numLocalCoords > 0) {
vsiteLocalCoordsWeights->upload(vsiteLocalCoordsWeightVec);
vsiteLocalCoordsPos->upload(vsiteLocalCoordsPosVec);
vsiteLocalCoordsWeights.upload(vsiteLocalCoordsWeightVec);
vsiteLocalCoordsPos.upload(vsiteLocalCoordsPosVec);
}
}
else {
vsite2AvgWeights = OpenCLArray::create<mm_float2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = OpenCLArray::create<mm_float4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = OpenCLArray::create<mm_float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
vsiteLocalCoordsWeights = OpenCLArray::create<cl_float>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
vsiteLocalCoordsPos = OpenCLArray::create<mm_float4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
vsite2AvgWeights.initialize<mm_float2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights.initialize<mm_float4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights.initialize<mm_float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
vsiteLocalCoordsWeights.initialize<cl_float>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
vsiteLocalCoordsPos.initialize<mm_float4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
if (num2Avg > 0) {
vector<mm_float2> floatWeights(num2Avg);
for (int i = 0; i < num2Avg; i++)
floatWeights[i] = mm_float2((float) vsite2AvgWeightVec[i].x, (float) vsite2AvgWeightVec[i].y);
vsite2AvgWeights->upload(floatWeights);
vsite2AvgWeights.upload(floatWeights);
}
if (num3Avg > 0) {
vector<mm_float4> floatWeights(num3Avg);
for (int i = 0; i < num3Avg; i++)
floatWeights[i] = mm_float4((float) vsite3AvgWeightVec[i].x, (float) vsite3AvgWeightVec[i].y, (float) vsite3AvgWeightVec[i].z, 0.0f);
vsite3AvgWeights->upload(floatWeights);
vsite3AvgWeights.upload(floatWeights);
}
if (numOutOfPlane > 0) {
vector<mm_float4> floatWeights(numOutOfPlane);
for (int i = 0; i < numOutOfPlane; i++)
floatWeights[i] = mm_float4((float) vsiteOutOfPlaneWeightVec[i].x, (float) vsiteOutOfPlaneWeightVec[i].y, (float) vsiteOutOfPlaneWeightVec[i].z, 0.0f);
vsiteOutOfPlaneWeights->upload(floatWeights);
vsiteOutOfPlaneWeights.upload(floatWeights);
}
if (numLocalCoords > 0) {
vector<cl_float> floatWeights(vsiteLocalCoordsWeightVec.size());
for (int i = 0; i < (int) vsiteLocalCoordsWeightVec.size(); i++)
floatWeights[i] = (cl_float) vsiteLocalCoordsWeightVec[i];
vsiteLocalCoordsWeights->upload(floatWeights);
vsiteLocalCoordsWeights.upload(floatWeights);
vector<mm_float4> floatPos(vsiteLocalCoordsPosVec.size());
for (int i = 0; i < (int) vsiteLocalCoordsPosVec.size(); i++)
floatPos[i] = mm_float4((float) vsiteLocalCoordsPosVec[i].x, (float) vsiteLocalCoordsPosVec[i].y, (float) vsiteLocalCoordsPosVec[i].z, 0.0f);
vsiteLocalCoordsPos->upload(floatPos);
vsiteLocalCoordsPos.upload(floatPos);
}
}
......@@ -655,17 +647,17 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
vsitePositionKernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
if (context.getUseMixedPrecision())
vsitePositionKernel.setArg<cl::Buffer>(index++, context.getPosqCorrection().getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite2AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite2AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite3AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite3AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsIndex->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsPos->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsStartIndex->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite2AvgAtoms.getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite2AvgWeights.getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite3AvgAtoms.getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite3AvgWeights.getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneAtoms.getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneWeights.getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsIndex.getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsAtoms.getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsWeights.getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsPos.getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsStartIndex.getDeviceBuffer());
vsiteForceKernel = cl::Kernel(vsiteProgram, "distributeForces");
index = 0;
vsiteForceKernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
......@@ -674,102 +666,39 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
index++; // Skip argument 2: the force array hasn't been created yet.
if (context.getUseMixedPrecision())
vsiteForceKernel.setArg<cl::Buffer>(index++, context.getPosqCorrection().getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite2AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite2AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite3AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite3AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsIndex->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsPos->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsStartIndex->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite2AvgAtoms.getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite2AvgWeights.getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite3AvgAtoms.getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite3AvgWeights.getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneAtoms.getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneWeights.getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsIndex.getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsAtoms.getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsWeights.getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsPos.getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteLocalCoordsStartIndex.getDeviceBuffer());
if (hasOverlappingVsites && context.getSupports64BitGlobalAtomics())
vsiteAddForcesKernel = cl::Kernel(vsiteProgram, "addDistributedForces");
}
OpenCLIntegrationUtilities::~OpenCLIntegrationUtilities() {
if (posDelta != NULL)
delete posDelta;
if (settleAtoms != NULL)
delete settleAtoms;
if (settleParams != NULL)
delete settleParams;
if (shakeAtoms != NULL)
delete shakeAtoms;
if (shakeParams != NULL)
delete shakeParams;
if (random != NULL)
delete random;
if (randomSeed != NULL)
delete randomSeed;
if (stepSize != NULL)
delete stepSize;
if (ccmaAtoms != NULL)
delete ccmaAtoms;
if (ccmaDistance != NULL)
delete ccmaDistance;
if (ccmaReducedMass != NULL)
delete ccmaReducedMass;
if (ccmaAtomConstraints != NULL)
delete ccmaAtomConstraints;
if (ccmaNumAtomConstraints != NULL)
delete ccmaNumAtomConstraints;
if (ccmaConstraintMatrixColumn != NULL)
delete ccmaConstraintMatrixColumn;
if (ccmaConstraintMatrixValue != NULL)
delete ccmaConstraintMatrixValue;
if (ccmaDelta1 != NULL)
delete ccmaDelta1;
if (ccmaDelta2 != NULL)
delete ccmaDelta2;
if (ccmaConverged != NULL)
delete ccmaConverged;
if (ccmaConvergedHostBuffer != NULL)
delete ccmaConvergedHostBuffer;
if (vsite2AvgAtoms != NULL)
delete vsite2AvgAtoms;
if (vsite2AvgWeights != NULL)
delete vsite2AvgWeights;
if (vsite3AvgAtoms != NULL)
delete vsite3AvgAtoms;
if (vsite3AvgWeights != NULL)
delete vsite3AvgWeights;
if (vsiteOutOfPlaneAtoms != NULL)
delete vsiteOutOfPlaneAtoms;
if (vsiteOutOfPlaneWeights != NULL)
delete vsiteOutOfPlaneWeights;
if (vsiteLocalCoordsIndex != NULL)
delete vsiteLocalCoordsIndex;
if (vsiteLocalCoordsAtoms != NULL)
delete vsiteLocalCoordsAtoms;
if (vsiteLocalCoordsWeights != NULL)
delete vsiteLocalCoordsWeights;
if (vsiteLocalCoordsPos != NULL)
delete vsiteLocalCoordsPos;
if (vsiteLocalCoordsStartIndex != NULL)
delete vsiteLocalCoordsStartIndex;
}
void OpenCLIntegrationUtilities::setNextStepSize(double size) {
if (size != lastStepSize.x || size != lastStepSize.y) {
lastStepSize = mm_double2(size, size);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
stepSize->upload(&lastStepSize);
stepSize.upload(&lastStepSize);
else {
mm_float2 lastStepSizeFloat = mm_float2((float) size, (float) size);
stepSize->upload(&lastStepSizeFloat);
stepSize.upload(&lastStepSizeFloat);
}
}
}
double OpenCLIntegrationUtilities::getLastStepSize() {
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
stepSize->download(&lastStepSize);
stepSize.download(&lastStepSize);
else {
mm_float2 lastStepSizeFloat;
stepSize->download(&lastStepSizeFloat);
stepSize.download(&lastStepSizeFloat);
lastStepSize = mm_double2(lastStepSizeFloat.x, lastStepSizeFloat.y);
}
return lastStepSize.y;
......@@ -802,102 +731,102 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaUpdateKernel = ccmaPosUpdateKernel;
hasInitializedPosConstraintKernels = true;
}
if (settleAtoms != NULL) {
if (settleAtoms.isInitialized()) {
if (!hasInitialized) {
settleKernel.setArg<cl_int>(0, settleAtoms->getSize());
settleKernel.setArg<cl_int>(0, settleAtoms.getSize());
settleKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
if (context.getUseMixedPrecision())
settleKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
settleKernel.setArg<void*>(3, NULL);
settleKernel.setArg<cl::Buffer>(4, posDelta->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(4, posDelta.getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(5, context.getVelm().getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(6, settleAtoms->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(7, settleParams->getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(6, settleAtoms.getDeviceBuffer());
settleKernel.setArg<cl::Buffer>(7, settleParams.getDeviceBuffer());
}
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
settleKernel.setArg<cl_double>(1, (cl_double) tol);
else
settleKernel.setArg<cl_float>(1, (cl_float) tol);
context.executeKernel(settleKernel, settleAtoms->getSize());
context.executeKernel(settleKernel, settleAtoms.getSize());
}
if (shakeAtoms != NULL) {
if (shakeAtoms.isInitialized()) {
if (!hasInitialized) {
shakeKernel.setArg<cl_int>(0, shakeAtoms->getSize());
shakeKernel.setArg<cl_int>(0, shakeAtoms.getSize());
shakeKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
if (context.getUseMixedPrecision())
shakeKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
shakeKernel.setArg<void*>(3, NULL);
shakeKernel.setArg<cl::Buffer>(4, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(5, shakeAtoms->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(6, shakeParams->getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(4, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta.getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(5, shakeAtoms.getDeviceBuffer());
shakeKernel.setArg<cl::Buffer>(6, shakeParams.getDeviceBuffer());
}
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
shakeKernel.setArg<cl_double>(1, (cl_double) tol);
else
shakeKernel.setArg<cl_float>(1, (cl_float) tol);
context.executeKernel(shakeKernel, shakeAtoms->getSize());
context.executeKernel(shakeKernel, shakeAtoms.getSize());
}
if (ccmaAtoms != NULL) {
if (ccmaAtoms.isInitialized()) {
if (!hasInitialized) {
ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms.getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance.getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
if (context.getUseMixedPrecision())
ccmaDirectionsKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
ccmaDirectionsKernel.setArg<void*>(3, NULL);
ccmaDirectionsKernel.setArg<cl::Buffer>(4, ccmaConverged->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(3, ccmaReducedMass->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(4, ccmaDelta1->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(5, ccmaConverged->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(6, ccmaConvergedHostBuffer->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(0, ccmaDelta1->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(1, ccmaDelta2->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(2, ccmaConstraintMatrixColumn->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(3, ccmaConstraintMatrixValue->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(4, ccmaConverged->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(0, ccmaNumAtomConstraints->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(1, ccmaAtomConstraints->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(2, ccmaDistance->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(4, ccmaConverged.getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms.getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance.getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta.getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(3, ccmaReducedMass.getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(4, ccmaDelta1.getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(5, ccmaConverged.getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(6, ccmaConvergedHostBuffer.getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(0, ccmaDelta1.getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(1, ccmaDelta2.getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(2, ccmaConstraintMatrixColumn.getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(3, ccmaConstraintMatrixValue.getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(4, ccmaConverged.getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(0, ccmaNumAtomConstraints.getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(1, ccmaAtomConstraints.getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(2, ccmaDistance.getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(3, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta.getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(4, context.getVelm().getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(5, ccmaDelta1->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(5, ccmaDelta1.getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2.getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged.getDeviceBuffer());
}
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
ccmaForceKernel.setArg<cl_double>(7, (cl_double) tol);
else
ccmaForceKernel.setArg<cl_float>(7, (cl_float) tol);
context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
context.executeKernel(ccmaDirectionsKernel, ccmaAtoms.getSize());
const int checkInterval = 4;
int* converged = (int*) context.getPinnedBuffer();
int* ccmaConvergedHostMemory = (int*) context.getQueue().enqueueMapBuffer(ccmaConvergedHostBuffer->getDeviceBuffer(), CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_int));
int* ccmaConvergedHostMemory = (int*) context.getQueue().enqueueMapBuffer(ccmaConvergedHostBuffer.getDeviceBuffer(), CL_TRUE, CL_MAP_WRITE, 0, sizeof(cl_int));
ccmaConvergedHostMemory[0] = 0;
context.getQueue().enqueueUnmapMemObject(ccmaConvergedHostBuffer->getDeviceBuffer(), ccmaConvergedHostMemory);
context.getQueue().enqueueUnmapMemObject(ccmaConvergedHostBuffer.getDeviceBuffer(), ccmaConvergedHostMemory);
for (int i = 0; i < 150; i++) {
ccmaForceKernel.setArg<cl_int>(8, i);
context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize());
context.executeKernel(ccmaForceKernel, ccmaAtoms.getSize());
cl::Event event;
if ((i+1)%checkInterval == 0 && !ccmaUseDirectBuffer)
context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), converged, NULL, &event);
context.getQueue().enqueueReadBuffer(ccmaConverged.getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), converged, NULL, &event);
ccmaMultiplyKernel.setArg<cl_int>(5, i);
context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize());
context.executeKernel(ccmaMultiplyKernel, ccmaAtoms.getSize());
ccmaUpdateKernel.setArg<cl_int>(8, i);
context.executeKernel(ccmaUpdateKernel, context.getNumAtoms());
if ((i+1)%checkInterval == 0) {
if (ccmaUseDirectBuffer) {
ccmaConvergedHostMemory = (int*) context.getQueue().enqueueMapBuffer(ccmaConvergedHostBuffer->getDeviceBuffer(), CL_FALSE, CL_MAP_READ, 0, sizeof(cl_int), NULL, &event);
ccmaConvergedHostMemory = (int*) context.getQueue().enqueueMapBuffer(ccmaConvergedHostBuffer.getDeviceBuffer(), CL_FALSE, CL_MAP_READ, 0, sizeof(cl_int), NULL, &event);
context.getQueue().flush();
while (event.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() != CL_COMPLETE)
;
converged[i%2] = ccmaConvergedHostMemory[0];
context.getQueue().enqueueUnmapMemObject(ccmaConvergedHostBuffer->getDeviceBuffer(), ccmaConvergedHostMemory);
context.getQueue().enqueueUnmapMemObject(ccmaConvergedHostBuffer.getDeviceBuffer(), ccmaConvergedHostMemory);
}
else
event.wait();
......@@ -938,7 +867,7 @@ void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() {
}
void OpenCLIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) {
if (random != NULL) {
if (random.isInitialized()) {
if (randomNumberSeed != lastSeed)
throw OpenMMException("OpenCLIntegrationUtilities::initRandomNumberGenerator(): Requested two different values for the random number seed");
return;
......@@ -947,23 +876,23 @@ void OpenCLIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNu
// Create the random number arrays.
lastSeed = randomNumberSeed;
random = OpenCLArray::create<mm_float4>(context, 4*context.getPaddedNumAtoms(), "random");
randomSeed = OpenCLArray::create<mm_int4>(context, context.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, "randomSeed");
randomPos = random->getSize();
random.initialize<mm_float4>(context, 4*context.getPaddedNumAtoms(), "random");
randomSeed.initialize<mm_int4>(context, context.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, "randomSeed");
randomPos = random.getSize();
// Use a quick and dirty RNG to pick seeds for the real random number generator.
vector<mm_int4> seed(randomSeed->getSize());
vector<mm_int4> seed(randomSeed.getSize());
unsigned int r = randomNumberSeed;
// A seed of 0 means use a unique one
if (r == 0) r = (unsigned int) osrngseed();
for (int i = 0; i < randomSeed->getSize(); i++) {
for (int i = 0; i < randomSeed.getSize(); i++) {
seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
}
randomSeed->upload(seed);
randomSeed.upload(seed);
// Create the kernel.
......@@ -972,45 +901,44 @@ void OpenCLIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNu
}
int OpenCLIntegrationUtilities::prepareRandomNumbers(int numValues) {
if (randomPos+numValues <= random->getSize()) {
if (randomPos+numValues <= random.getSize()) {
int oldPos = randomPos;
randomPos += numValues;
return oldPos;
}
if (numValues > random->getSize()) {
delete random;
random = OpenCLArray::create<mm_float4>(context, numValues, "random");
if (numValues > random.getSize()) {
random.resize(numValues);
}
randomKernel.setArg<cl_int>(0, random->getSize());
randomKernel.setArg<cl::Buffer>(1, random->getDeviceBuffer());
randomKernel.setArg<cl::Buffer>(2, randomSeed->getDeviceBuffer());
context.executeKernel(randomKernel, random->getSize());
randomKernel.setArg<cl_int>(0, random.getSize());
randomKernel.setArg<cl::Buffer>(1, random.getDeviceBuffer());
randomKernel.setArg<cl::Buffer>(2, randomSeed.getDeviceBuffer());
context.executeKernel(randomKernel, random.getSize());
randomPos = numValues;
return 0;
}
void OpenCLIntegrationUtilities::createCheckpoint(ostream& stream) {
if(random == NULL)
if (!random.isInitialized())
return;
stream.write((char*) &randomPos, sizeof(int));
vector<mm_float4> randomVec;
random->download(randomVec);
stream.write((char*) &randomVec[0], sizeof(mm_float4)*random->getSize());
random.download(randomVec);
stream.write((char*) &randomVec[0], sizeof(mm_float4)*random.getSize());
vector<mm_int4> randomSeedVec;
randomSeed->download(randomSeedVec);
stream.write((char*) &randomSeedVec[0], sizeof(mm_int4)*randomSeed->getSize());
randomSeed.download(randomSeedVec);
stream.write((char*) &randomSeedVec[0], sizeof(mm_int4)*randomSeed.getSize());
}
void OpenCLIntegrationUtilities::loadCheckpoint(istream& stream) {
if(random == NULL)
if (!random.isInitialized())
return;
stream.read((char*) &randomPos, sizeof(int));
vector<mm_float4> randomVec(random->getSize());
stream.read((char*) &randomVec[0], sizeof(mm_float4)*random->getSize());
random->upload(randomVec);
vector<mm_int4> randomSeedVec(randomSeed->getSize());
stream.read((char*) &randomSeedVec[0], sizeof(mm_int4)*randomSeed->getSize());
randomSeed->upload(randomSeedVec);
vector<mm_float4> randomVec(random.getSize());
stream.read((char*) &randomVec[0], sizeof(mm_float4)*random.getSize());
random.upload(randomVec);
vector<mm_int4> randomSeedVec(randomSeed.getSize());
stream.read((char*) &randomSeedVec[0], sizeof(mm_int4)*randomSeed.getSize());
randomSeed.upload(randomSeedVec);
}
double OpenCLIntegrationUtilities::computeKineticEnergy(double timeShift) {
......@@ -1018,7 +946,7 @@ double OpenCLIntegrationUtilities::computeKineticEnergy(double timeShift) {
if (timeShift != 0) {
// Copy the velocities into the posDelta array while we temporarily modify them.
context.getVelm().copyTo(*posDelta);
context.getVelm().copyTo(posDelta);
// Apply the time shift.
......@@ -1057,6 +985,6 @@ double OpenCLIntegrationUtilities::computeKineticEnergy(double timeShift) {
// Restore the velocities.
if (timeShift != 0)
posDelta->copyTo(context.getVelm());
posDelta.copyTo(context.getVelm());
return 0.5*energy;
}
......@@ -6357,47 +6357,6 @@ private:
OpenCLCalcGayBerneForceKernel& owner;
};
OpenCLCalcGayBerneForceKernel::~OpenCLCalcGayBerneForceKernel() {
if (sortedParticles != NULL)
delete sortedParticles;
if (axisParticleIndices != NULL)
delete axisParticleIndices;
if (sigParams != NULL)
delete sigParams;
if (epsParams != NULL)
delete epsParams;
if (scale != NULL)
delete scale;
if (exceptionParticles != NULL)
delete exceptionParticles;
if (exceptionParams != NULL)
delete exceptionParams;
if (aMatrix != NULL)
delete aMatrix;
if (bMatrix != NULL)
delete bMatrix;
if (gMatrix != NULL)
delete gMatrix;
if (exclusions != NULL)
delete exclusions;
if (exclusionStartIndex != NULL)
delete exclusionStartIndex;
if (blockCenter != NULL)
delete blockCenter;
if (blockBoundingBox != NULL)
delete blockBoundingBox;
if (neighbors != NULL)
delete neighbors;
if (neighborIndex != NULL)
delete neighborIndex;
if (neighborBlockCount != NULL)
delete neighborBlockCount;
if (sortedPos != NULL)
delete sortedPos;
if (torque != NULL)
delete torque;
}
void OpenCLCalcGayBerneForceKernel::initialize(const System& system, const GayBerneForce& force) {
if (!cl.getSupports64BitGlobalAtomics())
throw OpenMMException("GayBerneForce requires a device that supports 64 bit atomic operations");
......@@ -6405,14 +6364,14 @@ void OpenCLCalcGayBerneForceKernel::initialize(const System& system, const GayBe
// Initialize interactions.
int numParticles = force.getNumParticles();
sigParams = OpenCLArray::create<mm_float4>(cl, cl.getPaddedNumAtoms(), "sigParams");
epsParams = OpenCLArray::create<mm_float2>(cl, cl.getPaddedNumAtoms(), "epsParams");
scale = OpenCLArray::create<mm_float4>(cl, cl.getPaddedNumAtoms(), "scale");
axisParticleIndices = OpenCLArray::create<mm_int2>(cl, cl.getPaddedNumAtoms(), "axisParticleIndices");
sortedParticles = OpenCLArray::create<cl_int>(cl, cl.getPaddedNumAtoms(), "sortedParticles");
aMatrix = OpenCLArray::create<cl_float>(cl, 9*cl.getPaddedNumAtoms(), "aMatrix");
bMatrix = OpenCLArray::create<cl_float>(cl, 9*cl.getPaddedNumAtoms(), "bMatrix");
gMatrix = OpenCLArray::create<cl_float>(cl, 9*cl.getPaddedNumAtoms(), "gMatrix");
sigParams.initialize<mm_float4>(cl, cl.getPaddedNumAtoms(), "sigParams");
epsParams.initialize<mm_float2>(cl, cl.getPaddedNumAtoms(), "epsParams");
scale.initialize<mm_float4>(cl, cl.getPaddedNumAtoms(), "scale");
axisParticleIndices.initialize<mm_int2>(cl, cl.getPaddedNumAtoms(), "axisParticleIndices");
sortedParticles.initialize<cl_int>(cl, cl.getPaddedNumAtoms(), "sortedParticles");
aMatrix.initialize<cl_float>(cl, 9*cl.getPaddedNumAtoms(), "aMatrix");
bMatrix.initialize<cl_float>(cl, 9*cl.getPaddedNumAtoms(), "bMatrix");
gMatrix.initialize<cl_float>(cl, 9*cl.getPaddedNumAtoms(), "gMatrix");
vector<mm_float4> sigParamsVector(cl.getPaddedNumAtoms(), mm_float4(0, 0, 0, 0));
vector<mm_float2> epsParamsVector(cl.getPaddedNumAtoms(), mm_float2(0, 0));
vector<mm_float4> scaleVector(cl.getPaddedNumAtoms(), mm_float4(0, 0, 0, 0));
......@@ -6428,10 +6387,10 @@ void OpenCLCalcGayBerneForceKernel::initialize(const System& system, const GayBe
scaleVector[i] = mm_float4((float) (1/sqrt(ex)), (float) (1/sqrt(ey)), (float) (1/sqrt(ez)), 0);
isRealParticle[i] = (epsilon != 0.0);
}
sigParams->upload(sigParamsVector);
epsParams->upload(epsParamsVector);
scale->upload(scaleVector);
axisParticleIndices->upload(axisParticleVector);
sigParams.upload(sigParamsVector);
epsParams.upload(epsParamsVector);
scale.upload(scaleVector);
axisParticleIndices.upload(axisParticleVector);
// Record exceptions and exclusions.
......@@ -6454,29 +6413,29 @@ void OpenCLCalcGayBerneForceKernel::initialize(const System& system, const GayBe
if (isRealParticle[i])
numRealParticles++;
int numExceptions = exceptionParamsVec.size();
exclusions = OpenCLArray::create<cl_int>(cl, max(1, (int) excludedPairs.size()), "exclusions");
exclusionStartIndex = OpenCLArray::create<cl_int>(cl, numRealParticles+1, "exclusionStartIndex");
exceptionParticles = OpenCLArray::create<mm_int4>(cl, max(1, numExceptions), "exceptionParticles");
exceptionParams = OpenCLArray::create<mm_float2>(cl, max(1, numExceptions), "exceptionParams");
exclusions.initialize<cl_int>(cl, max(1, (int) excludedPairs.size()), "exclusions");
exclusionStartIndex.initialize<cl_int>(cl, numRealParticles+1, "exclusionStartIndex");
exceptionParticles.initialize<mm_int4>(cl, max(1, numExceptions), "exceptionParticles");
exceptionParams.initialize<mm_float2>(cl, max(1, numExceptions), "exceptionParams");
if (numExceptions > 0)
exceptionParams->upload(exceptionParamsVec);
exceptionParams.upload(exceptionParamsVec);
// Create data structures used for the neighbor list.
int numAtomBlocks = (numRealParticles+31)/32;
int elementSize = (cl.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
blockCenter = new OpenCLArray(cl, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox = new OpenCLArray(cl, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedPos = new OpenCLArray(cl, numRealParticles, 4*elementSize, "sortedPos");
blockCenter.initialize(cl, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox.initialize(cl, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedPos.initialize(cl, numRealParticles, 4*elementSize, "sortedPos");
maxNeighborBlocks = numRealParticles*2;
neighbors = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks*32, "neighbors");
neighborIndex = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks, "neighborIndex");
neighborBlockCount = OpenCLArray::create<cl_int>(cl, 1, "neighborBlockCount");
neighbors.initialize<cl_int>(cl, maxNeighborBlocks*32, "neighbors");
neighborIndex.initialize<cl_int>(cl, maxNeighborBlocks, "neighborIndex");
neighborBlockCount.initialize<cl_int>(cl, 1, "neighborBlockCount");
// Create array for accumulating torques.
torque = OpenCLArray::create<cl_long>(cl, 3*cl.getPaddedNumAtoms(), "torque");
cl.addAutoclearBuffer(*torque);
torque.initialize<cl_long>(cl, 3*cl.getPaddedNumAtoms(), "torque");
cl.addAutoclearBuffer(torque);
// Create the kernels.
......@@ -6519,60 +6478,60 @@ double OpenCLCalcGayBerneForceKernel::execute(ContextImpl& context, bool include
sortAtoms();
framesKernel.setArg<cl_int>(0, numRealParticles);
framesKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(2, axisParticleIndices->getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(3, sigParams->getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(4, scale->getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(5, aMatrix->getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(6, bMatrix->getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(7, gMatrix->getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(8, sortedParticles->getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(2, axisParticleIndices.getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(3, sigParams.getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(4, scale.getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(5, aMatrix.getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(6, bMatrix.getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(7, gMatrix.getDeviceBuffer());
framesKernel.setArg<cl::Buffer>(8, sortedParticles.getDeviceBuffer());
blockBoundsKernel.setArg<cl_int>(0, numRealParticles);
blockBoundsKernel.setArg<cl::Buffer>(6, sortedParticles->getDeviceBuffer());
blockBoundsKernel.setArg<cl::Buffer>(6, sortedParticles.getDeviceBuffer());
blockBoundsKernel.setArg<cl::Buffer>(7, cl.getPosq().getDeviceBuffer());
blockBoundsKernel.setArg<cl::Buffer>(8, sortedPos->getDeviceBuffer());
blockBoundsKernel.setArg<cl::Buffer>(9, blockCenter->getDeviceBuffer());
blockBoundsKernel.setArg<cl::Buffer>(10, blockBoundingBox->getDeviceBuffer());
blockBoundsKernel.setArg<cl::Buffer>(11, neighborBlockCount->getDeviceBuffer());
blockBoundsKernel.setArg<cl::Buffer>(8, sortedPos.getDeviceBuffer());
blockBoundsKernel.setArg<cl::Buffer>(9, blockCenter.getDeviceBuffer());
blockBoundsKernel.setArg<cl::Buffer>(10, blockBoundingBox.getDeviceBuffer());
blockBoundsKernel.setArg<cl::Buffer>(11, neighborBlockCount.getDeviceBuffer());
neighborsKernel.setArg<cl_int>(0, numRealParticles);
neighborsKernel.setArg<cl_int>(1, maxNeighborBlocks);
neighborsKernel.setArg<cl::Buffer>(7, sortedPos->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(8, blockCenter->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(9, blockBoundingBox->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(10, neighbors->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(11, neighborIndex->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(12, neighborBlockCount->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(13, exclusions->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(14, exclusionStartIndex->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(7, sortedPos.getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(8, blockCenter.getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(9, blockBoundingBox.getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(10, neighbors.getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(11, neighborIndex.getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(12, neighborBlockCount.getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(13, exclusions.getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(14, exclusionStartIndex.getDeviceBuffer());
int index = 0;
forceKernel.setArg<cl::Buffer>(index++, cl.getLongForceBuffer().getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, torque->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, torque.getDeviceBuffer());
forceKernel.setArg<cl_int>(index++, numRealParticles);
forceKernel.setArg<cl_int>(index++, exceptionAtoms.size());
forceKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, sortedPos->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, sigParams->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, epsParams->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, sortedParticles->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, aMatrix->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, bMatrix->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, gMatrix->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, exclusionStartIndex->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, exceptionParticles->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, exceptionParams->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, sortedPos.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, sigParams.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, epsParams.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, sortedParticles.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, aMatrix.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, bMatrix.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, gMatrix.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, exclusions.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, exclusionStartIndex.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, exceptionParticles.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, exceptionParams.getDeviceBuffer());
if (nonbondedMethod != GayBerneForce::NoCutoff) {
forceKernel.setArg<cl_int>(index++, maxNeighborBlocks);
forceKernel.setArg<cl::Buffer>(index++, neighbors->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, neighborIndex->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, neighborBlockCount->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, neighbors.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, neighborIndex.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, neighborBlockCount.getDeviceBuffer());
}
index = 0;
torqueKernel.setArg<cl::Buffer>(index++, cl.getLongForceBuffer().getDeviceBuffer());
torqueKernel.setArg<cl::Buffer>(index++, torque->getDeviceBuffer());
torqueKernel.setArg<cl::Buffer>(index++, torque.getDeviceBuffer());
torqueKernel.setArg<cl_int>(index++, numRealParticles);
torqueKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
torqueKernel.setArg<cl::Buffer>(index++, axisParticleIndices->getDeviceBuffer());
torqueKernel.setArg<cl::Buffer>(index++, sortedParticles->getDeviceBuffer());
torqueKernel.setArg<cl::Buffer>(index++, axisParticleIndices.getDeviceBuffer());
torqueKernel.setArg<cl::Buffer>(index++, sortedParticles.getDeviceBuffer());
}
cl.executeKernel(framesKernel, numRealParticles);
setPeriodicBoxArgs(cl, blockBoundsKernel, 1);
......@@ -6586,7 +6545,7 @@ double OpenCLCalcGayBerneForceKernel::execute(ContextImpl& context, bool include
cl.executeKernel(neighborsKernel, numRealParticles);
cl_int* count = (cl_int*) cl.getPinnedBuffer();
cl::Event event;
cl.getQueue().enqueueReadBuffer(neighborBlockCount->getDeviceBuffer(), CL_FALSE, 0, neighborBlockCount->getSize()*neighborBlockCount->getElementSize(), count, NULL, &event);
cl.getQueue().enqueueReadBuffer(neighborBlockCount.getDeviceBuffer(), CL_FALSE, 0, neighborBlockCount.getSize()*neighborBlockCount.getElementSize(), count, NULL, &event);
setPeriodicBoxArgs(cl, forceKernel, 20);
cl.executeKernel(forceKernel, cl.getNonbondedUtilities().getNumForceThreadBlocks()*cl.getNonbondedUtilities().getForceThreadBlockSize());
event.wait();
......@@ -6595,17 +6554,13 @@ double OpenCLCalcGayBerneForceKernel::execute(ContextImpl& context, bool include
// There wasn't enough room for the neighbor list, so we need to recreate it.
delete neighbors;
neighbors = NULL;
delete neighborIndex;
neighborIndex = NULL;
maxNeighborBlocks = (int) ceil((*count)*1.1);
neighbors = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks*32, "neighbors");
neighborIndex = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks, "neighborIndex");
neighborsKernel.setArg<cl::Buffer>(10, neighbors->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(11, neighborIndex->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(17, neighbors->getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(18, neighborIndex->getDeviceBuffer());
neighbors.resize(maxNeighborBlocks*32);
neighborIndex.resize(maxNeighborBlocks);
neighborsKernel.setArg<cl::Buffer>(10, neighbors.getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(11, neighborIndex.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(17, neighbors.getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(18, neighborIndex.getDeviceBuffer());
}
}
cl.executeKernel(torqueKernel, numRealParticles);
......@@ -6644,9 +6599,9 @@ void OpenCLCalcGayBerneForceKernel::copyParametersToContext(ContextImpl& context
if (epsilon != 0.0 && !isRealParticle[i])
throw OpenMMException("updateParametersInContext: The set of ignored particles (ones with epsilon=0) has changed");
}
sigParams->upload(sigParamsVector);
epsParams->upload(epsParamsVector);
scale->upload(scaleVector);
sigParams.upload(sigParamsVector);
epsParams.upload(epsParamsVector);
scale.upload(scaleVector);
// Record the exceptions.
......@@ -6658,7 +6613,7 @@ void OpenCLCalcGayBerneForceKernel::copyParametersToContext(ContextImpl& context
force.getExceptionParameters(exceptions[i], atom1, atom2, sigma, epsilon);
exceptionParamsVec[i] = mm_float2((float) sigma, (float) epsilon);
}
exceptionParams->upload(exceptionParamsVec);
exceptionParams.upload(exceptionParamsVec);
}
cl.invalidateMolecules(info);
sortAtoms();
......@@ -6679,7 +6634,7 @@ void OpenCLCalcGayBerneForceKernel::sortAtoms() {
particles[nextIndex++] = atom;
}
}
sortedParticles->upload(particles);
sortedParticles.upload(particles);
// Update the list of exception particles.
......@@ -6688,7 +6643,7 @@ void OpenCLCalcGayBerneForceKernel::sortAtoms() {
vector<mm_int4> exceptionParticlesVec(numExceptions);
for (int i = 0; i < numExceptions; i++)
exceptionParticlesVec[i] = mm_int4(exceptionAtoms[i].first, exceptionAtoms[i].second, inverseOrder[exceptionAtoms[i].first], inverseOrder[exceptionAtoms[i].second]);
exceptionParticles->upload(exceptionParticlesVec);
exceptionParticles.upload(exceptionParticlesVec);
}
// Rebuild the list of exclusions.
......@@ -6700,16 +6655,16 @@ void OpenCLCalcGayBerneForceKernel::sortAtoms() {
excludedAtoms[first].push_back(second);
}
int index = 0;
vector<int> exclusionVec(exclusions->getSize());
vector<int> startIndexVec(exclusionStartIndex->getSize());
vector<int> exclusionVec(exclusions.getSize());
vector<int> startIndexVec(exclusionStartIndex.getSize());
for (int i = 0; i < numRealParticles; i++) {
startIndexVec[i] = index;
for (int j = 0; j < excludedAtoms[i].size(); j++)
exclusionVec[index++] = excludedAtoms[i][j];
}
startIndexVec[numRealParticles] = index;
exclusions->upload(exclusionVec);
exclusionStartIndex->upload(startIndexVec);
exclusions.upload(exclusionVec);
exclusionStartIndex.upload(startIndexVec);
}
class OpenCLCalcCustomCVForceKernel::ReorderListener : public OpenCLContext::ReorderListener {
......@@ -6728,15 +6683,6 @@ private:
OpenCLArray& invAtomOrder;
};
OpenCLCalcCustomCVForceKernel::~OpenCLCalcCustomCVForceKernel() {
for (auto force : cvForces)
delete force;
if (invAtomOrder != NULL)
delete invAtomOrder;
if (innerInvAtomOrder != NULL)
delete innerInvAtomOrder;
}
void OpenCLCalcCustomCVForceKernel::initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext) {
int numCVs = force.getNumCollectiveVariables();
cl.addForce(new OpenCLForceInfo(1));
......@@ -6779,10 +6725,11 @@ void OpenCLCalcCustomCVForceKernel::initialize(const System& system, const Custo
// Create arrays for storing information.
int elementSize = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
cvForces.resize(numCVs);
for (int i = 0; i < numCVs; i++)
cvForces.push_back(new OpenCLArray(cl, cl.getNumAtoms(), 4*elementSize, "cvForce"));
invAtomOrder = OpenCLArray::create<cl_int>(cl, cl.getPaddedNumAtoms(), "invAtomOrder");
innerInvAtomOrder = OpenCLArray::create<cl_int>(cl, cl.getPaddedNumAtoms(), "innerInvAtomOrder");
cvForces[i].initialize(cl, cl.getNumAtoms(), 4*elementSize, "cvForce");
invAtomOrder.initialize<cl_int>(cl, cl.getPaddedNumAtoms(), "invAtomOrder");
innerInvAtomOrder.initialize<cl_int>(cl, cl.getPaddedNumAtoms(), "innerInvAtomOrder");
// Create the kernels.
......@@ -6809,7 +6756,7 @@ double OpenCLCalcCustomCVForceKernel::execute(ContextImpl& context, ContextImpl&
vector<map<string, double> > cvDerivs(numCVs);
for (int i = 0; i < numCVs; i++) {
cvValues.push_back(innerContext.calcForcesAndEnergy(true, true, 1<<i));
copyForcesKernel.setArg<cl::Buffer>(0, cvForces[i]->getDeviceBuffer());
copyForcesKernel.setArg<cl::Buffer>(0, cvForces[i].getDeviceBuffer());
cl.executeKernel(copyForcesKernel, numAtoms);
innerContext.getEnergyParameterDerivatives(cvDerivs[i]);
}
......@@ -6852,8 +6799,8 @@ void OpenCLCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl&
// Initialize the listeners.
ReorderListener* listener1 = new ReorderListener(cl, *invAtomOrder);
ReorderListener* listener2 = new ReorderListener(cl2, *innerInvAtomOrder);
ReorderListener* listener1 = new ReorderListener(cl, invAtomOrder);
ReorderListener* listener2 = new ReorderListener(cl2, innerInvAtomOrder);
cl.addReorderListener(listener1);
cl2.addReorderListener(listener2);
listener1->execute();
......@@ -6866,7 +6813,7 @@ void OpenCLCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl&
copyStateKernel.setArg<cl::Buffer>(3, cl.getAtomIndexArray().getDeviceBuffer());
copyStateKernel.setArg<cl::Buffer>(4, cl2.getPosq().getDeviceBuffer());
copyStateKernel.setArg<cl::Buffer>(6, cl2.getVelm().getDeviceBuffer());
copyStateKernel.setArg<cl::Buffer>(7, innerInvAtomOrder->getDeviceBuffer());
copyStateKernel.setArg<cl::Buffer>(7, innerInvAtomOrder.getDeviceBuffer());
copyStateKernel.setArg<cl_int>(8, numAtoms);
if (cl.getUseMixedPrecision()) {
copyStateKernel.setArg<cl::Buffer>(1, cl.getPosqCorrection().getDeviceBuffer());
......@@ -6877,7 +6824,7 @@ void OpenCLCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl&
copyStateKernel.setArg<void*>(5, NULL);
}
copyForcesKernel.setArg<cl::Buffer>(1, invAtomOrder->getDeviceBuffer());
copyForcesKernel.setArg<cl::Buffer>(1, invAtomOrder.getDeviceBuffer());
copyForcesKernel.setArg<cl::Buffer>(2, cl2.getForce().getDeviceBuffer());
copyForcesKernel.setArg<cl::Buffer>(3, cl2.getAtomIndexArray().getDeviceBuffer());
copyForcesKernel.setArg<cl_int>(4, numAtoms);
......@@ -6885,7 +6832,7 @@ void OpenCLCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl&
addForcesKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
addForcesKernel.setArg<cl_int>(1, numAtoms);
for (int i = 0; i < cvForces.size(); i++)
addForcesKernel.setArg<cl::Buffer>(2*i+2, cvForces[i]->getDeviceBuffer());
addForcesKernel.setArg<cl::Buffer>(2*i+2, cvForces[i].getDeviceBuffer());
}
cl.executeKernel(copyStateKernel, numAtoms);
Vec3 a, b, c;
......@@ -6917,15 +6864,6 @@ private:
set<int> particles;
};
OpenCLCalcRMSDForceKernel::~OpenCLCalcRMSDForceKernel() {
if (referencePos != NULL)
delete referencePos;
if (particles != NULL)
delete particles;
if (buffer != NULL)
delete buffer;
}
void OpenCLCalcRMSDForceKernel::initialize(const System& system, const RMSDForce& force) {
// Create data structures.
......@@ -6934,9 +6872,9 @@ void OpenCLCalcRMSDForceKernel::initialize(const System& system, const RMSDForce
int numParticles = force.getParticles().size();
if (numParticles == 0)
numParticles = system.getNumParticles();
referencePos = new OpenCLArray(cl, system.getNumParticles(), 4*elementSize, "referencePos");
particles = OpenCLArray::create<cl_int>(cl, numParticles, "particles");
buffer = new OpenCLArray(cl, 13, elementSize, "buffer");
referencePos.initialize(cl, system.getNumParticles(), 4*elementSize, "referencePos");
particles.initialize<cl_int>(cl, numParticles, "particles");
buffer.initialize(cl, 13, elementSize, "buffer");
recordParameters(force);
info = new ForceInfo(force);
cl.addForce(info);
......@@ -6965,18 +6903,18 @@ void OpenCLCalcRMSDForceKernel::recordParameters(const RMSDForce& force) {
// Upload them to the device.
particles->upload(particleVec);
particles.upload(particleVec);
if (cl.getUseDoublePrecision()) {
vector<mm_double4> pos;
for (Vec3 p : centeredPositions)
pos.push_back(mm_double4(p[0], p[1], p[2], 0));
referencePos->upload(pos);
referencePos.upload(pos);
}
else {
vector<mm_float4> pos;
for (Vec3 p : centeredPositions)
pos.push_back(mm_float4(p[0], p[1], p[2], 0));
referencePos->upload(pos);
referencePos.upload(pos);
}
// Record the sum of the norms of the reference positions.
......@@ -6998,13 +6936,13 @@ template <class REAL>
double OpenCLCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
// Execute the first kernel.
int numParticles = particles->getSize();
int numParticles = particles.getSize();
int blockSize = min(256, (int) kernel1.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
kernel1.setArg<cl_int>(0, numParticles);
kernel1.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, referencePos->getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, particles->getDeviceBuffer());
kernel1.setArg<cl::Buffer>(4, buffer->getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, referencePos.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, particles.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(4, buffer.getDeviceBuffer());
kernel1.setArg(5, blockSize*sizeof(REAL), NULL);
cl.executeKernel(kernel1, blockSize, blockSize);
......@@ -7012,7 +6950,7 @@ double OpenCLCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
// and eigenvector.
vector<REAL> b;
buffer->download(b);
buffer.download(b);
Array2D<double> F(4, 4);
F[0][0] = b[0*3+0] + b[1*3+1] + b[2*3+2];
F[1][0] = b[1*3+2] - b[2*3+1];
......@@ -7066,30 +7004,25 @@ double OpenCLCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
// Upload it to the device and invoke the kernel to apply forces.
buffer->upload(b);
buffer.upload(b);
kernel2.setArg<cl_int>(0, numParticles);
kernel2.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, referencePos->getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, particles->getDeviceBuffer());
kernel2.setArg<cl::Buffer>(4, buffer->getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, referencePos.getDeviceBuffer());
kernel2.setArg<cl::Buffer>(3, particles.getDeviceBuffer());
kernel2.setArg<cl::Buffer>(4, buffer.getDeviceBuffer());
kernel2.setArg<cl::Buffer>(5, cl.getForceBuffers().getDeviceBuffer());
cl.executeKernel(kernel2, numParticles);
return rmsd;
}
void OpenCLCalcRMSDForceKernel::copyParametersToContext(ContextImpl& context, const RMSDForce& force) {
if (referencePos->getSize() != force.getReferencePositions().size())
if (referencePos.getSize() != force.getReferencePositions().size())
throw OpenMMException("updateParametersInContext: The number of reference positions has changed");
int numParticles = force.getParticles().size();
if (numParticles == 0)
numParticles = context.getSystem().getNumParticles();
if (numParticles != particles->getSize()) {
// Recreate the particles array.
delete particles;
particles = NULL;
particles = OpenCLArray::create<cl_int>(cl, numParticles, "particles");
}
if (numParticles != particles.getSize())
particles.resize(numParticles);
recordParameters(force);
// Mark that the current reordering may be invalid.
......@@ -7160,11 +7093,6 @@ double OpenCLIntegrateVerletStepKernel::computeKineticEnergy(ContextImpl& contex
return cl.getIntegrationUtilities().computeKineticEnergy(0.5*integrator.getStepSize());
}
OpenCLIntegrateLangevinStepKernel::~OpenCLIntegrateLangevinStepKernel() {
if (params != NULL)
delete params;
}
void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
cl.getPlatformData().initializeContexts(system);
cl.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
......@@ -7174,7 +7102,7 @@ void OpenCLIntegrateLangevinStepKernel::initialize(const System& system, const L
cl::Program program = cl.createProgram(OpenCLKernelSources::langevin, defines, "");
kernel1 = cl::Kernel(program, "integrateLangevinPart1");
kernel2 = cl::Kernel(program, "integrateLangevinPart2");
params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
params.initialize(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
prevStepSize = -1.0;
}
......@@ -7186,7 +7114,7 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
kernel1.setArg<cl::Buffer>(0, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(1, cl.getForce().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, params->getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, params.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
......@@ -7207,18 +7135,18 @@ void OpenCLIntegrateLangevinStepKernel::execute(ContextImpl& context, const Lang
double fscale = (friction == 0 ? stepSize : (1-vscale)/friction);
double noisescale = sqrt(kT*(1-vscale*vscale));
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
vector<cl_double> p(params->getSize());
vector<cl_double> p(params.getSize());
p[0] = vscale;
p[1] = fscale;
p[2] = noisescale;
params->upload(p);
params.upload(p);
}
else {
vector<cl_float> p(params->getSize());
vector<cl_float> p(params.getSize());
p[0] = (cl_float) vscale;
p[1] = (cl_float) fscale;
p[2] = (cl_float) noisescale;
params->upload(p);
params.upload(p);
}
prevTemp = temperature;
prevFriction = friction;
......@@ -7429,11 +7357,6 @@ double OpenCLIntegrateVariableVerletStepKernel::computeKineticEnergy(ContextImpl
return cl.getIntegrationUtilities().computeKineticEnergy(0.5*integrator.getStepSize());
}
OpenCLIntegrateVariableLangevinStepKernel::~OpenCLIntegrateVariableLangevinStepKernel() {
if (params != NULL)
delete params;
}
void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
cl.getPlatformData().initializeContexts(system);
cl.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
......@@ -7444,9 +7367,9 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
kernel1 = cl::Kernel(program, "integrateLangevinPart1");
kernel2 = cl::Kernel(program, "integrateLangevinPart2");
selectSizeKernel = cl::Kernel(program, "selectLangevinStepSize");
params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
params.initialize(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
blockSize = min(256, system.getNumParticles());
blockSize = max(blockSize, params->getSize());
blockSize = max(blockSize, params.getSize());
blockSize = min(blockSize, (int) selectSizeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
}
......@@ -7459,7 +7382,7 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
kernel1.setArg<cl::Buffer>(0, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(1, cl.getForce().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, params->getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, params.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getRandom().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
......@@ -7470,9 +7393,9 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
selectSizeKernel.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(5, cl.getVelm().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(6, cl.getForce().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(7, params->getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(7, params.getDeviceBuffer());
int elementSize = (useDouble ? sizeof(cl_double) : sizeof(cl_float));
selectSizeKernel.setArg(8, params->getSize()*elementSize, NULL);
selectSizeKernel.setArg(8, params.getSize()*elementSize, NULL);
selectSizeKernel.setArg(9, blockSize*elementSize, NULL);
}
......@@ -7619,24 +7542,8 @@ private:
};
OpenCLIntegrateCustomStepKernel::~OpenCLIntegrateCustomStepKernel() {
if (globalValues != NULL)
delete globalValues;
if (sumBuffer != NULL)
delete sumBuffer;
if (summedValue != NULL)
delete summedValue;
if (uniformRandoms != NULL)
delete uniformRandoms;
if (randomSeed != NULL)
delete randomSeed;
if (perDofEnergyParamDerivs != NULL)
delete perDofEnergyParamDerivs;
if (perDofValues != NULL)
delete perDofValues;
for (auto function : tabulatedFunctions)
delete function;
for (auto& f : savedForces)
delete f.second;
}
void OpenCLIntegrateCustomStepKernel::initialize(const System& system, const CustomIntegrator& integrator) {
......@@ -7644,8 +7551,8 @@ void OpenCLIntegrateCustomStepKernel::initialize(const System& system, const Cus
cl.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
numGlobalVariables = integrator.getNumGlobalVariables();
int elementSize = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
sumBuffer = new OpenCLArray(cl, ((3*system.getNumParticles()+3)/4)*4, elementSize, "sumBuffer");
summedValue = new OpenCLArray(cl, 1, elementSize, "summedValue");
sumBuffer.initialize(cl, ((3*system.getNumParticles()+3)/4)*4, elementSize, "sumBuffer");
summedValue.initialize(cl, 1, elementSize, "summedValue");
perDofValues = new OpenCLParameterSet(cl, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
cl.addReorderListener(new ReorderListener(cl, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
......@@ -7728,6 +7635,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
vector<pair<string, string> > functionNames;
vector<const TabulatedFunction*> functionList;
vector<string> tableTypes;
tabulatedFunctions.resize(integrator.getNumTabulatedFunctions());
for (int i = 0; i < integrator.getNumTabulatedFunctions(); i++) {
functionList.push_back(&integrator.getTabulatedFunction(i));
string name = integrator.getTabulatedFunctionName(i);
......@@ -7736,8 +7644,8 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
functions[name] = createReferenceTabulatedFunction(integrator.getTabulatedFunction(i));
int width;
vector<float> f = cl.getExpressionUtilities().computeFunctionCoefficients(integrator.getTabulatedFunction(i), width);
tabulatedFunctions.push_back(OpenCLArray::create<float>(cl, f.size(), "TabulatedFunction"));
tabulatedFunctions[tabulatedFunctions.size()-1]->upload(f);
tabulatedFunctions[i].initialize<float>(cl, f.size(), "TabulatedFunction");
tabulatedFunctions[i].upload(f);
if (width == 1)
tableTypes.push_back("float");
else
......@@ -7801,8 +7709,10 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
forceGroupFlags[step] = 1<<forceGroup[step];
if (forceGroupFlags[step] == -2 && step > 0)
forceGroupFlags[step] = forceGroupFlags[step-1];
if (forceGroupFlags[step] != -2 && savedForces.find(forceGroupFlags[step]) == savedForces.end())
savedForces[forceGroupFlags[step]] = new OpenCLArray(cl, cl.getForce().getSize(), cl.getForce().getElementSize(), "savedForces");
if (forceGroupFlags[step] != -2 && savedForces.find(forceGroupFlags[step]) == savedForces.end()) {
savedForces[forceGroupFlags[step]] = OpenCLArray();
savedForces[forceGroupFlags[step]].initialize(cl, cl.getForce().getSize(), cl.getForce().getElementSize(), "savedForces");
}
}
// Allocate space for storing global values, both on the host and the device.
......@@ -7810,7 +7720,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
globalValuesFloat.resize(expressionSet.getNumVariables());
globalValuesDouble.resize(expressionSet.getNumVariables());
int elementSize = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
globalValues = new OpenCLArray(cl, expressionSet.getNumVariables(), elementSize, "globalValues");
globalValues.initialize(cl, expressionSet.getNumVariables(), elementSize, "globalValues");
for (int i = 0; i < integrator.getNumGlobalVariables(); i++) {
globalValuesDouble[globalVariableIndex[i]] = initialGlobalVariables[i];
expressionSet.setVariable(globalVariableIndex[i], initialGlobalVariables[i]);
......@@ -7823,7 +7733,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
int numContextParams = context.getParameters().size();
localPerDofEnergyParamDerivsFloat.resize(numContextParams);
localPerDofEnergyParamDerivsDouble.resize(numContextParams);
perDofEnergyParamDerivs = new OpenCLArray(cl, max(1, numContextParams), elementSize, "perDofEnergyParamDerivs");
perDofEnergyParamDerivs.initialize(cl, max(1, numContextParams), elementSize, "perDofEnergyParamDerivs");
// Record information about the targets of steps that will be stored in global variables.
......@@ -7974,14 +7884,14 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
kernel.setArg<cl::Buffer>(index++, cl.getVelm().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, cl.getForce().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, integration.getStepSize().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, globalValues->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, globalValues.getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, sumBuffer.getDeviceBuffer());
index += 4;
kernel.setArg<cl::Buffer>(index++, perDofEnergyParamDerivs->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, perDofEnergyParamDerivs.getDeviceBuffer());
for (auto& buffer : perDofValues->getBuffers())
kernel.setArg<cl::Memory>(index++, buffer.getMemory());
for (auto array : tabulatedFunctions)
kernel.setArg<cl::Buffer>(index++, array->getDeviceBuffer());
for (auto& array : tabulatedFunctions)
kernel.setArg<cl::Buffer>(index++, array.getDeviceBuffer());
if (stepType[step] == CustomIntegrator::ComputeSum) {
// Create a second kernel for this step that sums the values.
......@@ -7989,8 +7899,8 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
kernel = cl::Kernel(program, useDouble ? "computeDoubleSum" : "computeFloatSum");
kernels[step].push_back(kernel);
index = 0;
kernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, summedValue->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, sumBuffer.getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, summedValue.getDeviceBuffer());
kernel.setArg<cl_int>(index++, 3*numAtoms);
}
}
......@@ -8012,9 +7922,9 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
int maxUniformRandoms = 1;
for (int required : requiredUniform)
maxUniformRandoms = max(maxUniformRandoms, required);
uniformRandoms = OpenCLArray::create<mm_float4>(cl, maxUniformRandoms, "uniformRandoms");
randomSeed = OpenCLArray::create<mm_int4>(cl, cl.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, "randomSeed");
vector<mm_int4> seed(randomSeed->getSize());
uniformRandoms.initialize<mm_float4>(cl, maxUniformRandoms, "uniformRandoms");
randomSeed.initialize<mm_int4>(cl, cl.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize, "randomSeed");
vector<mm_int4> seed(randomSeed.getSize());
int rseed = integrator.getRandomNumberSeed();
// A random seed of 0 means use a unique one
if (rseed == 0)
......@@ -8026,12 +7936,12 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
s.z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
s.w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
}
randomSeed->upload(seed);
randomSeed.upload(seed);
cl::Program randomProgram = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
randomKernel = cl::Kernel(randomProgram, "generateRandomNumbers");
randomKernel.setArg<cl_int>(0, maxUniformRandoms);
randomKernel.setArg<cl::Buffer>(1, uniformRandoms->getDeviceBuffer());
randomKernel.setArg<cl::Buffer>(2, randomSeed->getDeviceBuffer());
randomKernel.setArg<cl::Buffer>(1, uniformRandoms.getDeviceBuffer());
randomKernel.setArg<cl::Buffer>(2, randomSeed.getDeviceBuffer());
// Create the kernel for computing kinetic energy.
......@@ -8067,19 +7977,19 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
kineticEnergyKernel.setArg<cl::Buffer>(index++, cl.getVelm().getDeviceBuffer());
kineticEnergyKernel.setArg<cl::Buffer>(index++, cl.getForce().getDeviceBuffer());
kineticEnergyKernel.setArg<cl::Buffer>(index++, integration.getStepSize().getDeviceBuffer());
kineticEnergyKernel.setArg<cl::Buffer>(index++, globalValues->getDeviceBuffer());
kineticEnergyKernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
kineticEnergyKernel.setArg<cl::Buffer>(index++, globalValues.getDeviceBuffer());
kineticEnergyKernel.setArg<cl::Buffer>(index++, sumBuffer.getDeviceBuffer());
index += 2;
kineticEnergyKernel.setArg<cl::Buffer>(index++, uniformRandoms->getDeviceBuffer());
kineticEnergyKernel.setArg<cl::Buffer>(index++, uniformRandoms.getDeviceBuffer());
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision())
kineticEnergyKernel.setArg<cl_double>(index++, 0.0);
else
kineticEnergyKernel.setArg<cl_float>(index++, 0.0f);
kineticEnergyKernel.setArg<cl::Buffer>(index++, perDofEnergyParamDerivs->getDeviceBuffer());
kineticEnergyKernel.setArg<cl::Buffer>(index++, perDofEnergyParamDerivs.getDeviceBuffer());
for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
kineticEnergyKernel.setArg<cl::Memory>(index++, perDofValues->getBuffers()[i].getMemory());
for (auto array : tabulatedFunctions)
kineticEnergyKernel.setArg<cl::Buffer>(index++, array->getDeviceBuffer());
for (auto& array : tabulatedFunctions)
kineticEnergyKernel.setArg<cl::Buffer>(index++, array.getDeviceBuffer());
keNeedsForce = usesVariable(keExpression, "f");
// Create a second kernel to sum the values.
......@@ -8087,8 +7997,8 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
program = cl.createProgram(OpenCLKernelSources::customIntegrator, defines);
sumKineticEnergyKernel = cl::Kernel(program, useDouble ? "computeDoubleSum" : "computeFloatSum");
index = 0;
sumKineticEnergyKernel.setArg<cl::Buffer>(index++, sumBuffer->getDeviceBuffer());
sumKineticEnergyKernel.setArg<cl::Buffer>(index++, summedValue->getDeviceBuffer());
sumKineticEnergyKernel.setArg<cl::Buffer>(index++, sumBuffer.getDeviceBuffer());
sumKineticEnergyKernel.setArg<cl::Buffer>(index++, summedValue.getDeviceBuffer());
sumKineticEnergyKernel.setArg<cl_int>(index++, 3*numAtoms);
// Delete the custom functions.
......@@ -8181,7 +8091,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
// The forces are still valid. We just need a different force group right now. Save the old
// forces in case we need them again.
cl.getForce().copyTo(*savedForces[lastForceGroups]);
cl.getForce().copyTo(savedForces[lastForceGroups]);
validSavedForces.insert(lastForceGroups);
}
}
......@@ -8196,7 +8106,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
if (!computeEnergy && validSavedForces.find(forceGroups) != validSavedForces.end()) {
// We can just restore the forces we saved earlier.
savedForces[forceGroups]->copyTo(cl.getForce());
savedForces[forceGroups].copyTo(cl.getForce());
context.getLastForceGroups() = forceGroups;
}
else {
......@@ -8209,12 +8119,12 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
for (int i = 0; i < perDofEnergyParamDerivNames.size(); i++)
localPerDofEnergyParamDerivsDouble[i] = energyParamDerivs[perDofEnergyParamDerivNames[i]];
perDofEnergyParamDerivs->upload(localPerDofEnergyParamDerivsDouble);
perDofEnergyParamDerivs.upload(localPerDofEnergyParamDerivsDouble);
}
else {
for (int i = 0; i < perDofEnergyParamDerivNames.size(); i++)
localPerDofEnergyParamDerivsFloat[i] = (float) energyParamDerivs[perDofEnergyParamDerivNames[i]];
perDofEnergyParamDerivs->upload(localPerDofEnergyParamDerivsFloat);
perDofEnergyParamDerivs.upload(localPerDofEnergyParamDerivsFloat);
}
}
}
......@@ -8227,18 +8137,18 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
// Upload the global values to the device.
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision())
globalValues->upload(globalValuesDouble);
globalValues.upload(globalValuesDouble);
else {
for (int j = 0; j < (int) globalValuesDouble.size(); j++)
globalValuesFloat[j] = (float) globalValuesDouble[j];
globalValues->upload(globalValuesFloat);
globalValues.upload(globalValuesFloat);
}
}
bool stepInvalidatesForces = invalidatesForces[step];
if (stepType[step] == CustomIntegrator::ComputePerDof && !merged[step]) {
kernels[step][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[step]));
kernels[step][0].setArg<cl::Buffer>(8, integration.getRandom().getDeviceBuffer());
kernels[step][0].setArg<cl::Buffer>(10, uniformRandoms->getDeviceBuffer());
kernels[step][0].setArg<cl::Buffer>(10, uniformRandoms.getDeviceBuffer());
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision())
kernels[step][0].setArg<cl_double>(11, energy);
else
......@@ -8256,24 +8166,24 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
else if (stepType[step] == CustomIntegrator::ComputeSum) {
kernels[step][0].setArg<cl_uint>(9, integration.prepareRandomNumbers(requiredGaussian[step]));
kernels[step][0].setArg<cl::Buffer>(8, integration.getRandom().getDeviceBuffer());
kernels[step][0].setArg<cl::Buffer>(10, uniformRandoms->getDeviceBuffer());
kernels[step][0].setArg<cl::Buffer>(10, uniformRandoms.getDeviceBuffer());
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision())
kernels[step][0].setArg<cl_double>(11, energy);
else
kernels[step][0].setArg<cl_float>(11, (cl_float) energy);
if (requiredUniform[step] > 0)
cl.executeKernel(randomKernel, numAtoms);
cl.clearBuffer(*sumBuffer);
cl.clearBuffer(sumBuffer);
cl.executeKernel(kernels[step][0], numAtoms, 128);
cl.executeKernel(kernels[step][1], sumWorkGroupSize, sumWorkGroupSize);
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
double value;
summedValue->download(&value);
summedValue.download(&value);
recordGlobalValue(value, stepTarget[step], integrator);
}
else {
float value;
summedValue->download(&value);
summedValue.download(&value);
recordGlobalValue(value, stepTarget[step], integrator);
}
}
......@@ -8363,19 +8273,19 @@ double OpenCLIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& contex
energy = context.calcForcesAndEnergy(true, willNeedEnergy, -1);
forcesAreValid = true;
}
cl.clearBuffer(*sumBuffer);
cl.clearBuffer(sumBuffer);
kineticEnergyKernel.setArg<cl::Buffer>(8, cl.getIntegrationUtilities().getRandom().getDeviceBuffer());
kineticEnergyKernel.setArg<cl_uint>(9, 0);
cl.executeKernel(kineticEnergyKernel, cl.getNumAtoms());
cl.executeKernel(sumKineticEnergyKernel, sumWorkGroupSize, sumWorkGroupSize);
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
double ke;
summedValue->download(&ke);
summedValue.download(&ke);
return ke;
}
else {
float ke;
summedValue->download(&ke);
summedValue.download(&ke);
return ke;
}
}
......@@ -8410,7 +8320,7 @@ void OpenCLIntegrateCustomStepKernel::recordChangedParameters(ContextImpl& conte
}
void OpenCLIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, vector<double>& values) const {
if (globalValues == NULL) {
if (!globalValues.isInitialized()) {
// The data structures haven't been created yet, so just return the list of values that was given earlier.
values = initialGlobalVariables;
......@@ -8424,7 +8334,7 @@ void OpenCLIntegrateCustomStepKernel::getGlobalVariables(ContextImpl& context, v
void OpenCLIntegrateCustomStepKernel::setGlobalVariables(ContextImpl& context, const vector<double>& values) {
if (numGlobalVariables == 0)
return;
if (globalValues == NULL) {
if (!globalValues.isInitialized()) {
// The data structures haven't been created yet, so just store the list of values.
initialGlobalVariables = values;
......@@ -8483,11 +8393,6 @@ void OpenCLIntegrateCustomStepKernel::setPerDofVariable(ContextImpl& context, in
deviceValuesAreCurrent = false;
}
OpenCLApplyAndersenThermostatKernel::~OpenCLApplyAndersenThermostatKernel() {
if (atomGroups != NULL)
delete atomGroups;
}
void OpenCLApplyAndersenThermostatKernel::initialize(const System& system, const AndersenThermostat& thermostat) {
randomSeed = thermostat.getRandomNumberSeed();
map<string, string> defines;
......@@ -8499,13 +8404,13 @@ void OpenCLApplyAndersenThermostatKernel::initialize(const System& system, const
// Create the arrays with the group definitions.
vector<vector<int> > groups = AndersenThermostatImpl::calcParticleGroups(system);
atomGroups = OpenCLArray::create<int>(cl, cl.getNumAtoms(), "atomGroups");
vector<int> atoms(atomGroups->getSize());
atomGroups.initialize<int>(cl, cl.getNumAtoms(), "atomGroups");
vector<int> atoms(atomGroups.getSize());
for (int i = 0; i < (int) groups.size(); i++) {
for (int j = 0; j < (int) groups[i].size(); j++)
atoms[groups[i][j]] = i;
}
atomGroups->upload(atoms);
atomGroups.upload(atoms);
}
void OpenCLApplyAndersenThermostatKernel::execute(ContextImpl& context) {
......@@ -8514,7 +8419,7 @@ void OpenCLApplyAndersenThermostatKernel::execute(ContextImpl& context) {
kernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
kernel.setArg<cl::Buffer>(3, cl.getIntegrationUtilities().getStepSize().getDeviceBuffer());
kernel.setArg<cl::Buffer>(4, cl.getIntegrationUtilities().getRandom().getDeviceBuffer());
kernel.setArg<cl::Buffer>(6, atomGroups->getDeviceBuffer());
kernel.setArg<cl::Buffer>(6, atomGroups.getDeviceBuffer());
}
kernel.setArg<cl_float>(0, (cl_float) context.getParameter(AndersenThermostat::CollisionFrequency()));
kernel.setArg<cl_float>(1, (cl_float) (BOLTZ*context.getParameter(AndersenThermostat::Temperature())));
......@@ -8522,20 +8427,9 @@ void OpenCLApplyAndersenThermostatKernel::execute(ContextImpl& context) {
cl.executeKernel(kernel, cl.getNumAtoms());
}
OpenCLApplyMonteCarloBarostatKernel::~OpenCLApplyMonteCarloBarostatKernel() {
if (savedPositions != NULL)
delete savedPositions;
if (savedForces != NULL)
delete savedForces;
if (moleculeAtoms != NULL)
delete moleculeAtoms;
if (moleculeStartIndex != NULL)
delete moleculeStartIndex;
}
void OpenCLApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {
savedPositions = new OpenCLArray(cl, cl.getPaddedNumAtoms(), cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4), "savedPositions");
savedForces = new OpenCLArray(cl, cl.getPaddedNumAtoms(), cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4), "savedForces");
savedPositions.initialize(cl, cl.getPaddedNumAtoms(), cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4), "savedPositions");
savedForces.initialize(cl, cl.getPaddedNumAtoms(), cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4), "savedForces");
cl::Program program = cl.createProgram(OpenCLKernelSources::monteCarloBarostat);
kernel = cl::Kernel(program, "scalePositions");
}
......@@ -8548,10 +8442,10 @@ void OpenCLApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context,
vector<vector<int> > molecules = context.getMolecules();
numMolecules = molecules.size();
moleculeAtoms = OpenCLArray::create<int>(cl, cl.getNumAtoms(), "moleculeAtoms");
moleculeStartIndex = OpenCLArray::create<int>(cl, numMolecules+1, "moleculeStartIndex");
vector<int> atoms(moleculeAtoms->getSize());
vector<int> startIndex(moleculeStartIndex->getSize());
moleculeAtoms.initialize<int>(cl, cl.getNumAtoms(), "moleculeAtoms");
moleculeStartIndex.initialize<int>(cl, numMolecules+1, "moleculeStartIndex");
vector<int> atoms(moleculeAtoms.getSize());
vector<int> startIndex(moleculeStartIndex.getSize());
int index = 0;
for (int i = 0; i < numMolecules; i++) {
startIndex[i] = index;
......@@ -8559,19 +8453,19 @@ void OpenCLApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context,
atoms[index++] = molecule;
}
startIndex[numMolecules] = index;
moleculeAtoms->upload(atoms);
moleculeStartIndex->upload(startIndex);
moleculeAtoms.upload(atoms);
moleculeStartIndex.upload(startIndex);
// Initialize the kernel arguments.
kernel.setArg<cl_int>(3, numMolecules);
kernel.setArg<cl::Buffer>(9, cl.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(10, moleculeAtoms->getDeviceBuffer());
kernel.setArg<cl::Buffer>(11, moleculeStartIndex->getDeviceBuffer());
kernel.setArg<cl::Buffer>(10, moleculeAtoms.getDeviceBuffer());
kernel.setArg<cl::Buffer>(11, moleculeStartIndex.getDeviceBuffer());
}
int bytesToCopy = cl.getPosq().getSize()*(cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
cl.getQueue().enqueueCopyBuffer(cl.getPosq().getDeviceBuffer(), savedPositions->getDeviceBuffer(), 0, 0, bytesToCopy);
cl.getQueue().enqueueCopyBuffer(cl.getForce().getDeviceBuffer(), savedForces->getDeviceBuffer(), 0, 0, bytesToCopy);
cl.getQueue().enqueueCopyBuffer(cl.getPosq().getDeviceBuffer(), savedPositions.getDeviceBuffer(), 0, 0, bytesToCopy);
cl.getQueue().enqueueCopyBuffer(cl.getForce().getDeviceBuffer(), savedForces.getDeviceBuffer(), 0, 0, bytesToCopy);
kernel.setArg<cl_float>(0, (cl_float) scaleX);
kernel.setArg<cl_float>(1, (cl_float) scaleY);
kernel.setArg<cl_float>(2, (cl_float) scaleZ);
......@@ -8584,19 +8478,14 @@ void OpenCLApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context,
void OpenCLApplyMonteCarloBarostatKernel::restoreCoordinates(ContextImpl& context) {
int bytesToCopy = cl.getPosq().getSize()*(cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
cl.getQueue().enqueueCopyBuffer(savedPositions->getDeviceBuffer(), cl.getPosq().getDeviceBuffer(), 0, 0, bytesToCopy);
cl.getQueue().enqueueCopyBuffer(savedForces->getDeviceBuffer(), cl.getForce().getDeviceBuffer(), 0, 0, bytesToCopy);
}
OpenCLRemoveCMMotionKernel::~OpenCLRemoveCMMotionKernel() {
if (cmMomentum != NULL)
delete cmMomentum;
cl.getQueue().enqueueCopyBuffer(savedPositions.getDeviceBuffer(), cl.getPosq().getDeviceBuffer(), 0, 0, bytesToCopy);
cl.getQueue().enqueueCopyBuffer(savedForces.getDeviceBuffer(), cl.getForce().getDeviceBuffer(), 0, 0, bytesToCopy);
}
void OpenCLRemoveCMMotionKernel::initialize(const System& system, const CMMotionRemover& force) {
frequency = force.getFrequency();
int numAtoms = cl.getNumAtoms();
cmMomentum = OpenCLArray::create<mm_float4>(cl, (numAtoms+OpenCLContext::ThreadBlockSize-1)/OpenCLContext::ThreadBlockSize, "cmMomentum");
cmMomentum.initialize<mm_float4>(cl, (numAtoms+OpenCLContext::ThreadBlockSize-1)/OpenCLContext::ThreadBlockSize, "cmMomentum");
double totalMass = 0.0;
for (int i = 0; i < numAtoms; i++)
totalMass += system.getParticleMass(i);
......@@ -8606,12 +8495,12 @@ void OpenCLRemoveCMMotionKernel::initialize(const System& system, const CMMotion
kernel1 = cl::Kernel(program, "calcCenterOfMassMomentum");
kernel1.setArg<cl_int>(0, numAtoms);
kernel1.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, cmMomentum->getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, cmMomentum.getDeviceBuffer());
kernel1.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
kernel2 = cl::Kernel(program, "removeCenterOfMassMomentum");
kernel2.setArg<cl_int>(0, numAtoms);
kernel2.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, cmMomentum->getDeviceBuffer());
kernel2.setArg<cl::Buffer>(2, cmMomentum.getDeviceBuffer());
kernel2.setArg(3, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
}
......
......@@ -55,9 +55,7 @@ private:
};
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), pinnedCountMemory(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
numForceBuffers(0), blockSorter(NULL), pinnedCountBuffer(NULL), pinnedCountMemory(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
// Decide how many thread blocks and force buffers to use.
deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
......@@ -95,34 +93,6 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
}
OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
if (exclusionIndices != NULL)
delete exclusionIndices;
if (exclusionRowIndices != NULL)
delete exclusionRowIndices;
if (exclusionTiles != NULL)
delete exclusionTiles;
if (exclusions != NULL)
delete exclusions;
if (interactingTiles != NULL)
delete interactingTiles;
if (interactingAtoms != NULL)
delete interactingAtoms;
if (interactionCount != NULL)
delete interactionCount;
if (blockCenter != NULL)
delete blockCenter;
if (blockBoundingBox != NULL)
delete blockBoundingBox;
if (sortedBlocks != NULL)
delete sortedBlocks;
if (sortedBlockCenter != NULL)
delete sortedBlockCenter;
if (sortedBlockBoundingBox != NULL)
delete sortedBlockBoundingBox;
if (oldPositions != NULL)
delete oldPositions;
if (rebuildNeighborList != NULL)
delete rebuildNeighborList;
if (blockSorter != NULL)
delete blockSorter;
if (pinnedCountBuffer != NULL)
......@@ -246,8 +216,8 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
exclusionTilesVec.push_back(mm_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 ? compareUshort2 : compareUshort2LargeSIMD);
exclusionTiles = OpenCLArray::create<mm_ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles->upload(exclusionTilesVec);
exclusionTiles.initialize<mm_ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles.upload(exclusionTilesVec);
map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
mm_ushort2 tile = exclusionTilesVec[i];
......@@ -268,17 +238,17 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
maxExclusions = 0;
for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
exclusionIndices = OpenCLArray::create<cl_uint>(context, exclusionIndicesVec.size(), "exclusionIndices");
exclusionRowIndices = OpenCLArray::create<cl_uint>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
exclusionIndices->upload(exclusionIndicesVec);
exclusionRowIndices->upload(exclusionRowIndicesVec);
exclusionIndices.initialize<cl_uint>(context, exclusionIndicesVec.size(), "exclusionIndices");
exclusionRowIndices.initialize<cl_uint>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
exclusionIndices.upload(exclusionIndicesVec);
exclusionRowIndices.upload(exclusionRowIndicesVec);
// Record the exclusion data.
exclusions = OpenCLArray::create<cl_uint>(context, tilesWithExclusions.size()*OpenCLContext::TileSize, "exclusions");
exclusions.initialize<cl_uint>(context, tilesWithExclusions.size()*OpenCLContext::TileSize, "exclusions");
cl_uint allFlags = (cl_uint) -1;
vector<cl_uint> exclusionVec(exclusions->getSize(), allFlags);
for (int i = 0; i < exclusions->getSize(); ++i)
vector<cl_uint> exclusionVec(exclusions.getSize(), allFlags);
for (int i = 0; i < exclusions.getSize(); ++i)
exclusionVec[i] = 0xFFFFFFFF;
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/OpenCLContext::TileSize;
......@@ -298,7 +268,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
}
}
atomExclusions.clear(); // We won't use this again, so free the memory it used
exclusions->upload(exclusionVec);
exclusions.upload(exclusionVec);
// Create data structures for the neighbor list.
......@@ -312,20 +282,20 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
if (maxTiles < 1)
maxTiles = 1;
int numAtoms = context.getNumAtoms();
interactingTiles = OpenCLArray::create<cl_int>(context, maxTiles, "interactingTiles");
interactingAtoms = OpenCLArray::create<cl_int>(context, OpenCLContext::TileSize*maxTiles, "interactingAtoms");
interactionCount = OpenCLArray::create<cl_uint>(context, 1, "interactionCount");
interactingTiles.initialize<cl_int>(context, maxTiles, "interactingTiles");
interactingAtoms.initialize<cl_int>(context, OpenCLContext::TileSize*maxTiles, "interactingAtoms");
interactionCount.initialize<cl_uint>(context, 1, "interactionCount");
int elementSize = (context.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
blockCenter = new OpenCLArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox = new OpenCLArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedBlocks = new OpenCLArray(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
sortedBlockCenter = new OpenCLArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox = new OpenCLArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions = new OpenCLArray(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList = OpenCLArray::create<int>(context, 1, "rebuildNeighborList");
blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new OpenCLSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<cl_uint> count(1, 0);
interactionCount->upload(count);
interactionCount.upload(count);
}
}
......@@ -376,14 +346,14 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
forceRebuildNeighborList = true;
setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms());
blockSorter->sort(*sortedBlocks);
blockSorter->sort(sortedBlocks);
kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList);
context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms());
setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0);
context.executeKernel(kernels.findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
forceRebuildNeighborList = false;
lastCutoff = kernels.cutoffDistance;
context.getQueue().enqueueReadBuffer(interactionCount->getDeviceBuffer(), CL_FALSE, 0, sizeof(int), pinnedCountMemory, NULL, &downloadCountEvent);
context.getQueue().enqueueReadBuffer(interactionCount.getDeviceBuffer(), CL_FALSE, 0, sizeof(int), pinnedCountMemory, NULL, &downloadCountEvent);
}
void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
......@@ -407,7 +377,7 @@ void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool include
bool OpenCLNonbondedUtilities::updateNeighborListSize() {
if (!useCutoff)
return false;
if (pinnedCountMemory[0] <= (unsigned int) interactingTiles->getSize())
if (pinnedCountMemory[0] <= (unsigned int) interactingTiles.getSize())
return false;
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
......@@ -417,31 +387,27 @@ bool OpenCLNonbondedUtilities::updateNeighborListSize() {
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
if (maxTiles > totalTiles)
maxTiles = totalTiles;
delete interactingTiles;
delete interactingAtoms;
interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails
interactingAtoms = NULL;
interactingTiles = OpenCLArray::create<cl_int>(context, maxTiles, "interactingTiles");
interactingAtoms = OpenCLArray::create<cl_int>(context, OpenCLContext::TileSize*maxTiles, "interactingAtoms");
interactingTiles.resize(maxTiles);
interactingAtoms.resize(OpenCLContext::TileSize*maxTiles);
for (map<int, KernelSet>::iterator iter = groupKernels.begin(); iter != groupKernels.end(); ++iter) {
KernelSet& kernels = iter->second;
if (*reinterpret_cast<cl_kernel*>(&kernels.forceKernel) != NULL) {
kernels.forceKernel.setArg<cl::Buffer>(7, interactingTiles->getDeviceBuffer());
kernels.forceKernel.setArg<cl::Buffer>(7, interactingTiles.getDeviceBuffer());
kernels.forceKernel.setArg<cl_uint>(14, maxTiles);
kernels.forceKernel.setArg<cl::Buffer>(17, interactingAtoms->getDeviceBuffer());
kernels.forceKernel.setArg<cl::Buffer>(17, interactingAtoms.getDeviceBuffer());
}
if (*reinterpret_cast<cl_kernel*>(&kernels.energyKernel) != NULL) {
kernels.energyKernel.setArg<cl::Buffer>(7, interactingTiles->getDeviceBuffer());
kernels.energyKernel.setArg<cl::Buffer>(7, interactingTiles.getDeviceBuffer());
kernels.energyKernel.setArg<cl_uint>(14, maxTiles);
kernels.energyKernel.setArg<cl::Buffer>(17, interactingAtoms->getDeviceBuffer());
kernels.energyKernel.setArg<cl::Buffer>(17, interactingAtoms.getDeviceBuffer());
}
if (*reinterpret_cast<cl_kernel*>(&kernels.forceEnergyKernel) != NULL) {
kernels.forceEnergyKernel.setArg<cl::Buffer>(7, interactingTiles->getDeviceBuffer());
kernels.forceEnergyKernel.setArg<cl::Buffer>(7, interactingTiles.getDeviceBuffer());
kernels.forceEnergyKernel.setArg<cl_uint>(14, maxTiles);
kernels.forceEnergyKernel.setArg<cl::Buffer>(17, interactingAtoms->getDeviceBuffer());
kernels.forceEnergyKernel.setArg<cl::Buffer>(17, interactingAtoms.getDeviceBuffer());
}
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles);
}
forceRebuildNeighborList = true;
......@@ -506,7 +472,7 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
defines["PADDING"] = context.doubleToString(padding);
defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["SIMD_WIDTH"] = context.intToString(context.getSIMDWidth());
if (usePeriodic)
......@@ -523,36 +489,36 @@ void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
kernels.findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
kernels.findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(6, context.getPosq().getDeviceBuffer());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(7, blockCenter->getDeviceBuffer());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(8, blockBoundingBox->getDeviceBuffer());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(9, rebuildNeighborList->getDeviceBuffer());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(10, sortedBlocks->getDeviceBuffer());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(7, blockCenter.getDeviceBuffer());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(8, blockBoundingBox.getDeviceBuffer());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(9, rebuildNeighborList.getDeviceBuffer());
kernels.findBlockBoundsKernel.setArg<cl::Buffer>(10, sortedBlocks.getDeviceBuffer());
kernels.sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData");
kernels.sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks->getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter->getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(2, blockBoundingBox->getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(3, sortedBlockCenter->getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(4, sortedBlockBoundingBox->getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(2, blockBoundingBox.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(3, sortedBlockCenter.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(4, sortedBlockBoundingBox.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions->getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList->getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList.getDeviceBuffer());
kernels.sortBoxDataKernel.setArg<cl_int>(9, true);
kernels.findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(8, context.getPosq().getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl_uint>(9, interactingTiles->getSize());
kernels.findInteractingBlocksKernel.setArg<cl_uint>(9, interactingTiles.getSize());
kernels.findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex);
kernels.findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks);
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(12, sortedBlocks->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(13, sortedBlockCenter->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(14, sortedBlockBoundingBox->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(15, exclusionIndices->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(16, exclusionRowIndices->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(17, oldPositions->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(18, rebuildNeighborList->getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(12, sortedBlocks.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(13, sortedBlockCenter.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(14, sortedBlockBoundingBox.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(15, exclusionIndices.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(16, exclusionRowIndices.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(17, oldPositions.getDeviceBuffer());
kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(18, rebuildNeighborList.getDeviceBuffer());
if (kernels.findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) {
// The device can't handle this block size, so reduce it.
......@@ -700,7 +666,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["TILE_SIZE"] = context.intToString(OpenCLContext::TileSize);
int numExclusionTiles = exclusionTiles->getSize();
int numExclusionTiles = exclusionTiles.getSize();
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
int numContexts = context.getPlatformData().contexts.size();
int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;
......@@ -726,18 +692,18 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusions.getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionTiles.getDeviceBuffer());
kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, numTiles);
if (useCutoff) {
kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactingTiles.getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount.getDeviceBuffer());
index += 5; // The periodic box size arguments are set when the kernel is executed.
kernel.setArg<cl_uint>(index++, interactingTiles->getSize());
kernel.setArg<cl::Buffer>(index++, blockCenter->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, blockBoundingBox->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactingAtoms->getDeviceBuffer());
kernel.setArg<cl_uint>(index++, interactingTiles.getSize());
kernel.setArg<cl::Buffer>(index++, blockCenter.getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, blockBoundingBox.getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactingAtoms.getDeviceBuffer());
}
for (int i = 0; i < (int) params.size(); i++) {
kernel.setArg<cl::Memory>(index++, params[i].getMemory());
......
......@@ -118,14 +118,12 @@ private:
OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
tileCounts(data.contexts.size()), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedPositionMemory(NULL), pinnedForceBuffer(NULL), pinnedForceMemory(NULL) {
tileCounts(data.contexts.size()), pinnedPositionBuffer(NULL), pinnedPositionMemory(NULL), pinnedForceBuffer(NULL), pinnedForceMemory(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new OpenCLCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
}
OpenCLParallelCalcForcesAndEnergyKernel::~OpenCLParallelCalcForcesAndEnergyKernel() {
if (contextForces != NULL)
delete contextForces;
if (pinnedPositionBuffer != NULL)
delete pinnedPositionBuffer;
if (pinnedForceBuffer != NULL)
......@@ -142,8 +140,8 @@ void OpenCLParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
void OpenCLParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
OpenCLContext& cl0 = *data.contexts[0];
int elementSize = (cl0.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
if (contextForces == NULL) {
contextForces = OpenCLArray::create<mm_float4>(cl0, &cl0.getForceBuffers().getDeviceBuffer(),
if (!contextForces.isInitialized()) {
contextForces.initialize<mm_float4>(cl0, &cl0.getForceBuffers().getDeviceBuffer(),
data.contexts.size()*cl0.getPaddedNumAtoms(), "contextForces");
int bufferBytes = (data.contexts.size()-1)*cl0.getPaddedNumAtoms()*elementSize;
pinnedPositionBuffer = new cl::Buffer(cl0.getContext(), CL_MEM_ALLOC_HOST_PTR, bufferBytes);
......@@ -179,9 +177,9 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
OpenCLContext& cl = *data.contexts[0];
int numAtoms = cl.getPaddedNumAtoms();
int elementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
cl.getQueue().enqueueWriteBuffer(contextForces->getDeviceBuffer(), CL_FALSE, numAtoms*elementSize,
cl.getQueue().enqueueWriteBuffer(contextForces.getDeviceBuffer(), CL_FALSE, numAtoms*elementSize,
numAtoms*(data.contexts.size()-1)*elementSize, pinnedForceMemory);
cl.reduceBuffer(*contextForces, data.contexts.size());
cl.reduceBuffer(contextForces, data.contexts.size());
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
......
......@@ -31,8 +31,7 @@
using namespace OpenMM;
using namespace std;
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), dataLength(length) {
// Create kernels.
std::map<std::string, std::string> replacements;
......@@ -81,26 +80,16 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
// Create workspace arrays.
if (!isShortList) {
dataRange = new OpenCLArray(context, 2, trait->getKeySize(), "sortDataRange");
bucketOffset = OpenCLArray::create<cl_uint>(context, numBuckets, "bucketOffset");
bucketOfElement = OpenCLArray::create<cl_uint>(context, length, "bucketOfElement");
offsetInBucket = OpenCLArray::create<cl_uint>(context, length, "offsetInBucket");
buckets = new OpenCLArray(context, length, trait->getDataSize(), "buckets");
dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
bucketOffset.initialize<cl_uint>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<cl_uint>(context, length, "bucketOfElement");
offsetInBucket.initialize<cl_uint>(context, length, "offsetInBucket");
buckets.initialize(context, length, trait->getDataSize(), "buckets");
}
}
OpenCLSort::~OpenCLSort() {
delete trait;
if (dataRange != NULL)
delete dataRange;
if (bucketOfElement != NULL)
delete bucketOfElement;
if (offsetInBucket != NULL)
delete offsetInBucket;
if (bucketOffset != NULL)
delete bucketOffset;
if (buckets != NULL)
delete buckets;
}
void OpenCLSort::sort(OpenCLArray& data) {
......@@ -119,14 +108,14 @@ void OpenCLSort::sort(OpenCLArray& data) {
else {
// Compute the range of data values.
unsigned int numBuckets = bucketOffset->getSize();
unsigned int numBuckets = bucketOffset.getSize();
computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
computeRangeKernel.setArg<cl_uint>(1, data.getSize());
computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
computeRangeKernel.setArg<cl::Buffer>(2, dataRange.getDeviceBuffer());
computeRangeKernel.setArg(3, rangeKernelSize*trait->getKeySize(), NULL);
computeRangeKernel.setArg(4, rangeKernelSize*trait->getKeySize(), NULL);
computeRangeKernel.setArg<cl_int>(5, numBuckets);
computeRangeKernel.setArg<cl::Buffer>(6, bucketOffset->getDeviceBuffer());
computeRangeKernel.setArg<cl::Buffer>(6, bucketOffset.getDeviceBuffer());
context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);
// Assign array elements to buckets.
......@@ -134,35 +123,35 @@ void OpenCLSort::sort(OpenCLArray& data) {
assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
assignElementsKernel.setArg<cl_int>(1, data.getSize());
assignElementsKernel.setArg<cl_int>(2, numBuckets);
assignElementsKernel.setArg<cl::Buffer>(3, dataRange->getDeviceBuffer());
assignElementsKernel.setArg<cl::Buffer>(4, bucketOffset->getDeviceBuffer());
assignElementsKernel.setArg<cl::Buffer>(5, bucketOfElement->getDeviceBuffer());
assignElementsKernel.setArg<cl::Buffer>(6, offsetInBucket->getDeviceBuffer());
assignElementsKernel.setArg<cl::Buffer>(3, dataRange.getDeviceBuffer());
assignElementsKernel.setArg<cl::Buffer>(4, bucketOffset.getDeviceBuffer());
assignElementsKernel.setArg<cl::Buffer>(5, bucketOfElement.getDeviceBuffer());
assignElementsKernel.setArg<cl::Buffer>(6, offsetInBucket.getDeviceBuffer());
context.executeKernel(assignElementsKernel, data.getSize());
// Compute the position of each bucket.
computeBucketPositionsKernel.setArg<cl_int>(0, numBuckets);
computeBucketPositionsKernel.setArg<cl::Buffer>(1, bucketOffset->getDeviceBuffer());
computeBucketPositionsKernel.setArg<cl::Buffer>(1, bucketOffset.getDeviceBuffer());
computeBucketPositionsKernel.setArg(2, positionsKernelSize*sizeof(cl_int), NULL);
context.executeKernel(computeBucketPositionsKernel, positionsKernelSize, positionsKernelSize);
// Copy the data into the buckets.
copyToBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
copyToBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
copyToBucketsKernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
copyToBucketsKernel.setArg<cl_int>(2, data.getSize());
copyToBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
copyToBucketsKernel.setArg<cl::Buffer>(4, bucketOfElement->getDeviceBuffer());
copyToBucketsKernel.setArg<cl::Buffer>(5, offsetInBucket->getDeviceBuffer());
copyToBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer());
copyToBucketsKernel.setArg<cl::Buffer>(4, bucketOfElement.getDeviceBuffer());
copyToBucketsKernel.setArg<cl::Buffer>(5, offsetInBucket.getDeviceBuffer());
context.executeKernel(copyToBucketsKernel, data.getSize());
// Sort each bucket.
sortBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
sortBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
sortBucketsKernel.setArg<cl::Buffer>(1, buckets.getDeviceBuffer());
sortBucketsKernel.setArg<cl_int>(2, numBuckets);
sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset.getDeviceBuffer());
sortBucketsKernel.setArg(4, sortKernelSize*trait->getDataSize(), NULL);
context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
}
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013-2015 Stanford University and the Authors. *
* Portions copyright (c) 2013-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -107,13 +107,6 @@ private:
const DrudeForce& force;
};
OpenCLCalcDrudeForceKernel::~OpenCLCalcDrudeForceKernel() {
if (particleParams != NULL)
delete particleParams;
if (pairParams != NULL)
delete pairParams;
}
void OpenCLCalcDrudeForceKernel::initialize(const System& system, const DrudeForce& force) {
int numContexts = cl.getPlatformData().contexts.size();
int startParticleIndex = cl.getContextIndex()*force.getNumParticles()/numContexts;
......@@ -123,7 +116,7 @@ void OpenCLCalcDrudeForceKernel::initialize(const System& system, const DrudeFor
// Create the harmonic interaction .
vector<vector<int> > atoms(numParticles, vector<int>(5));
particleParams = OpenCLArray::create<mm_float4>(cl, numParticles, "drudeParticleParams");
particleParams.initialize<mm_float4>(cl, numParticles, "drudeParticleParams");
vector<mm_float4> paramVector(numParticles);
for (int i = 0; i < numParticles; i++) {
double charge, polarizability, aniso12, aniso34;
......@@ -145,9 +138,9 @@ void OpenCLCalcDrudeForceKernel::initialize(const System& system, const DrudeFor
}
paramVector[i] = mm_float4((float) k1, (float) k2, (float) k3, 0.0f);
}
particleParams->upload(paramVector);
particleParams.upload(paramVector);
map<string, string> replacements;
replacements["PARAMS"] = cl.getBondedUtilities().addArgument(particleParams->getDeviceBuffer(), "float4");
replacements["PARAMS"] = cl.getBondedUtilities().addArgument(particleParams.getDeviceBuffer(), "float4");
cl.getBondedUtilities().addInteraction(atoms, cl.replaceStrings(OpenCLDrudeKernelSources::drudeParticleForce, replacements), force.getForceGroup());
}
int startPairIndex = cl.getContextIndex()*force.getNumScreenedPairs()/numContexts;
......@@ -157,7 +150,7 @@ void OpenCLCalcDrudeForceKernel::initialize(const System& system, const DrudeFor
// Create the screened interaction between dipole pairs.
vector<vector<int> > atoms(numPairs, vector<int>(4));
pairParams = OpenCLArray::create<mm_float2>(cl, numPairs, "drudePairParams");
pairParams.initialize<mm_float2>(cl, numPairs, "drudePairParams");
vector<mm_float2> paramVector(numPairs);
for (int i = 0; i < numPairs; i++) {
int drude1, drude2;
......@@ -171,9 +164,9 @@ void OpenCLCalcDrudeForceKernel::initialize(const System& system, const DrudeFor
double energyScale = ONE_4PI_EPS0*charge1*charge2;
paramVector[i] = mm_float2((float) screeningScale, (float) energyScale);
}
pairParams->upload(paramVector);
pairParams.upload(paramVector);
map<string, string> replacements;
replacements["PARAMS"] = cl.getBondedUtilities().addArgument(pairParams->getDeviceBuffer(), "float2");
replacements["PARAMS"] = cl.getBondedUtilities().addArgument(pairParams.getDeviceBuffer(), "float2");
cl.getBondedUtilities().addInteraction(atoms, cl.replaceStrings(OpenCLDrudeKernelSources::drudePairForce, replacements), force.getForceGroup());
}
cl.addForce(new OpenCLDrudeForceInfo(force));
......@@ -192,7 +185,7 @@ void OpenCLCalcDrudeForceKernel::copyParametersToContext(ContextImpl& context, c
int endParticleIndex = (cl.getContextIndex()+1)*force.getNumParticles()/numContexts;
int numParticles = endParticleIndex-startParticleIndex;
if (numParticles > 0) {
if (particleParams == NULL || numParticles != particleParams->getSize())
if (!particleParams.isInitialized() || numParticles != particleParams.getSize())
throw OpenMMException("updateParametersInContext: The number of Drude particles has changed");
vector<mm_float4> paramVector(numParticles);
for (int i = 0; i < numParticles; i++) {
......@@ -211,7 +204,7 @@ void OpenCLCalcDrudeForceKernel::copyParametersToContext(ContextImpl& context, c
k2 = 0;
paramVector[i] = mm_float4((float) k1, (float) k2, (float) k3, 0.0f);
}
particleParams->upload(paramVector);
particleParams.upload(paramVector);
}
// Set the pair parameters.
......@@ -220,7 +213,7 @@ void OpenCLCalcDrudeForceKernel::copyParametersToContext(ContextImpl& context, c
int endPairIndex = (cl.getContextIndex()+1)*force.getNumScreenedPairs()/numContexts;
int numPairs = endPairIndex-startPairIndex;
if (numPairs > 0) {
if (pairParams == NULL || numPairs != pairParams->getSize())
if (!pairParams.isInitialized() || numPairs != pairParams.getSize())
throw OpenMMException("updateParametersInContext: The number of screened pairs has changed");
vector<mm_float2> paramVector(numPairs);
for (int i = 0; i < numPairs; i++) {
......@@ -235,17 +228,10 @@ void OpenCLCalcDrudeForceKernel::copyParametersToContext(ContextImpl& context, c
double energyScale = ONE_4PI_EPS0*charge1*charge2;
paramVector[i] = mm_float2((float) screeningScale, (float) energyScale);
}
pairParams->upload(paramVector);
pairParams.upload(paramVector);
}
}
OpenCLIntegrateDrudeLangevinStepKernel::~OpenCLIntegrateDrudeLangevinStepKernel() {
if (normalParticles != NULL)
delete normalParticles;
if (pairParticles != NULL)
delete pairParticles;
}
void OpenCLIntegrateDrudeLangevinStepKernel::initialize(const System& system, const DrudeLangevinIntegrator& integrator, const DrudeForce& force) {
cl.getPlatformData().initializeContexts(system);
cl.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
......@@ -266,12 +252,12 @@ void OpenCLIntegrateDrudeLangevinStepKernel::initialize(const System& system, co
pairParticleVec.push_back(mm_int2(p, p1));
}
normalParticleVec.insert(normalParticleVec.begin(), particles.begin(), particles.end());
normalParticles = OpenCLArray::create<int>(cl, max((int) normalParticleVec.size(), 1), "drudeNormalParticles");
pairParticles = OpenCLArray::create<cl_int2>(cl, max((int) pairParticleVec.size(), 1), "drudePairParticles");
normalParticles.initialize<int>(cl, max((int) normalParticleVec.size(), 1), "drudeNormalParticles");
pairParticles.initialize<cl_int2>(cl, max((int) pairParticleVec.size(), 1), "drudePairParticles");
if (normalParticleVec.size() > 0)
normalParticles->upload(normalParticleVec);
normalParticles.upload(normalParticleVec);
if (pairParticleVec.size() > 0)
pairParticles->upload(pairParticleVec);
pairParticles.upload(pairParticleVec);
// Create kernels.
......@@ -296,8 +282,8 @@ void OpenCLIntegrateDrudeLangevinStepKernel::execute(ContextImpl& context, const
kernel1.setArg<cl::Buffer>(0, cl.getVelm().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(1, cl.getForce().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(2, integration.getPosDelta().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, normalParticles->getDeviceBuffer());
kernel1.setArg<cl::Buffer>(4, pairParticles->getDeviceBuffer());
kernel1.setArg<cl::Buffer>(3, normalParticles.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(4, pairParticles.getDeviceBuffer());
kernel1.setArg<cl::Buffer>(5, integration.getStepSize().getDeviceBuffer());
kernel1.setArg<cl::Buffer>(12, integration.getRandom().getDeviceBuffer());
kernel2.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
......@@ -314,7 +300,7 @@ void OpenCLIntegrateDrudeLangevinStepKernel::execute(ContextImpl& context, const
else
hardwallKernel.setArg<void*>(1, NULL);
hardwallKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
hardwallKernel.setArg<cl::Buffer>(3, pairParticles->getDeviceBuffer());
hardwallKernel.setArg<cl::Buffer>(3, pairParticles.getDeviceBuffer());
hardwallKernel.setArg<cl::Buffer>(4, integration.getStepSize().getDeviceBuffer());
}
......@@ -363,7 +349,7 @@ void OpenCLIntegrateDrudeLangevinStepKernel::execute(ContextImpl& context, const
// Call the first integration kernel.
kernel1.setArg<cl_uint>(13, integration.prepareRandomNumbers(normalParticles->getSize()+2*pairParticles->getSize()));
kernel1.setArg<cl_uint>(13, integration.prepareRandomNumbers(normalParticles.getSize()+2*pairParticles.getSize()));
cl.executeKernel(kernel1, numAtoms);
// Apply constraints.
......@@ -377,7 +363,7 @@ void OpenCLIntegrateDrudeLangevinStepKernel::execute(ContextImpl& context, const
// Apply hard wall constraints.
if (maxDrudeDistance > 0)
cl.executeKernel(hardwallKernel, pairParticles->getSize());
cl.executeKernel(hardwallKernel, pairParticles.getSize());
integration.computeVirtualSites();
// Update the time and step count.
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013-2015 Stanford University and the Authors. *
* Portions copyright (c) 2013-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -45,9 +45,8 @@ namespace OpenMM {
class OpenCLCalcDrudeForceKernel : public CalcDrudeForceKernel {
public:
OpenCLCalcDrudeForceKernel(std::string name, const Platform& platform, OpenCLContext& cl) :
CalcDrudeForceKernel(name, platform), cl(cl), particleParams(NULL), pairParams(NULL) {
CalcDrudeForceKernel(name, platform), cl(cl) {
}
~OpenCLCalcDrudeForceKernel();
/**
* Initialize the kernel.
*
......@@ -73,8 +72,8 @@ public:
void copyParametersToContext(ContextImpl& context, const DrudeForce& force);
private:
OpenCLContext& cl;
OpenCLArray* particleParams;
OpenCLArray* pairParams;
OpenCLArray particleParams;
OpenCLArray pairParams;
};
/**
......@@ -83,9 +82,8 @@ private:
class OpenCLIntegrateDrudeLangevinStepKernel : public IntegrateDrudeLangevinStepKernel {
public:
OpenCLIntegrateDrudeLangevinStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) :
IntegrateDrudeLangevinStepKernel(name, platform), cl(cl), hasInitializedKernels(false), normalParticles(NULL), pairParticles(NULL) {
IntegrateDrudeLangevinStepKernel(name, platform), cl(cl), hasInitializedKernels(false) {
}
~OpenCLIntegrateDrudeLangevinStepKernel();
/**
* Initialize the kernel.
*
......@@ -112,8 +110,8 @@ private:
OpenCLContext& cl;
bool hasInitializedKernels;
double prevStepSize;
OpenCLArray* normalParticles;
OpenCLArray* pairParticles;
OpenCLArray normalParticles;
OpenCLArray pairParticles;
cl::Kernel kernel1, kernel2, hardwallKernel;
};
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Portions copyright (c) 2011-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -41,19 +41,6 @@
using namespace OpenMM;
using namespace std;
OpenCLIntegrateRPMDStepKernel::~OpenCLIntegrateRPMDStepKernel() {
if (forces != NULL)
delete forces;
if (positions != NULL)
delete positions;
if (velocities != NULL)
delete velocities;
if (contractedForces != NULL)
delete contractedForces;
if (contractedPositions != NULL)
delete contractedPositions;
}
void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
cl.getPlatformData().initializeContexts(system);
numCopies = integrator.getNumCopies();
......@@ -63,32 +50,32 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
int paddedParticles = cl.getPaddedNumAtoms();
int forceElementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
forces = new OpenCLArray(cl, numCopies*paddedParticles, forceElementSize, "rpmdForces");
forces.initialize(cl, numCopies*paddedParticles, forceElementSize, "rpmdForces");
bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4));
positions = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdPositions");
velocities = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdVelocities");
positions.initialize(cl, numCopies*paddedParticles, elementSize, "rpmdPositions");
velocities.initialize(cl, numCopies*paddedParticles, elementSize, "rpmdVelocities");
cl.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
// Fill in the posq and velm arrays with safe values to avoid a risk of nans.
if (useDoublePrecision) {
vector<mm_double4> temp(positions->getSize());
for (int i = 0; i < positions->getSize(); i++)
vector<mm_double4> temp(positions.getSize());
for (int i = 0; i < positions.getSize(); i++)
temp[i] = mm_double4(0, 0, 0, 0);
positions->upload(temp);
for (int i = 0; i < velocities->getSize(); i++)
positions.upload(temp);
for (int i = 0; i < velocities.getSize(); i++)
temp[i] = mm_double4(0, 0, 0, 1);
velocities->upload(temp);
velocities.upload(temp);
}
else {
vector<mm_float4> temp(positions->getSize());
for (int i = 0; i < positions->getSize(); i++)
vector<mm_float4> temp(positions.getSize());
for (int i = 0; i < positions.getSize(); i++)
temp[i] = mm_float4(0, 0, 0, 0);
positions->upload(temp);
for (int i = 0; i < velocities->getSize(); i++)
positions.upload(temp);
for (int i = 0; i < velocities.getSize(); i++)
temp[i] = mm_float4(0, 0, 0, 1);
velocities->upload(temp);
velocities.upload(temp);
}
// Build a list of contractions.
......@@ -117,8 +104,8 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
}
}
if (maxContractedCopies > 0) {
contractedForces = new OpenCLArray(cl, maxContractedCopies*paddedParticles, forceElementSize, "rpmdContractedForces");
contractedPositions = new OpenCLArray(cl, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
contractedForces.initialize(cl, maxContractedCopies*paddedParticles, forceElementSize, "rpmdContractedForces");
contractedPositions.initialize(cl, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
}
// Create kernels.
......@@ -164,30 +151,30 @@ void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDI
void OpenCLIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) {
hasInitializedKernel = true;
pileKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
stepKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
stepKernel.setArg<cl::Buffer>(1, velocities->getDeviceBuffer());
stepKernel.setArg<cl::Buffer>(2, forces->getDeviceBuffer());
velocitiesKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
velocitiesKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
translateKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
pileKernel.setArg<cl::Buffer>(0, velocities.getDeviceBuffer());
stepKernel.setArg<cl::Buffer>(0, positions.getDeviceBuffer());
stepKernel.setArg<cl::Buffer>(1, velocities.getDeviceBuffer());
stepKernel.setArg<cl::Buffer>(2, forces.getDeviceBuffer());
velocitiesKernel.setArg<cl::Buffer>(0, velocities.getDeviceBuffer());
velocitiesKernel.setArg<cl::Buffer>(1, forces.getDeviceBuffer());
translateKernel.setArg<cl::Buffer>(0, positions.getDeviceBuffer());
translateKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
translateKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(0, velocities.getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(3, cl.getPosq().getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(4, cl.getAtomIndexArray().getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(3, velocities->getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(3, velocities.getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(4, cl.getPosq().getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(6, cl.getAtomIndexArray().getDeviceBuffer());
for (auto& g : groupsByCopies) {
int copies = g.first;
positionContractionKernels[copies].setArg<cl::Buffer>(0, positions->getDeviceBuffer());
positionContractionKernels[copies].setArg<cl::Buffer>(1, contractedPositions->getDeviceBuffer());
forceContractionKernels[copies].setArg<cl::Buffer>(0, forces->getDeviceBuffer());
forceContractionKernels[copies].setArg<cl::Buffer>(1, contractedForces->getDeviceBuffer());
positionContractionKernels[copies].setArg<cl::Buffer>(0, positions.getDeviceBuffer());
positionContractionKernels[copies].setArg<cl::Buffer>(1, contractedPositions.getDeviceBuffer());
forceContractionKernels[copies].setArg<cl::Buffer>(0, forces.getDeviceBuffer());
forceContractionKernels[copies].setArg<cl::Buffer>(1, contractedForces.getDeviceBuffer());
}
}
......@@ -261,9 +248,9 @@ void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDInte
void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
// Compute forces from all groups that didn't have a specified contraction.
copyToContextKernel.setArg<cl::Buffer>(2, positions->getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(5, positions->getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(2, positions.getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(1, forces.getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(5, positions.getDeviceBuffer());
for (int i = 0; i < numCopies; i++) {
copyToContextKernel.setArg<cl_int>(5, i);
cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
......@@ -283,9 +270,9 @@ void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
// Now loop over contractions and compute forces from them.
if (groupsByCopies.size() > 0) {
copyToContextKernel.setArg<cl::Buffer>(2, contractedPositions->getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(1, contractedForces->getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(5, contractedPositions->getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(2, contractedPositions.getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(1, contractedForces.getDeviceBuffer());
copyFromContextKernel.setArg<cl::Buffer>(5, contractedPositions.getDeviceBuffer());
for (auto& g : groupsByCopies) {
int copies = g.first;
int groupFlags = g.second;
......@@ -313,7 +300,7 @@ void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
if (groupsByCopies.size() > 0) {
// Ensure the Context contains the positions from the last copy, since we'll assume that later.
copyToContextKernel.setArg<cl::Buffer>(2, positions->getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(2, positions.getDeviceBuffer());
copyToContextKernel.setArg<cl_int>(5, numCopies-1);
cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
}
......@@ -324,7 +311,7 @@ double OpenCLIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context,
}
void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) {
if (positions == NULL)
if (!positions.isInitialized())
throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
if (pos.size() != numParticles)
throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
......@@ -346,7 +333,7 @@ void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& p
cl.getPosq().download(posq);
for (int i = 0; i < numParticles; i++)
posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posq[i].w);
cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
cl.getQueue().enqueueWriteBuffer(positions.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
}
else if (cl.getUseMixedPrecision()) {
vector<mm_float4> posqf(cl.getPaddedNumAtoms());
......@@ -354,19 +341,19 @@ void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& p
vector<mm_double4> posq(cl.getPaddedNumAtoms());
for (int i = 0; i < numParticles; i++)
posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posqf[i].w);
cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
cl.getQueue().enqueueWriteBuffer(positions.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
}
else {
vector<mm_float4> posq(cl.getPaddedNumAtoms());
cl.getPosq().download(posq);
for (int i = 0; i < numParticles; i++)
posq[i] = mm_float4((cl_float) offsetPos[i][0], (cl_float) offsetPos[i][1], (cl_float) offsetPos[i][2], posq[i].w);
cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &posq[0]);
cl.getQueue().enqueueWriteBuffer(positions.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &posq[0]);
}
}
void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
if (velocities == NULL)
if (!velocities.isInitialized())
throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
if (vel.size() != numParticles)
throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
......@@ -375,21 +362,21 @@ void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>&
cl.getVelm().download(velm);
for (int i = 0; i < numParticles; i++)
velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &velm[0]);
cl.getQueue().enqueueWriteBuffer(velocities.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &velm[0]);
}
else {
vector<mm_float4> velm(cl.getPaddedNumAtoms());
cl.getVelm().download(velm);
for (int i = 0; i < numParticles; i++)
velm[i] = mm_float4((cl_float) vel[i][0], (cl_float) vel[i][1], (cl_float) vel[i][2], velm[i].w);
cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &velm[0]);
cl.getQueue().enqueueWriteBuffer(velocities.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &velm[0]);
}
}
void OpenCLIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
if (!hasInitializedKernel)
initializeKernels(context);
copyToContextKernel.setArg<cl::Buffer>(2, positions->getDeviceBuffer());
copyToContextKernel.setArg<cl::Buffer>(2, positions.getDeviceBuffer());
copyToContextKernel.setArg<cl_int>(5, copy);
cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
}
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Portions copyright (c) 2011-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -45,9 +45,8 @@ namespace OpenMM {
class OpenCLIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
public:
OpenCLIntegrateRPMDStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) :
IntegrateRPMDStepKernel(name, platform), cl(cl), hasInitializedKernel(false), forces(NULL), positions(NULL), velocities(NULL), contractedForces(NULL), contractedPositions(NULL) {
IntegrateRPMDStepKernel(name, platform), cl(cl), hasInitializedKernel(false) {
}
~OpenCLIntegrateRPMDStepKernel();
/**
* Initialize the kernel.
*
......@@ -92,11 +91,11 @@ private:
int numCopies, numParticles, workgroupSize;
std::map<int, int> groupsByCopies;
int groupsNotContracted;
OpenCLArray* forces;
OpenCLArray* positions;
OpenCLArray* velocities;
OpenCLArray* contractedForces;
OpenCLArray* contractedPositions;
OpenCLArray forces;
OpenCLArray positions;
OpenCLArray velocities;
OpenCLArray contractedForces;
OpenCLArray contractedPositions;
cl::Kernel pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
std::map<int, cl::Kernel> positionContractionKernels;
std::map<int, cl::Kernel> forceContractionKernels;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment