Commit b8c86406 authored by Peter Eastman's avatar Peter Eastman
Browse files

Began converting CudaArrays.

parent b33ee3b0
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. * * Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -57,6 +57,11 @@ public: ...@@ -57,6 +57,11 @@ public:
static CudaArray* create(CudaContext& context, int size, const std::string& name) { static CudaArray* create(CudaContext& context, int size, const std::string& name) {
return new CudaArray(context, size, sizeof(T), name); return new CudaArray(context, size, sizeof(T), name);
} }
/**
* Create an uninitialized CudaArray object. It does not point to any device memory,
* and cannot be used until initialize() is called on it.
*/
CudaArray();
/** /**
* Create a CudaArray object. * Create a CudaArray object.
* *
...@@ -67,6 +72,36 @@ public: ...@@ -67,6 +72,36 @@ public:
*/ */
CudaArray(CudaContext& context, int size, int elementSize, const std::string& name); CudaArray(CudaContext& context, int size, int elementSize, const std::string& name);
~CudaArray(); ~CudaArray();
/**
* Initialize this object.
*
* @param context the context for which to create the array
* @param size the number of elements in the array
* @param elementSize the size of each element in bytes
* @param name the name of the array
*/
void initialize(CudaContext& context, int size, int elementSize, const std::string& name);
/**
* Initialize this object. The template argument is the data type of each array element.
*
* @param context the context for which to create the array
* @param size the number of elements in the array
* @param name the name of the array
*/
template <class T>
void initialize(CudaContext& context, int size, const std::string& name) {
initialize(context, size, sizeof(T), name);
}
/**
* Recreate the internal storage to have a different size.
*/
void resize(int size);
/**
* Get whether this array has been initialized.
*/
bool isInitialized() const {
return (pointer != 0);
}
/** /**
* Get the number of elements in the array. * Get the number of elements in the array.
*/ */
...@@ -134,7 +169,7 @@ public: ...@@ -134,7 +169,7 @@ public:
*/ */
void copyTo(CudaArray& dest) const; void copyTo(CudaArray& dest) const;
private: private:
CudaContext& context; CudaContext* context;
CUdeviceptr pointer; CUdeviceptr pointer;
int size, elementSize; int size, elementSize;
bool ownsMemory; bool ownsMemory;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2016 Stanford University and the Authors. * * Portions copyright (c) 2011-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -81,7 +81,6 @@ namespace OpenMM { ...@@ -81,7 +81,6 @@ namespace OpenMM {
class OPENMM_EXPORT_CUDA CudaBondedUtilities { class OPENMM_EXPORT_CUDA CudaBondedUtilities {
public: public:
CudaBondedUtilities(CudaContext& context); CudaBondedUtilities(CudaContext& context);
~CudaBondedUtilities();
/** /**
* Add a bonded interaction. * Add a bonded interaction.
* *
...@@ -136,7 +135,7 @@ private: ...@@ -136,7 +135,7 @@ private:
std::vector<int> forceGroup; std::vector<int> forceGroup;
std::vector<CUdeviceptr> arguments; std::vector<CUdeviceptr> arguments;
std::vector<std::string> argTypes; std::vector<std::string> argTypes;
std::vector<std::vector<CudaArray*> > atomIndices; std::vector<std::vector<CudaArray> > atomIndices;
std::vector<std::string> prefixCode; std::vector<std::string> prefixCode;
std::vector<std::string> energyParameterDerivatives; std::vector<std::string> energyParameterDerivatives;
std::vector<void*> kernelArgs; std::vector<void*> kernelArgs;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2017 Stanford University and the Authors. * * Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
#include <builtin_types.h> #include <builtin_types.h>
#include <vector_functions.h> #include <vector_functions.h>
#include "windowsExportCuda.h" #include "windowsExportCuda.h"
#include "CudaArray.h"
#include "CudaPlatform.h" #include "CudaPlatform.h"
#include "openmm/Kernel.h" #include "openmm/Kernel.h"
...@@ -48,7 +49,6 @@ typedef unsigned int tileflags; ...@@ -48,7 +49,6 @@ typedef unsigned int tileflags;
namespace OpenMM { namespace OpenMM {
class CudaArray;
class CudaForceInfo; class CudaForceInfo;
class CudaExpressionUtilities; class CudaExpressionUtilities;
class CudaIntegrationUtilities; class CudaIntegrationUtilities;
...@@ -152,37 +152,37 @@ public: ...@@ -152,37 +152,37 @@ public:
* Get the array which contains the position (the xyz components) and charge (the w component) of each atom. * Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
*/ */
CudaArray& getPosq() { CudaArray& getPosq() {
return *posq; return posq;
} }
/** /**
* Get the array which contains a correction to the position of each atom. This only exists if getUseMixedPrecision() returns true. * Get the array which contains a correction to the position of each atom. This only exists if getUseMixedPrecision() returns true.
*/ */
CudaArray& getPosqCorrection() { CudaArray& getPosqCorrection() {
return *posqCorrection; return posqCorrection;
} }
/** /**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom. * Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/ */
CudaArray& getVelm() { CudaArray& getVelm() {
return *velm; return velm;
} }
/** /**
* Get the array which contains the force on each atom (represented as three long longs in 64 bit fixed point). * Get the array which contains the force on each atom (represented as three long longs in 64 bit fixed point).
*/ */
CudaArray& getForce() { CudaArray& getForce() {
return *force; return force;
} }
/** /**
* Get the array which contains the buffer in which energy is computed. * Get the array which contains the buffer in which energy is computed.
*/ */
CudaArray& getEnergyBuffer() { CudaArray& getEnergyBuffer() {
return *energyBuffer; return energyBuffer;
} }
/** /**
* Get the array which contains the buffer in which derivatives of the energy with respect to parameters are computed. * Get the array which contains the buffer in which derivatives of the energy with respect to parameters are computed.
*/ */
CudaArray& getEnergyParamDerivBuffer() { CudaArray& getEnergyParamDerivBuffer() {
return *energyParamDerivBuffer; return energyParamDerivBuffer;
} }
/** /**
* Get a pointer to a block of pinned memory that can be used for efficient transfers between host and device. * Get a pointer to a block of pinned memory that can be used for efficient transfers between host and device.
...@@ -201,7 +201,7 @@ public: ...@@ -201,7 +201,7 @@ public:
* Get the array which contains the index of each atom. * Get the array which contains the index of each atom.
*/ */
CudaArray& getAtomIndexArray() { CudaArray& getAtomIndexArray() {
return *atomIndexDevice; return atomIndexDevice;
} }
/** /**
* Get the number of cells by which the positions are offset. * Get the number of cells by which the positions are offset.
...@@ -649,15 +649,15 @@ private: ...@@ -649,15 +649,15 @@ private:
std::vector<MoleculeGroup> moleculeGroups; std::vector<MoleculeGroup> moleculeGroups;
std::vector<int4> posCellOffsets; std::vector<int4> posCellOffsets;
void* pinnedBuffer; void* pinnedBuffer;
CudaArray* posq; CudaArray posq;
CudaArray* posqCorrection; CudaArray posqCorrection;
CudaArray* velm; CudaArray velm;
CudaArray* force; CudaArray force;
CudaArray* energyBuffer; CudaArray energyBuffer;
CudaArray* energySum; CudaArray energySum;
CudaArray* energyParamDerivBuffer; CudaArray energyParamDerivBuffer;
CudaArray* atomIndexDevice; CudaArray atomIndexDevice;
CudaArray* chargeBuffer; CudaArray chargeBuffer;
std::vector<std::string> energyParamDerivNames; std::vector<std::string> energyParamDerivNames;
std::map<std::string, double> energyParamDerivWorkspace; std::map<std::string, double> energyParamDerivWorkspace;
std::vector<int> atomIndex; std::vector<int> atomIndex;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2017 Stanford University and the Authors. * * Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -47,20 +47,20 @@ public: ...@@ -47,20 +47,20 @@ public:
* Get the array which contains position deltas. * Get the array which contains position deltas.
*/ */
CudaArray& getPosDelta() { CudaArray& getPosDelta() {
return *posDelta; return posDelta;
} }
/** /**
* Get the array which contains random values. Each element is a float4, whose components * Get the array which contains random values. Each element is a float4, whose components
* are independent, normally distributed random numbers with mean 0 and variance 1. * are independent, normally distributed random numbers with mean 0 and variance 1.
*/ */
CudaArray& getRandom() { CudaArray& getRandom() {
return *random; return random;
} }
/** /**
* Get the array which contains the current step size. * Get the array which contains the current step size.
*/ */
CudaArray& getStepSize() { CudaArray& getStepSize() {
return *stepSize; return stepSize;
} }
/** /**
* Set the size to use for the next step. * Set the size to use for the next step.
...@@ -131,38 +131,38 @@ private: ...@@ -131,38 +131,38 @@ private:
CUfunction ccmaUpdateKernel; CUfunction ccmaUpdateKernel;
CUfunction vsitePositionKernel, vsiteForceKernel; CUfunction vsitePositionKernel, vsiteForceKernel;
CUfunction randomKernel, timeShiftKernel; CUfunction randomKernel, timeShiftKernel;
CudaArray* posDelta; CudaArray posDelta;
CudaArray* settleAtoms; CudaArray settleAtoms;
CudaArray* settleParams; CudaArray settleParams;
CudaArray* shakeAtoms; CudaArray shakeAtoms;
CudaArray* shakeParams; CudaArray shakeParams;
CudaArray* random; CudaArray random;
CudaArray* randomSeed; CudaArray randomSeed;
CudaArray* stepSize; CudaArray stepSize;
CudaArray* ccmaAtoms; CudaArray ccmaAtoms;
CudaArray* ccmaDistance; CudaArray ccmaDistance;
CudaArray* ccmaReducedMass; CudaArray ccmaReducedMass;
CudaArray* ccmaAtomConstraints; CudaArray ccmaAtomConstraints;
CudaArray* ccmaNumAtomConstraints; CudaArray ccmaNumAtomConstraints;
CudaArray* ccmaConstraintMatrixColumn; CudaArray ccmaConstraintMatrixColumn;
CudaArray* ccmaConstraintMatrixValue; CudaArray ccmaConstraintMatrixValue;
CudaArray* ccmaDelta1; CudaArray ccmaDelta1;
CudaArray* ccmaDelta2; CudaArray ccmaDelta2;
CudaArray* ccmaConverged; CudaArray ccmaConverged;
int* ccmaConvergedMemory; int* ccmaConvergedMemory;
CUdeviceptr ccmaConvergedDeviceMemory; CUdeviceptr ccmaConvergedDeviceMemory;
CUevent ccmaEvent; CUevent ccmaEvent;
CudaArray* vsite2AvgAtoms; CudaArray vsite2AvgAtoms;
CudaArray* vsite2AvgWeights; CudaArray vsite2AvgWeights;
CudaArray* vsite3AvgAtoms; CudaArray vsite3AvgAtoms;
CudaArray* vsite3AvgWeights; CudaArray vsite3AvgWeights;
CudaArray* vsiteOutOfPlaneAtoms; CudaArray vsiteOutOfPlaneAtoms;
CudaArray* vsiteOutOfPlaneWeights; CudaArray vsiteOutOfPlaneWeights;
CudaArray* vsiteLocalCoordsIndex; CudaArray vsiteLocalCoordsIndex;
CudaArray* vsiteLocalCoordsAtoms; CudaArray vsiteLocalCoordsAtoms;
CudaArray* vsiteLocalCoordsWeights; CudaArray vsiteLocalCoordsWeights;
CudaArray* vsiteLocalCoordsPos; CudaArray vsiteLocalCoordsPos;
CudaArray* vsiteLocalCoordsStartIndex; CudaArray vsiteLocalCoordsStartIndex;
int randomPos; int randomPos;
int lastSeed, numVsites; int lastSeed, numVsites;
double2 lastStepSize; double2 lastStepSize;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2016 Stanford University and the Authors. * * Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -164,61 +164,61 @@ public: ...@@ -164,61 +164,61 @@ public:
* Get the array containing the center of each atom block. * Get the array containing the center of each atom block.
*/ */
CudaArray& getBlockCenters() { CudaArray& getBlockCenters() {
return *blockCenter; return blockCenter;
} }
/** /**
* Get the array containing the dimensions of each atom block. * Get the array containing the dimensions of each atom block.
*/ */
CudaArray& getBlockBoundingBoxes() { CudaArray& getBlockBoundingBoxes() {
return *blockBoundingBox; return blockBoundingBox;
} }
/** /**
* Get the array whose first element contains the number of tiles with interactions. * Get the array whose first element contains the number of tiles with interactions.
*/ */
CudaArray& getInteractionCount() { CudaArray& getInteractionCount() {
return *interactionCount; return interactionCount;
} }
/** /**
* Get the array containing tiles with interactions. * Get the array containing tiles with interactions.
*/ */
CudaArray& getInteractingTiles() { CudaArray& getInteractingTiles() {
return *interactingTiles; return interactingTiles;
} }
/** /**
* Get the array containing the atoms in each tile with interactions. * Get the array containing the atoms in each tile with interactions.
*/ */
CudaArray& getInteractingAtoms() { CudaArray& getInteractingAtoms() {
return *interactingAtoms; return interactingAtoms;
} }
/** /**
* Get the array containing single pairs in the neighbor list. * Get the array containing single pairs in the neighbor list.
*/ */
CudaArray& getSinglePairs() { CudaArray& getSinglePairs() {
return *singlePairs; return singlePairs;
} }
/** /**
* Get the array containing exclusion flags. * Get the array containing exclusion flags.
*/ */
CudaArray& getExclusions() { CudaArray& getExclusions() {
return *exclusions; return exclusions;
} }
/** /**
* Get the array containing tiles with exclusions. * Get the array containing tiles with exclusions.
*/ */
CudaArray& getExclusionTiles() { CudaArray& getExclusionTiles() {
return *exclusionTiles; return exclusionTiles;
} }
/** /**
* Get the array containing the index into the exclusion array for each tile. * Get the array containing the index into the exclusion array for each tile.
*/ */
CudaArray& getExclusionIndices() { CudaArray& getExclusionIndices() {
return *exclusionIndices; return exclusionIndices;
} }
/** /**
* Get the array listing where the exclusion data starts for each row. * Get the array listing where the exclusion data starts for each row.
*/ */
CudaArray& getExclusionRowIndices() { CudaArray& getExclusionRowIndices() {
return *exclusionRowIndices; return exclusionRowIndices;
} }
/** /**
* Get the index of the first tile this context is responsible for processing. * Get the index of the first tile this context is responsible for processing.
...@@ -270,22 +270,22 @@ private: ...@@ -270,22 +270,22 @@ private:
class BlockSortTrait; class BlockSortTrait;
CudaContext& context; CudaContext& context;
std::map<int, KernelSet> groupKernels; std::map<int, KernelSet> groupKernels;
CudaArray* exclusionTiles; CudaArray exclusionTiles;
CudaArray* exclusions; CudaArray exclusions;
CudaArray* exclusionIndices; CudaArray exclusionIndices;
CudaArray* exclusionRowIndices; CudaArray exclusionRowIndices;
CudaArray* interactingTiles; CudaArray interactingTiles;
CudaArray* interactingAtoms; CudaArray interactingAtoms;
CudaArray* interactionCount; CudaArray interactionCount;
CudaArray* singlePairs; CudaArray singlePairs;
CudaArray* singlePairCount; CudaArray singlePairCount;
CudaArray* blockCenter; CudaArray blockCenter;
CudaArray* blockBoundingBox; CudaArray blockBoundingBox;
CudaArray* sortedBlocks; CudaArray sortedBlocks;
CudaArray* sortedBlockCenter; CudaArray sortedBlockCenter;
CudaArray* sortedBlockBoundingBox; CudaArray sortedBlockBoundingBox;
CudaArray* oldPositions; CudaArray oldPositions;
CudaArray* rebuildNeighborList; CudaArray rebuildNeighborList;
CudaSort* blockSorter; CudaSort* blockSorter;
CUevent downloadCountEvent; CUevent downloadCountEvent;
int* pinnedCountBuffer; int* pinnedCountBuffer;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. * * Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -87,11 +87,11 @@ public: ...@@ -87,11 +87,11 @@ public:
private: private:
CudaContext& context; CudaContext& context;
SortTrait* trait; SortTrait* trait;
CudaArray* dataRange; CudaArray dataRange;
CudaArray* bucketOfElement; CudaArray bucketOfElement;
CudaArray* offsetInBucket; CudaArray offsetInBucket;
CudaArray* bucketOffset; CudaArray bucketOffset;
CudaArray* buckets; CudaArray buckets;
CUfunction shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel; CUfunction shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize; unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList; bool isShortList;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2012 Stanford University and the Authors. * * Portions copyright (c) 2012-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -32,18 +32,15 @@ ...@@ -32,18 +32,15 @@
using namespace OpenMM; using namespace OpenMM;
CudaArray::CudaArray(CudaContext& context, int size, int elementSize, const std::string& name) : CudaArray::CudaArray() : pointer(0), ownsMemory(false) {
context(context), size(size), elementSize(elementSize), name(name), ownsMemory(true) { }
CUresult result = cuMemAlloc(&pointer, size*elementSize);
if (result != CUDA_SUCCESS) { CudaArray::CudaArray(CudaContext& context, int size, int elementSize, const std::string& name) : pointer(0) {
std::stringstream str; initialize(context, size, elementSize, name);
str<<"Error creating array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
} }
CudaArray::~CudaArray() { CudaArray::~CudaArray() {
if (ownsMemory && context.getContextIsValid()) { if (pointer != 0 && ownsMemory && context->getContextIsValid()) {
CUresult result = cuMemFree(pointer); CUresult result = cuMemFree(pointer);
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream str; std::stringstream str;
...@@ -53,12 +50,45 @@ CudaArray::~CudaArray() { ...@@ -53,12 +50,45 @@ CudaArray::~CudaArray() {
} }
} }
void CudaArray::initialize(CudaContext& context, int size, int elementSize, const std::string& name) {
if (this->pointer != 0)
throw OpenMMException("CudaArray has already been initialized");
this->context = &context;
this->size = size;
this->elementSize = elementSize;
this->name = name;
ownsMemory = true;
CUresult result = cuMemAlloc(&pointer, size*elementSize);
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error creating array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
void CudaArray::resize(int size) {
if (pointer == 0)
throw OpenMMException("CudaArray has not been initialized");
if (!ownsMemory)
throw OpenMMException("Cannot resize an array that does not own its storage");
CUresult result = cuMemFree(pointer);
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error deleting array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
pointer = 0;
initialize(*context, size, elementSize, name);
}
void CudaArray::upload(const void* data, bool blocking) { void CudaArray::upload(const void* data, bool blocking) {
if (pointer == 0)
throw OpenMMException("CudaArray has not been initialized");
CUresult result; CUresult result;
if (blocking) if (blocking)
result = cuMemcpyHtoD(pointer, data, size*elementSize); result = cuMemcpyHtoD(pointer, data, size*elementSize);
else else
result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, context.getCurrentStream()); result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, context->getCurrentStream());
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream str; std::stringstream str;
str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
...@@ -67,11 +97,13 @@ void CudaArray::upload(const void* data, bool blocking) { ...@@ -67,11 +97,13 @@ void CudaArray::upload(const void* data, bool blocking) {
} }
void CudaArray::download(void* data, bool blocking) const { void CudaArray::download(void* data, bool blocking) const {
if (pointer == 0)
throw OpenMMException("CudaArray has not been initialized");
CUresult result; CUresult result;
if (blocking) if (blocking)
result = cuMemcpyDtoH(data, pointer, size*elementSize); result = cuMemcpyDtoH(data, pointer, size*elementSize);
else else
result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, context.getCurrentStream()); result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, context->getCurrentStream());
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream str; std::stringstream str;
str<<"Error downloading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error downloading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
...@@ -80,9 +112,11 @@ void CudaArray::download(void* data, bool blocking) const { ...@@ -80,9 +112,11 @@ void CudaArray::download(void* data, bool blocking) const {
} }
void CudaArray::copyTo(CudaArray& dest) const { void CudaArray::copyTo(CudaArray& dest) const {
if (pointer == 0)
throw OpenMMException("CudaArray has not been initialized");
if (dest.getSize() != size || dest.getElementSize() != elementSize) if (dest.getSize() != size || dest.getElementSize() != elementSize)
throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array"); throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, context.getCurrentStream()); CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, context->getCurrentStream());
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream str; std::stringstream str;
str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2016 Stanford University and the Authors. * * Portions copyright (c) 2011-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -37,12 +37,6 @@ using namespace std; ...@@ -37,12 +37,6 @@ using namespace std;
CudaBondedUtilities::CudaBondedUtilities(CudaContext& context) : context(context), numForceBuffers(0), maxBonds(0), allGroups(0), hasInitializedKernels(false) { CudaBondedUtilities::CudaBondedUtilities(CudaContext& context) : context(context), numForceBuffers(0), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
} }
CudaBondedUtilities::~CudaBondedUtilities() {
for (int i = 0; i < (int) atomIndices.size(); i++)
for (int j = 0; j < (int) atomIndices[i].size(); j++)
delete atomIndices[i][j];
}
void CudaBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) { void CudaBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
if (atoms.size() > 0) { if (atoms.size() > 0) {
forceAtoms.push_back(atoms); forceAtoms.push_back(atoms);
...@@ -99,9 +93,9 @@ void CudaBondedUtilities::initialize(const System& system) { ...@@ -99,9 +93,9 @@ void CudaBondedUtilities::initialize(const System& system) {
for (int atom = 0; atom < width; atom++) for (int atom = 0; atom < width; atom++)
indexVec[bond*paddedWidth+atom] = forceAtoms[i][bond][startAtom+atom]; indexVec[bond*paddedWidth+atom] = forceAtoms[i][bond][startAtom+atom];
} }
CudaArray* indices = new CudaArray(context, numBonds, 4*paddedWidth, "bondedIndices"); atomIndices[i].push_back(CudaArray());
indices->upload(&indexVec[0]); atomIndices[i].back().initialize(context, numBonds, 4*paddedWidth, "bondedIndices");
atomIndices[i].push_back(indices); atomIndices[i].back().upload(&indexVec[0]);
startAtom += width; startAtom += width;
} }
} }
...@@ -115,7 +109,7 @@ void CudaBondedUtilities::initialize(const System& system) { ...@@ -115,7 +109,7 @@ void CudaBondedUtilities::initialize(const System& system) {
s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ"; s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
for (int force = 0; force < numForces; force++) { for (int force = 0; force < numForces; force++) {
for (int i = 0; i < (int) atomIndices[force].size(); i++) { for (int i = 0; i < (int) atomIndices[force].size(); i++) {
int indexWidth = atomIndices[force][i]->getElementSize()/4; int indexWidth = atomIndices[force][i].getElementSize()/4;
string indexType = "uint"+context.intToString(indexWidth); string indexType = "uint"+context.intToString(indexWidth);
s<<", const "<<indexType<<"* __restrict__ atomIndices"<<force<<"_"<<i; s<<", const "<<indexType<<"* __restrict__ atomIndices"<<force<<"_"<<i;
} }
...@@ -154,7 +148,7 @@ string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int ...@@ -154,7 +148,7 @@ string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int
s<<"for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < "<<numBonds<<"; index += blockDim.x*gridDim.x) {\n"; s<<"for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < "<<numBonds<<"; index += blockDim.x*gridDim.x) {\n";
int startAtom = 0; int startAtom = 0;
for (int i = 0; i < (int) atomIndices[forceIndex].size(); i++) { for (int i = 0; i < (int) atomIndices[forceIndex].size(); i++) {
int indexWidth = atomIndices[forceIndex][i]->getElementSize()/4; int indexWidth = atomIndices[forceIndex][i].getElementSize()/4;
string indexType = "uint"+context.intToString(indexWidth); string indexType = "uint"+context.intToString(indexWidth);
s<<" "<<indexType<<" atoms"<<i<<" = atomIndices"<<forceIndex<<"_"<<i<<"[index];\n"; s<<" "<<indexType<<" atoms"<<i<<" = atomIndices"<<forceIndex<<"_"<<i<<"[index];\n";
int atomsToLoad = min(indexWidth, numAtoms-startAtom); int atomsToLoad = min(indexWidth, numAtoms-startAtom);
...@@ -191,7 +185,7 @@ void CudaBondedUtilities::computeInteractions(int groups) { ...@@ -191,7 +185,7 @@ void CudaBondedUtilities::computeInteractions(int groups) {
kernelArgs.push_back(context.getPeriodicBoxVecZPointer()); kernelArgs.push_back(context.getPeriodicBoxVecZPointer());
for (int i = 0; i < (int) atomIndices.size(); i++) for (int i = 0; i < (int) atomIndices.size(); i++)
for (int j = 0; j < (int) atomIndices[i].size(); j++) for (int j = 0; j < (int) atomIndices[i].size(); j++)
kernelArgs.push_back(&atomIndices[i][j]->getDevicePointer()); kernelArgs.push_back(&atomIndices[i][j].getDevicePointer());
for (int i = 0; i < (int) arguments.size(); i++) for (int i = 0; i < (int) arguments.size(); i++)
kernelArgs.push_back(&arguments[i]); kernelArgs.push_back(&arguments[i]);
if (energyParameterDerivatives.size() > 0) if (energyParameterDerivatives.size() > 0)
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2017 Stanford University and the Authors. * * Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -108,8 +108,7 @@ static int executeInWindows(const string &command) { ...@@ -108,8 +108,7 @@ static int executeInWindows(const string &command) {
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler, CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0), const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false), isNvccAvailable(false),
pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), energySum(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), chargeBuffer(NULL), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
// Determine what compiler to use. // Determine what compiler to use.
this->compiler = "\""+compiler+"\""; this->compiler = "\""+compiler+"\"";
...@@ -268,8 +267,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -268,8 +267,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["BALLOT(var)"] = "__ballot(var);"; compilationDefines["BALLOT(var)"] = "__ballot(var);";
} }
if (useDoublePrecision) { if (useDoublePrecision) {
posq = CudaArray::create<double4>(*this, paddedNumAtoms, "posq"); posq.initialize<double4>(*this, paddedNumAtoms, "posq");
velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm"); velm.initialize<double4>(*this, paddedNumAtoms, "velm");
compilationDefines["USE_DOUBLE_PRECISION"] = "1"; compilationDefines["USE_DOUBLE_PRECISION"] = "1";
compilationDefines["make_real2"] = "make_double2"; compilationDefines["make_real2"] = "make_double2";
compilationDefines["make_real3"] = "make_double3"; compilationDefines["make_real3"] = "make_double3";
...@@ -279,9 +278,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -279,9 +278,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["make_mixed4"] = "make_double4"; compilationDefines["make_mixed4"] = "make_double4";
} }
else if (useMixedPrecision) { else if (useMixedPrecision) {
posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq"); posq.initialize<float4>(*this, paddedNumAtoms, "posq");
posqCorrection = CudaArray::create<float4>(*this, paddedNumAtoms, "posqCorrection"); posqCorrection.initialize<float4>(*this, paddedNumAtoms, "posqCorrection");
velm = CudaArray::create<double4>(*this, paddedNumAtoms, "velm"); velm.initialize<double4>(*this, paddedNumAtoms, "velm");
compilationDefines["USE_MIXED_PRECISION"] = "1"; compilationDefines["USE_MIXED_PRECISION"] = "1";
compilationDefines["make_real2"] = "make_float2"; compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3"; compilationDefines["make_real3"] = "make_float3";
...@@ -291,8 +290,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -291,8 +290,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["make_mixed4"] = "make_double4"; compilationDefines["make_mixed4"] = "make_double4";
} }
else { else {
posq = CudaArray::create<float4>(*this, paddedNumAtoms, "posq"); posq.initialize<float4>(*this, paddedNumAtoms, "posq");
velm = CudaArray::create<float4>(*this, paddedNumAtoms, "velm"); velm.initialize<float4>(*this, paddedNumAtoms, "velm");
compilationDefines["make_real2"] = "make_float2"; compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3"; compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4"; compilationDefines["make_real4"] = "make_float4";
...@@ -415,24 +414,6 @@ CudaContext::~CudaContext() { ...@@ -415,24 +414,6 @@ CudaContext::~CudaContext() {
delete computation; delete computation;
if (pinnedBuffer != NULL) if (pinnedBuffer != NULL)
cuMemFreeHost(pinnedBuffer); cuMemFreeHost(pinnedBuffer);
if (posq != NULL)
delete posq;
if (posqCorrection != NULL)
delete posqCorrection;
if (velm != NULL)
delete velm;
if (force != NULL)
delete force;
if (energyBuffer != NULL)
delete energyBuffer;
if (energySum != NULL)
delete energySum;
if (energyParamDerivBuffer != NULL)
delete energyParamDerivBuffer;
if (atomIndexDevice != NULL)
delete atomIndexDevice;
if (chargeBuffer != NULL)
delete chargeBuffer;
if (integration != NULL) if (integration != NULL)
delete integration; delete integration;
if (expression != NULL) if (expression != NULL)
...@@ -456,20 +437,20 @@ void CudaContext::initialize() { ...@@ -456,20 +437,20 @@ void CudaContext::initialize() {
string errorMessage = "Error initializing Context"; string errorMessage = "Error initializing Context";
int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()); int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) { if (useDoublePrecision) {
energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<double>(*this, 1, "energySum"); energySum.initialize<double>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else if (useMixedPrecision) { else if (useMixedPrecision) {
energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<double>(*this, 1, "energySum"); energySum.initialize<double>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else { else {
energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer");
energySum = CudaArray::create<float>(*this, 1, "energySum"); energySum.initialize<float>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
} }
...@@ -480,24 +461,24 @@ void CudaContext::initialize() { ...@@ -480,24 +461,24 @@ void CudaContext::initialize() {
else else
((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass)); ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
} }
velm->upload(pinnedBuffer); velm.upload(pinnedBuffer);
bonded->initialize(system); bonded->initialize(system);
force = CudaArray::create<long long>(*this, paddedNumAtoms*3, "force"); force.initialize<long long>(*this, paddedNumAtoms*3, "force");
addAutoclearBuffer(force->getDevicePointer(), force->getSize()*force->getElementSize()); addAutoclearBuffer(force.getDevicePointer(), force.getSize()*force.getElementSize());
addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize()*energyBuffer->getElementSize()); addAutoclearBuffer(energyBuffer.getDevicePointer(), energyBuffer.getSize()*energyBuffer.getElementSize());
int numEnergyParamDerivs = energyParamDerivNames.size(); int numEnergyParamDerivs = energyParamDerivNames.size();
if (numEnergyParamDerivs > 0) { if (numEnergyParamDerivs > 0) {
if (useDoublePrecision || useMixedPrecision) if (useDoublePrecision || useMixedPrecision)
energyParamDerivBuffer = CudaArray::create<double>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer"); energyParamDerivBuffer.initialize<double>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
else else
energyParamDerivBuffer = CudaArray::create<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer"); energyParamDerivBuffer.initialize<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
addAutoclearBuffer(*energyParamDerivBuffer); addAutoclearBuffer(energyParamDerivBuffer);
} }
atomIndexDevice = CudaArray::create<int>(*this, paddedNumAtoms, "atomIndex"); atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
atomIndex.resize(paddedNumAtoms); atomIndex.resize(paddedNumAtoms);
for (int i = 0; i < paddedNumAtoms; ++i) for (int i = 0; i < paddedNumAtoms; ++i)
atomIndex[i] = i; atomIndex[i] = i;
atomIndexDevice->upload(atomIndex); atomIndexDevice.upload(atomIndex);
findMoleculeGroups(); findMoleculeGroups();
nonbonded->initialize(system); nonbonded->initialize(system);
} }
...@@ -890,11 +871,11 @@ void CudaContext::clearAutoclearBuffers() { ...@@ -890,11 +871,11 @@ void CudaContext::clearAutoclearBuffers() {
} }
double CudaContext::reduceEnergy() { double CudaContext::reduceEnergy() {
int bufferSize = energyBuffer->getSize(); int bufferSize = energyBuffer.getSize();
int workGroupSize = 512; int workGroupSize = 512;
void* args[] = {&energyBuffer->getDevicePointer(), &energySum->getDevicePointer(), &bufferSize, &workGroupSize}; void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer->getElementSize()); executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize());
energySum->download(pinnedBuffer); energySum.download(pinnedBuffer);
if (getUseDoublePrecision() || getUseMixedPrecision()) if (getUseDoublePrecision() || getUseMixedPrecision())
return *((double*) pinnedBuffer); return *((double*) pinnedBuffer);
else else
...@@ -902,21 +883,21 @@ double CudaContext::reduceEnergy() { ...@@ -902,21 +883,21 @@ double CudaContext::reduceEnergy() {
} }
void CudaContext::setCharges(const vector<double>& charges) { void CudaContext::setCharges(const vector<double>& charges) {
if (chargeBuffer == NULL) if (!chargeBuffer.isInitialized())
chargeBuffer = new CudaArray(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer"); chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
if (getUseDoublePrecision()) { if (getUseDoublePrecision()) {
double* c = (double*) getPinnedBuffer(); double* c = (double*) getPinnedBuffer();
for (int i = 0; i < charges.size(); i++) for (int i = 0; i < charges.size(); i++)
c[i] = charges[i]; c[i] = charges[i];
chargeBuffer->upload(c); chargeBuffer.upload(c);
} }
else { else {
float* c = (float*) getPinnedBuffer(); float* c = (float*) getPinnedBuffer();
for (int i = 0; i < charges.size(); i++) for (int i = 0; i < charges.size(); i++)
c[i] = (float) charges[i]; c[i] = (float) charges[i];
chargeBuffer->upload(c); chargeBuffer.upload(c);
} }
void* args[] = {&chargeBuffer->getDevicePointer(), &posq->getDevicePointer(), &atomIndexDevice->getDevicePointer(), &numAtoms}; void* args[] = {&chargeBuffer.getDevicePointer(), &posq.getDevicePointer(), &atomIndexDevice.getDevicePointer(), &numAtoms};
executeKernel(setChargesKernel, args, numAtoms); executeKernel(setChargesKernel, args, numAtoms);
} }
...@@ -1178,16 +1159,16 @@ bool CudaContext::invalidateMolecules(CudaForceInfo* force) { ...@@ -1178,16 +1159,16 @@ bool CudaContext::invalidateMolecules(CudaForceInfo* force) {
vector<double4> newPosq(paddedNumAtoms, make_double4(0, 0, 0, 0)); vector<double4> newPosq(paddedNumAtoms, make_double4(0, 0, 0, 0));
vector<double4> oldVelm(paddedNumAtoms); vector<double4> oldVelm(paddedNumAtoms);
vector<double4> newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0)); vector<double4> newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0));
posq->download(oldPosq); posq.download(oldPosq);
velm->download(oldVelm); velm.download(oldVelm);
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i]; int index = atomIndex[i];
newPosq[index] = oldPosq[i]; newPosq[index] = oldPosq[i];
newVelm[index] = oldVelm[i]; newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i]; newCellOffsets[index] = posCellOffsets[i];
} }
posq->upload(newPosq); posq.upload(newPosq);
velm->upload(newVelm); velm.upload(newVelm);
} }
else if (useMixedPrecision) { else if (useMixedPrecision) {
vector<float4> oldPosq(paddedNumAtoms); vector<float4> oldPosq(paddedNumAtoms);
...@@ -1196,8 +1177,8 @@ bool CudaContext::invalidateMolecules(CudaForceInfo* force) { ...@@ -1196,8 +1177,8 @@ bool CudaContext::invalidateMolecules(CudaForceInfo* force) {
vector<float4> newPosqCorrection(paddedNumAtoms, make_float4(0, 0, 0, 0)); vector<float4> newPosqCorrection(paddedNumAtoms, make_float4(0, 0, 0, 0));
vector<double4> oldVelm(paddedNumAtoms); vector<double4> oldVelm(paddedNumAtoms);
vector<double4> newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0)); vector<double4> newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0));
posq->download(oldPosq); posq.download(oldPosq);
velm->download(oldVelm); velm.download(oldVelm);
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i]; int index = atomIndex[i];
newPosq[index] = oldPosq[i]; newPosq[index] = oldPosq[i];
...@@ -1205,31 +1186,31 @@ bool CudaContext::invalidateMolecules(CudaForceInfo* force) { ...@@ -1205,31 +1186,31 @@ bool CudaContext::invalidateMolecules(CudaForceInfo* force) {
newVelm[index] = oldVelm[i]; newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i]; newCellOffsets[index] = posCellOffsets[i];
} }
posq->upload(newPosq); posq.upload(newPosq);
posqCorrection->upload(newPosqCorrection); posqCorrection.upload(newPosqCorrection);
velm->upload(newVelm); velm.upload(newVelm);
} }
else { else {
vector<float4> oldPosq(paddedNumAtoms); vector<float4> oldPosq(paddedNumAtoms);
vector<float4> newPosq(paddedNumAtoms, make_float4(0, 0, 0, 0)); vector<float4> newPosq(paddedNumAtoms, make_float4(0, 0, 0, 0));
vector<float4> oldVelm(paddedNumAtoms); vector<float4> oldVelm(paddedNumAtoms);
vector<float4> newVelm(paddedNumAtoms, make_float4(0, 0, 0, 0)); vector<float4> newVelm(paddedNumAtoms, make_float4(0, 0, 0, 0));
posq->download(oldPosq); posq.download(oldPosq);
velm->download(oldVelm); velm.download(oldVelm);
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i]; int index = atomIndex[i];
newPosq[index] = oldPosq[i]; newPosq[index] = oldPosq[i];
newVelm[index] = oldVelm[i]; newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i]; newCellOffsets[index] = posCellOffsets[i];
} }
posq->upload(newPosq); posq.upload(newPosq);
velm->upload(newVelm); velm.upload(newVelm);
} }
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
atomIndex[i] = i; atomIndex[i] = i;
posCellOffsets[i] = newCellOffsets[i]; posCellOffsets[i] = newCellOffsets[i];
} }
atomIndexDevice->upload(atomIndex); atomIndexDevice.upload(atomIndex);
findMoleculeGroups(); findMoleculeGroups();
for (auto listener : reorderListeners) for (auto listener : reorderListeners)
listener->execute(); listener->execute();
...@@ -1262,10 +1243,10 @@ void CudaContext::reorderAtomsImpl() { ...@@ -1262,10 +1243,10 @@ void CudaContext::reorderAtomsImpl() {
vector<Real4> oldPosqCorrection(paddedNumAtoms, padding); vector<Real4> oldPosqCorrection(paddedNumAtoms, padding);
Mixed4 paddingMixed = {0, 0, 0, 0}; Mixed4 paddingMixed = {0, 0, 0, 0};
vector<Mixed4> oldVelm(paddedNumAtoms, paddingMixed); vector<Mixed4> oldVelm(paddedNumAtoms, paddingMixed);
posq->download(oldPosq); posq.download(oldPosq);
velm->download(oldVelm); velm.download(oldVelm);
if (useMixedPrecision) if (useMixedPrecision)
posqCorrection->download(oldPosqCorrection); posqCorrection.download(oldPosqCorrection);
Real minx = oldPosq[0].x, maxx = oldPosq[0].x; Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
Real miny = oldPosq[0].y, maxy = oldPosq[0].y; Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
Real minz = oldPosq[0].z, maxz = oldPosq[0].z; Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
...@@ -1409,11 +1390,11 @@ void CudaContext::reorderAtomsImpl() { ...@@ -1409,11 +1390,11 @@ void CudaContext::reorderAtomsImpl() {
atomIndex[i] = originalIndex[i]; atomIndex[i] = originalIndex[i];
posCellOffsets[i] = newCellOffsets[i]; posCellOffsets[i] = newCellOffsets[i];
} }
posq->upload(newPosq); posq.upload(newPosq);
if (useMixedPrecision) if (useMixedPrecision)
posqCorrection->upload(newPosqCorrection); posqCorrection.upload(newPosqCorrection);
velm->upload(newVelm); velm.upload(newVelm);
atomIndexDevice->upload(atomIndex); atomIndexDevice.upload(atomIndex);
for (auto listener : reorderListeners) for (auto listener : reorderListeners)
listener->execute(); listener->execute();
} }
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2017 Stanford University and the Authors. * * Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -98,30 +98,24 @@ struct CudaIntegrationUtilities::ConstraintOrderer : public binary_function<int, ...@@ -98,30 +98,24 @@ struct CudaIntegrationUtilities::ConstraintOrderer : public binary_function<int,
}; };
CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const System& system) : context(context), CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const System& system) : context(context),
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL), randomPos(0) {
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL), ccmaConvergedMemory(NULL),
vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsIndex(NULL), vsiteLocalCoordsAtoms(NULL),
vsiteLocalCoordsWeights(NULL), vsiteLocalCoordsPos(NULL), vsiteLocalCoordsStartIndex(NULL) {
// Create workspace arrays. // Create workspace arrays.
lastStepSize = make_double2(0.0, 0.0); lastStepSize = make_double2(0.0, 0.0);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) { if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
posDelta = CudaArray::create<double4>(context, context.getPaddedNumAtoms(), "posDelta"); posDelta.initialize<double4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0)); vector<double4> deltas(posDelta.getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
posDelta->upload(deltas); posDelta.upload(deltas);
stepSize = CudaArray::create<double2>(context, 1, "stepSize"); stepSize.initialize<double2>(context, 1, "stepSize");
stepSize->upload(&lastStepSize); stepSize.upload(&lastStepSize);
} }
else { else {
posDelta = CudaArray::create<float4>(context, context.getPaddedNumAtoms(), "posDelta"); posDelta.initialize<float4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<float4> deltas(posDelta->getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f)); vector<float4> deltas(posDelta.getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f));
posDelta->upload(deltas); posDelta.upload(deltas);
stepSize = CudaArray::create<float2>(context, 1, "stepSize"); stepSize.initialize<float2>(context, 1, "stepSize");
float2 lastStepSizeFloat = make_float2(0.0f, 0.0f); float2 lastStepSizeFloat = make_float2(0.0f, 0.0f);
stepSize->upload(&lastStepSizeFloat); stepSize.upload(&lastStepSizeFloat);
} }
// Record the set of constraints and how many constraints each atom is involved in. // Record the set of constraints and how many constraints each atom is involved in.
...@@ -208,10 +202,10 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -208,10 +202,10 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
isShakeAtom[atom3] = true; isShakeAtom[atom3] = true;
} }
if (atoms.size() > 0) { if (atoms.size() > 0) {
settleAtoms = CudaArray::create<int4>(context, atoms.size(), "settleAtoms"); settleAtoms.initialize<int4>(context, atoms.size(), "settleAtoms");
settleParams = CudaArray::create<float2>(context, params.size(), "settleParams"); settleParams.initialize<float2>(context, params.size(), "settleParams");
settleAtoms->upload(atoms); settleAtoms.upload(atoms);
settleParams->upload(params); settleParams.upload(params);
} }
} }
...@@ -291,10 +285,10 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -291,10 +285,10 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
isShakeAtom[cluster.peripheralID[2]] = true; isShakeAtom[cluster.peripheralID[2]] = true;
++index; ++index;
} }
shakeAtoms = CudaArray::create<int4>(context, atoms.size(), "shakeAtoms"); shakeAtoms.initialize<int4>(context, atoms.size(), "shakeAtoms");
shakeParams = CudaArray::create<float4>(context, params.size(), "shakeParams"); shakeParams.initialize<float4>(context, params.size(), "shakeParams");
shakeAtoms->upload(atoms); shakeAtoms.upload(atoms);
shakeParams->upload(params); shakeParams.upload(params);
} }
// Find connected constraints for CCMA. // Find connected constraints for CCMA.
...@@ -371,26 +365,26 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -371,26 +365,26 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
// Record the CCMA data structures. // Record the CCMA data structures.
ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms"); ccmaAtoms.initialize<int2>(context, numCCMA, "CcmaAtoms");
ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints"); ccmaAtomConstraints.initialize<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex"); ccmaNumAtomConstraints.initialize<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn"); ccmaConstraintMatrixColumn.initialize<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged = CudaArray::create<int>(context, 2, "ccmaConverged"); ccmaConverged.initialize<int>(context, 2, "ccmaConverged");
CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory"); CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory"); CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
vector<int2> atomsVec(ccmaAtoms->getSize()); vector<int2> atomsVec(ccmaAtoms.getSize());
vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize()); vector<int> atomConstraintsVec(ccmaAtomConstraints.getSize());
vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize()); vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints.getSize());
vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn->getSize()); vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn.getSize());
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) { if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
ccmaDistance = CudaArray::create<double4>(context, numCCMA, "CcmaDistance"); ccmaDistance.initialize<double4>(context, numCCMA, "CcmaDistance");
ccmaDelta1 = CudaArray::create<double>(context, numCCMA, "CcmaDelta1"); ccmaDelta1.initialize<double>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = CudaArray::create<double>(context, numCCMA, "CcmaDelta2"); ccmaDelta2.initialize<double>(context, numCCMA, "CcmaDelta2");
ccmaReducedMass = CudaArray::create<double>(context, numCCMA, "CcmaReducedMass"); ccmaReducedMass.initialize<double>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue = CudaArray::create<double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue"); ccmaConstraintMatrixValue.initialize<double>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<double4> distanceVec(ccmaDistance->getSize()); vector<double4> distanceVec(ccmaDistance.getSize());
vector<double> reducedMassVec(ccmaReducedMass->getSize()); vector<double> reducedMassVec(ccmaReducedMass.getSize());
vector<double> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize()); vector<double> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
for (int i = 0; i < numCCMA; i++) { for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i]; int index = constraintOrder[i];
int c = ccmaConstraints[index]; int c = ccmaConstraints[index];
...@@ -404,19 +398,19 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -404,19 +398,19 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
} }
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA; constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
} }
ccmaDistance->upload(distanceVec); ccmaDistance.upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec); ccmaReducedMass.upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec); ccmaConstraintMatrixValue.upload(constraintMatrixValueVec);
} }
else { else {
ccmaDistance = CudaArray::create<float4>(context, numCCMA, "CcmaDistance"); ccmaDistance.initialize<float4>(context, numCCMA, "CcmaDistance");
ccmaDelta1 = CudaArray::create<float>(context, numCCMA, "CcmaDelta1"); ccmaDelta1.initialize<float>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = CudaArray::create<float>(context, numCCMA, "CcmaDelta2"); ccmaDelta2.initialize<float>(context, numCCMA, "CcmaDelta2");
ccmaReducedMass = CudaArray::create<float>(context, numCCMA, "CcmaReducedMass"); ccmaReducedMass.initialize<float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixValue = CudaArray::create<float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue"); ccmaConstraintMatrixValue.initialize<float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
vector<float4> distanceVec(ccmaDistance->getSize()); vector<float4> distanceVec(ccmaDistance.getSize());
vector<float> reducedMassVec(ccmaReducedMass->getSize()); vector<float> reducedMassVec(ccmaReducedMass.getSize());
vector<float> constraintMatrixValueVec(ccmaConstraintMatrixValue->getSize()); vector<float> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
for (int i = 0; i < numCCMA; i++) { for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i]; int index = constraintOrder[i];
int c = ccmaConstraints[index]; int c = ccmaConstraints[index];
...@@ -430,9 +424,9 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -430,9 +424,9 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
} }
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA; constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
} }
ccmaDistance->upload(distanceVec); ccmaDistance.upload(distanceVec);
ccmaReducedMass->upload(reducedMassVec); ccmaReducedMass.upload(reducedMassVec);
ccmaConstraintMatrixValue->upload(constraintMatrixValueVec); ccmaConstraintMatrixValue.upload(constraintMatrixValueVec);
} }
for (unsigned int i = 0; i < atomConstraints.size(); i++) { for (unsigned int i = 0; i < atomConstraints.size(); i++) {
numAtomConstraintsVec[i] = atomConstraints[i].size(); numAtomConstraintsVec[i] = atomConstraints[i].size();
...@@ -441,10 +435,10 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -441,10 +435,10 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1); atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
} }
} }
ccmaAtoms->upload(atomsVec); ccmaAtoms.upload(atomsVec);
ccmaAtomConstraints->upload(atomConstraintsVec); ccmaAtomConstraints.upload(atomConstraintsVec);
ccmaNumAtomConstraints->upload(numAtomConstraintsVec); ccmaNumAtomConstraints.upload(numAtomConstraintsVec);
ccmaConstraintMatrixColumn->upload(constraintMatrixColumnVec); ccmaConstraintMatrixColumn.upload(constraintMatrixColumnVec);
} }
// Build the list of virtual sites. // Build the list of virtual sites.
...@@ -510,73 +504,73 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -510,73 +504,73 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
int num3Avg = vsite3AvgAtomVec.size(); int num3Avg = vsite3AvgAtomVec.size();
int numOutOfPlane = vsiteOutOfPlaneAtomVec.size(); int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
int numLocalCoords = vsiteLocalCoordsPosVec.size(); int numLocalCoords = vsiteLocalCoordsPosVec.size();
vsite2AvgAtoms = CudaArray::create<int4>(context, max(1, num2Avg), "vsite2AvgAtoms"); vsite2AvgAtoms.initialize<int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
vsite3AvgAtoms = CudaArray::create<int4>(context, max(1, num3Avg), "vsite3AvgAtoms"); vsite3AvgAtoms.initialize<int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
vsiteOutOfPlaneAtoms = CudaArray::create<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms"); vsiteOutOfPlaneAtoms.initialize<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
vsiteLocalCoordsIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex"); vsiteLocalCoordsIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
vsiteLocalCoordsAtoms = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms"); vsiteLocalCoordsAtoms.initialize<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
vsiteLocalCoordsStartIndex = CudaArray::create<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex"); vsiteLocalCoordsStartIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
if (num2Avg > 0) if (num2Avg > 0)
vsite2AvgAtoms->upload(vsite2AvgAtomVec); vsite2AvgAtoms.upload(vsite2AvgAtomVec);
if (num3Avg > 0) if (num3Avg > 0)
vsite3AvgAtoms->upload(vsite3AvgAtomVec); vsite3AvgAtoms.upload(vsite3AvgAtomVec);
if (numOutOfPlane > 0) if (numOutOfPlane > 0)
vsiteOutOfPlaneAtoms->upload(vsiteOutOfPlaneAtomVec); vsiteOutOfPlaneAtoms.upload(vsiteOutOfPlaneAtomVec);
if (numLocalCoords > 0) { if (numLocalCoords > 0) {
vsiteLocalCoordsIndex->upload(vsiteLocalCoordsIndexVec); vsiteLocalCoordsIndex.upload(vsiteLocalCoordsIndexVec);
vsiteLocalCoordsAtoms->upload(vsiteLocalCoordsAtomVec); vsiteLocalCoordsAtoms.upload(vsiteLocalCoordsAtomVec);
vsiteLocalCoordsStartIndex->upload(vsiteLocalCoordsStartVec); vsiteLocalCoordsStartIndex.upload(vsiteLocalCoordsStartVec);
} }
if (context.getUseDoublePrecision()) { if (context.getUseDoublePrecision()) {
vsite2AvgWeights = CudaArray::create<double2>(context, max(1, num2Avg), "vsite2AvgWeights"); vsite2AvgWeights.initialize<double2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = CudaArray::create<double4>(context, max(1, num3Avg), "vsite3AvgWeights"); vsite3AvgWeights.initialize<double4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = CudaArray::create<double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights"); vsiteOutOfPlaneWeights.initialize<double4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
vsiteLocalCoordsWeights = CudaArray::create<double>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights"); vsiteLocalCoordsWeights.initialize<double>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
vsiteLocalCoordsPos = CudaArray::create<double4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos"); vsiteLocalCoordsPos.initialize<double4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
if (num2Avg > 0) if (num2Avg > 0)
vsite2AvgWeights->upload(vsite2AvgWeightVec); vsite2AvgWeights.upload(vsite2AvgWeightVec);
if (num3Avg > 0) if (num3Avg > 0)
vsite3AvgWeights->upload(vsite3AvgWeightVec); vsite3AvgWeights.upload(vsite3AvgWeightVec);
if (numOutOfPlane > 0) if (numOutOfPlane > 0)
vsiteOutOfPlaneWeights->upload(vsiteOutOfPlaneWeightVec); vsiteOutOfPlaneWeights.upload(vsiteOutOfPlaneWeightVec);
if (numLocalCoords > 0) { if (numLocalCoords > 0) {
vsiteLocalCoordsWeights->upload(vsiteLocalCoordsWeightVec); vsiteLocalCoordsWeights.upload(vsiteLocalCoordsWeightVec);
vsiteLocalCoordsPos->upload(vsiteLocalCoordsPosVec); vsiteLocalCoordsPos.upload(vsiteLocalCoordsPosVec);
} }
} }
else { else {
vsite2AvgWeights = CudaArray::create<float2>(context, max(1, num2Avg), "vsite2AvgWeights"); vsite2AvgWeights.initialize<float2>(context, max(1, num2Avg), "vsite2AvgWeights");
vsite3AvgWeights = CudaArray::create<float4>(context, max(1, num3Avg), "vsite3AvgWeights"); vsite3AvgWeights.initialize<float4>(context, max(1, num3Avg), "vsite3AvgWeights");
vsiteOutOfPlaneWeights = CudaArray::create<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights"); vsiteOutOfPlaneWeights.initialize<float4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneWeights");
vsiteLocalCoordsWeights = CudaArray::create<float>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights"); vsiteLocalCoordsWeights.initialize<float>(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), "vsiteLocalCoordsWeights");
vsiteLocalCoordsPos = CudaArray::create<float4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos"); vsiteLocalCoordsPos.initialize<float4>(context, max(1, (int) vsiteLocalCoordsPosVec.size()), "vsiteLocalCoordsPos");
if (num2Avg > 0) { if (num2Avg > 0) {
vector<float2> floatWeights(num2Avg); vector<float2> floatWeights(num2Avg);
for (int i = 0; i < num2Avg; i++) for (int i = 0; i < num2Avg; i++)
floatWeights[i] = make_float2((float) vsite2AvgWeightVec[i].x, (float) vsite2AvgWeightVec[i].y); floatWeights[i] = make_float2((float) vsite2AvgWeightVec[i].x, (float) vsite2AvgWeightVec[i].y);
vsite2AvgWeights->upload(floatWeights); vsite2AvgWeights.upload(floatWeights);
} }
if (num3Avg > 0) { if (num3Avg > 0) {
vector<float4> floatWeights(num3Avg); vector<float4> floatWeights(num3Avg);
for (int i = 0; i < num3Avg; i++) for (int i = 0; i < num3Avg; i++)
floatWeights[i] = make_float4((float) vsite3AvgWeightVec[i].x, (float) vsite3AvgWeightVec[i].y, (float) vsite3AvgWeightVec[i].z, 0.0f); floatWeights[i] = make_float4((float) vsite3AvgWeightVec[i].x, (float) vsite3AvgWeightVec[i].y, (float) vsite3AvgWeightVec[i].z, 0.0f);
vsite3AvgWeights->upload(floatWeights); vsite3AvgWeights.upload(floatWeights);
} }
if (numOutOfPlane > 0) { if (numOutOfPlane > 0) {
vector<float4> floatWeights(numOutOfPlane); vector<float4> floatWeights(numOutOfPlane);
for (int i = 0; i < numOutOfPlane; i++) for (int i = 0; i < numOutOfPlane; i++)
floatWeights[i] = make_float4((float) vsiteOutOfPlaneWeightVec[i].x, (float) vsiteOutOfPlaneWeightVec[i].y, (float) vsiteOutOfPlaneWeightVec[i].z, 0.0f); floatWeights[i] = make_float4((float) vsiteOutOfPlaneWeightVec[i].x, (float) vsiteOutOfPlaneWeightVec[i].y, (float) vsiteOutOfPlaneWeightVec[i].z, 0.0f);
vsiteOutOfPlaneWeights->upload(floatWeights); vsiteOutOfPlaneWeights.upload(floatWeights);
} }
if (numLocalCoords > 0) { if (numLocalCoords > 0) {
vector<float> floatWeights(vsiteLocalCoordsWeightVec.size()); vector<float> floatWeights(vsiteLocalCoordsWeightVec.size());
for (int i = 0; i < (int) vsiteLocalCoordsWeightVec.size(); i++) for (int i = 0; i < (int) vsiteLocalCoordsWeightVec.size(); i++)
floatWeights[i] = (float) vsiteLocalCoordsWeightVec[i]; floatWeights[i] = (float) vsiteLocalCoordsWeightVec[i];
vsiteLocalCoordsWeights->upload(floatWeights); vsiteLocalCoordsWeights.upload(floatWeights);
vector<float4> floatPos(vsiteLocalCoordsPosVec.size()); vector<float4> floatPos(vsiteLocalCoordsPosVec.size());
for (int i = 0; i < (int) vsiteLocalCoordsPosVec.size(); i++) for (int i = 0; i < (int) vsiteLocalCoordsPosVec.size(); i++)
floatPos[i] = make_float4((float) vsiteLocalCoordsPosVec[i].x, (float) vsiteLocalCoordsPosVec[i].y, (float) vsiteLocalCoordsPosVec[i].z, 0.0f); floatPos[i] = make_float4((float) vsiteLocalCoordsPosVec[i].x, (float) vsiteLocalCoordsPosVec[i].y, (float) vsiteLocalCoordsPosVec[i].z, 0.0f);
vsiteLocalCoordsPos->upload(floatPos); vsiteLocalCoordsPos.upload(floatPos);
} }
} }
...@@ -610,86 +604,28 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -610,86 +604,28 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
CudaIntegrationUtilities::~CudaIntegrationUtilities() { CudaIntegrationUtilities::~CudaIntegrationUtilities() {
context.setAsCurrent(); context.setAsCurrent();
if (posDelta != NULL)
delete posDelta;
if (settleAtoms != NULL)
delete settleAtoms;
if (settleParams != NULL)
delete settleParams;
if (shakeAtoms != NULL)
delete shakeAtoms;
if (shakeParams != NULL)
delete shakeParams;
if (random != NULL)
delete random;
if (randomSeed != NULL)
delete randomSeed;
if (stepSize != NULL)
delete stepSize;
if (ccmaAtoms != NULL)
delete ccmaAtoms;
if (ccmaDistance != NULL)
delete ccmaDistance;
if (ccmaReducedMass != NULL)
delete ccmaReducedMass;
if (ccmaAtomConstraints != NULL)
delete ccmaAtomConstraints;
if (ccmaNumAtomConstraints != NULL)
delete ccmaNumAtomConstraints;
if (ccmaConstraintMatrixColumn != NULL)
delete ccmaConstraintMatrixColumn;
if (ccmaConstraintMatrixValue != NULL)
delete ccmaConstraintMatrixValue;
if (ccmaDelta1 != NULL)
delete ccmaDelta1;
if (ccmaDelta2 != NULL)
delete ccmaDelta2;
if (ccmaConverged != NULL)
delete ccmaConverged;
if (ccmaConvergedMemory != NULL) if (ccmaConvergedMemory != NULL)
cuMemFreeHost(ccmaConvergedMemory); cuMemFreeHost(ccmaConvergedMemory);
if (vsite2AvgAtoms != NULL)
delete vsite2AvgAtoms;
if (vsite2AvgWeights != NULL)
delete vsite2AvgWeights;
if (vsite3AvgAtoms != NULL)
delete vsite3AvgAtoms;
if (vsite3AvgWeights != NULL)
delete vsite3AvgWeights;
if (vsiteOutOfPlaneAtoms != NULL)
delete vsiteOutOfPlaneAtoms;
if (vsiteOutOfPlaneWeights != NULL)
delete vsiteOutOfPlaneWeights;
if (vsiteLocalCoordsIndex != NULL)
delete vsiteLocalCoordsIndex;
if (vsiteLocalCoordsAtoms != NULL)
delete vsiteLocalCoordsAtoms;
if (vsiteLocalCoordsWeights != NULL)
delete vsiteLocalCoordsWeights;
if (vsiteLocalCoordsPos != NULL)
delete vsiteLocalCoordsPos;
if (vsiteLocalCoordsStartIndex != NULL)
delete vsiteLocalCoordsStartIndex;
} }
void CudaIntegrationUtilities::setNextStepSize(double size) { void CudaIntegrationUtilities::setNextStepSize(double size) {
if (size != lastStepSize.x || size != lastStepSize.y) { if (size != lastStepSize.x || size != lastStepSize.y) {
lastStepSize = make_double2(size, size); lastStepSize = make_double2(size, size);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
stepSize->upload(&lastStepSize); stepSize.upload(&lastStepSize);
else { else {
float2 lastStepSizeFloat = make_float2((float) size, (float) size); float2 lastStepSizeFloat = make_float2((float) size, (float) size);
stepSize->upload(&lastStepSizeFloat); stepSize.upload(&lastStepSizeFloat);
} }
} }
} }
double CudaIntegrationUtilities::getLastStepSize() { double CudaIntegrationUtilities::getLastStepSize() {
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
stepSize->download(&lastStepSize); stepSize.download(&lastStepSize);
else { else {
float2 lastStepSizeFloat; float2 lastStepSizeFloat;
stepSize->download(&lastStepSizeFloat); stepSize.download(&lastStepSizeFloat);
lastStepSize = make_double2(lastStepSizeFloat.x, lastStepSizeFloat.y); lastStepSize = make_double2(lastStepSizeFloat.x, lastStepSizeFloat.y);
} }
return lastStepSize.y; return lastStepSize.y;
...@@ -718,41 +654,41 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double ...@@ -718,41 +654,41 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
float floatTol = (float) tol; float floatTol = (float) tol;
void* tolPointer = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? (void*) &tol : (void*) &floatTol); void* tolPointer = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? (void*) &tol : (void*) &floatTol);
CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
if (settleAtoms != NULL) { if (settleAtoms.isInitialized()) {
int numClusters = settleAtoms->getSize(); int numClusters = settleAtoms.getSize();
void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection, void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
&posDelta->getDevicePointer(), &context.getVelm().getDevicePointer(), &posDelta.getDevicePointer(), &context.getVelm().getDevicePointer(),
&settleAtoms->getDevicePointer(), &settleParams->getDevicePointer()}; &settleAtoms.getDevicePointer(), &settleParams.getDevicePointer()};
context.executeKernel(settleKernel, args, settleAtoms->getSize()); context.executeKernel(settleKernel, args, settleAtoms.getSize());
} }
if (shakeAtoms != NULL) { if (shakeAtoms.isInitialized()) {
int numClusters = shakeAtoms->getSize(); int numClusters = shakeAtoms.getSize();
void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection, void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(), constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
&shakeAtoms->getDevicePointer(), &shakeParams->getDevicePointer()}; &shakeAtoms.getDevicePointer(), &shakeParams.getDevicePointer()};
context.executeKernel(shakeKernel, args, shakeAtoms->getSize()); context.executeKernel(shakeKernel, args, shakeAtoms.getSize());
} }
if (ccmaAtoms != NULL) { if (ccmaAtoms.isInitialized()) {
void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged->getDevicePointer()}; void* directionsArgs[] = {&ccmaAtoms.getDevicePointer(), &ccmaDistance.getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged.getDevicePointer()};
context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize()); context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms.getSize());
int i; int i;
void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), void* forceArgs[] = {&ccmaAtoms.getDevicePointer(), &ccmaDistance.getDevicePointer(),
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(), constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
&ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(), &ccmaReducedMass.getDevicePointer(), &ccmaDelta1.getDevicePointer(), &ccmaConverged.getDevicePointer(),
&ccmaConvergedDeviceMemory, tolPointer, &i}; &ccmaConvergedDeviceMemory, tolPointer, &i};
void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(), void* multiplyArgs[] = {&ccmaDelta1.getDevicePointer(), &ccmaDelta2.getDevicePointer(),
&ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i}; &ccmaConstraintMatrixColumn.getDevicePointer(), &ccmaConstraintMatrixValue.getDevicePointer(), &ccmaConverged.getDevicePointer(), &i};
void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(), void* updateArgs[] = {&ccmaNumAtomConstraints.getDevicePointer(), &ccmaAtomConstraints.getDevicePointer(), &ccmaDistance.getDevicePointer(),
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(), constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
&context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(), &context.getVelm().getDevicePointer(), &ccmaDelta1.getDevicePointer(), &ccmaDelta2.getDevicePointer(),
&ccmaConverged->getDevicePointer(), &i}; &ccmaConverged.getDevicePointer(), &i};
const int checkInterval = 4; const int checkInterval = 4;
ccmaConvergedMemory[0] = 0; ccmaConvergedMemory[0] = 0;
for (i = 0; i < 150; i++) { for (i = 0; i < 150; i++) {
context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize()); context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms.getSize());
if ((i+1)%checkInterval == 0) if ((i+1)%checkInterval == 0)
CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA"); CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize()); context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms.getSize());
context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms()); context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
if ((i+1)%checkInterval == 0) { if ((i+1)%checkInterval == 0) {
CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA"); CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
...@@ -766,12 +702,12 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double ...@@ -766,12 +702,12 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
void CudaIntegrationUtilities::computeVirtualSites() { void CudaIntegrationUtilities::computeVirtualSites() {
if (numVsites > 0) { if (numVsites > 0) {
CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(), void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &vsite2AvgAtoms.getDevicePointer(), &vsite2AvgWeights.getDevicePointer(),
&vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(), &vsite3AvgAtoms.getDevicePointer(), &vsite3AvgWeights.getDevicePointer(),
&vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(), &vsiteOutOfPlaneAtoms.getDevicePointer(), &vsiteOutOfPlaneWeights.getDevicePointer(),
&vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(), &vsiteLocalCoordsIndex.getDevicePointer(), &vsiteLocalCoordsAtoms.getDevicePointer(),
&vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(), &vsiteLocalCoordsWeights.getDevicePointer(), &vsiteLocalCoordsPos.getDevicePointer(),
&vsiteLocalCoordsStartIndex->getDevicePointer()}; &vsiteLocalCoordsStartIndex.getDevicePointer()};
context.executeKernel(vsitePositionKernel, args, numVsites); context.executeKernel(vsitePositionKernel, args, numVsites);
} }
} }
...@@ -780,18 +716,18 @@ void CudaIntegrationUtilities::distributeForcesFromVirtualSites() { ...@@ -780,18 +716,18 @@ void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
if (numVsites > 0) { if (numVsites > 0) {
CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &context.getForce().getDevicePointer(), void* args[] = {&context.getPosq().getDevicePointer(), &posCorrection, &context.getForce().getDevicePointer(),
&vsite2AvgAtoms->getDevicePointer(), &vsite2AvgWeights->getDevicePointer(), &vsite2AvgAtoms.getDevicePointer(), &vsite2AvgWeights.getDevicePointer(),
&vsite3AvgAtoms->getDevicePointer(), &vsite3AvgWeights->getDevicePointer(), &vsite3AvgAtoms.getDevicePointer(), &vsite3AvgWeights.getDevicePointer(),
&vsiteOutOfPlaneAtoms->getDevicePointer(), &vsiteOutOfPlaneWeights->getDevicePointer(), &vsiteOutOfPlaneAtoms.getDevicePointer(), &vsiteOutOfPlaneWeights.getDevicePointer(),
&vsiteLocalCoordsIndex->getDevicePointer(), &vsiteLocalCoordsAtoms->getDevicePointer(), &vsiteLocalCoordsIndex.getDevicePointer(), &vsiteLocalCoordsAtoms.getDevicePointer(),
&vsiteLocalCoordsWeights->getDevicePointer(), &vsiteLocalCoordsPos->getDevicePointer(), &vsiteLocalCoordsWeights.getDevicePointer(), &vsiteLocalCoordsPos.getDevicePointer(),
&vsiteLocalCoordsStartIndex->getDevicePointer()}; &vsiteLocalCoordsStartIndex.getDevicePointer()};
context.executeKernel(vsiteForceKernel, args, numVsites); context.executeKernel(vsiteForceKernel, args, numVsites);
} }
} }
void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) { void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) {
if (random != NULL) { if (random.isInitialized()) {
if (randomNumberSeed != lastSeed) if (randomNumberSeed != lastSeed)
throw OpenMMException("CudaIntegrationUtilities::initRandomNumberGenerator(): Requested two different values for the random number seed"); throw OpenMMException("CudaIntegrationUtilities::initRandomNumberGenerator(): Requested two different values for the random number seed");
return; return;
...@@ -800,63 +736,61 @@ void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumb ...@@ -800,63 +736,61 @@ void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumb
// Create the random number arrays. // Create the random number arrays.
lastSeed = randomNumberSeed; lastSeed = randomNumberSeed;
random = CudaArray::create<float4>(context, 4*context.getPaddedNumAtoms(), "random"); random.initialize<float4>(context, 4*context.getPaddedNumAtoms(), "random");
randomSeed = CudaArray::create<int4>(context, context.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed"); randomSeed.initialize<int4>(context, context.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
randomPos = random->getSize(); randomPos = random.getSize();
// Use a quick and dirty RNG to pick seeds for the real random number generator. // Use a quick and dirty RNG to pick seeds for the real random number generator.
vector<int4> seed(randomSeed->getSize()); vector<int4> seed(randomSeed.getSize());
unsigned int r = randomNumberSeed; unsigned int r = randomNumberSeed;
if (r == 0) r = (unsigned int) osrngseed(); if (r == 0) r = (unsigned int) osrngseed();
for (int i = 0; i < randomSeed->getSize(); i++) { for (int i = 0; i < randomSeed.getSize(); i++) {
seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF; seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF; seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF; seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF; seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
} }
randomSeed->upload(seed); randomSeed.upload(seed);
} }
int CudaIntegrationUtilities::prepareRandomNumbers(int numValues) { int CudaIntegrationUtilities::prepareRandomNumbers(int numValues) {
if (randomPos+numValues <= random->getSize()) { if (randomPos+numValues <= random.getSize()) {
int oldPos = randomPos; int oldPos = randomPos;
randomPos += numValues; randomPos += numValues;
return oldPos; return oldPos;
} }
if (numValues > random->getSize()) { if (numValues > random.getSize())
delete random; random.resize(numValues);
random = CudaArray::create<float4>(context, numValues, "random"); int size = random.getSize();
} void* args[] = {&size, &random.getDevicePointer(), &randomSeed.getDevicePointer()};
int size = random->getSize(); context.executeKernel(randomKernel, args, random.getSize());
void* args[] = {&size, &random->getDevicePointer(), &randomSeed->getDevicePointer()};
context.executeKernel(randomKernel, args, random->getSize());
randomPos = numValues; randomPos = numValues;
return 0; return 0;
} }
void CudaIntegrationUtilities::createCheckpoint(ostream& stream) { void CudaIntegrationUtilities::createCheckpoint(ostream& stream) {
if(random == NULL) if (!random.isInitialized())
return; return;
stream.write((char*) &randomPos, sizeof(int)); stream.write((char*) &randomPos, sizeof(int));
vector<float4> randomVec; vector<float4> randomVec;
random->download(randomVec); random.download(randomVec);
stream.write((char*) &randomVec[0], sizeof(float4)*random->getSize()); stream.write((char*) &randomVec[0], sizeof(float4)*random.getSize());
vector<int4> randomSeedVec; vector<int4> randomSeedVec;
randomSeed->download(randomSeedVec); randomSeed.download(randomSeedVec);
stream.write((char*) &randomSeedVec[0], sizeof(int4)*randomSeed->getSize()); stream.write((char*) &randomSeedVec[0], sizeof(int4)*randomSeed.getSize());
} }
void CudaIntegrationUtilities::loadCheckpoint(istream& stream) { void CudaIntegrationUtilities::loadCheckpoint(istream& stream) {
if(random == NULL) if (!random.isInitialized())
return; return;
stream.read((char*) &randomPos, sizeof(int)); stream.read((char*) &randomPos, sizeof(int));
vector<float4> randomVec(random->getSize()); vector<float4> randomVec(random.getSize());
stream.read((char*) &randomVec[0], sizeof(float4)*random->getSize()); stream.read((char*) &randomVec[0], sizeof(float4)*random.getSize());
random->upload(randomVec); random.upload(randomVec);
vector<int4> randomSeedVec(randomSeed->getSize()); vector<int4> randomSeedVec(randomSeed.getSize());
stream.read((char*) &randomSeedVec[0], sizeof(int4)*randomSeed->getSize()); stream.read((char*) &randomSeedVec[0], sizeof(int4)*randomSeed.getSize());
randomSeed->upload(randomSeedVec); randomSeed.upload(randomSeedVec);
} }
double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) { double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) {
...@@ -867,7 +801,7 @@ double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) { ...@@ -867,7 +801,7 @@ double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) {
// Copy the velocities into the posDelta array while we temporarily modify them. // Copy the velocities into the posDelta array while we temporarily modify them.
context.getVelm().copyTo(*posDelta); context.getVelm().copyTo(posDelta);
// Apply the time shift. // Apply the time shift.
...@@ -901,6 +835,6 @@ double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) { ...@@ -901,6 +835,6 @@ double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) {
// Restore the velocities. // Restore the velocities.
if (timeShift != 0) if (timeShift != 0)
posDelta->copyTo(context.getVelm()); posDelta.copyTo(context.getVelm());
return 0.5*energy; return 0.5*energy;
} }
...@@ -63,10 +63,7 @@ private: ...@@ -63,10 +63,7 @@ private:
}; };
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true), CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true) {
interactionCount(NULL), singlePairs(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0),
canUsePairList(true) {
// Decide how many thread blocks to use. // Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities"; string errorMessage = "Error initializing nonbonded utilities";
...@@ -79,36 +76,6 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c ...@@ -79,36 +76,6 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
} }
CudaNonbondedUtilities::~CudaNonbondedUtilities() { CudaNonbondedUtilities::~CudaNonbondedUtilities() {
if (exclusionIndices != NULL)
delete exclusionIndices;
if (exclusionRowIndices != NULL)
delete exclusionRowIndices;
if (exclusionTiles != NULL)
delete exclusionTiles;
if (exclusions != NULL)
delete exclusions;
if (interactingTiles != NULL)
delete interactingTiles;
if (interactingAtoms != NULL)
delete interactingAtoms;
if (interactionCount != NULL)
delete interactionCount;
if (singlePairs != NULL)
delete singlePairs;
if (blockCenter != NULL)
delete blockCenter;
if (blockBoundingBox != NULL)
delete blockBoundingBox;
if (sortedBlocks != NULL)
delete sortedBlocks;
if (sortedBlockCenter != NULL)
delete sortedBlockCenter;
if (sortedBlockBoundingBox != NULL)
delete sortedBlockBoundingBox;
if (oldPositions != NULL)
delete oldPositions;
if (rebuildNeighborList != NULL)
delete rebuildNeighborList;
if (blockSorter != NULL) if (blockSorter != NULL)
delete blockSorter; delete blockSorter;
if (pinnedCountBuffer != NULL) if (pinnedCountBuffer != NULL)
...@@ -220,8 +187,8 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -220,8 +187,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
exclusionTilesVec.push_back(make_ushort2((unsigned short) iter->first, (unsigned short) iter->second)); exclusionTilesVec.push_back(make_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareUshort2); sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareUshort2);
exclusionTiles = CudaArray::create<ushort2>(context, exclusionTilesVec.size(), "exclusionTiles"); exclusionTiles.initialize<ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles->upload(exclusionTilesVec); exclusionTiles.upload(exclusionTilesVec);
map<pair<int, int>, int> exclusionTileMap; map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTilesVec.size(); i++) { for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
ushort2 tile = exclusionTilesVec[i]; ushort2 tile = exclusionTilesVec[i];
...@@ -242,16 +209,16 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -242,16 +209,16 @@ void CudaNonbondedUtilities::initialize(const System& system) {
maxExclusions = 0; maxExclusions = 0;
for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++) for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size()); maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
exclusionIndices = CudaArray::create<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices"); exclusionIndices.initialize<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
exclusionRowIndices = CudaArray::create<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices"); exclusionRowIndices.initialize<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
exclusionIndices->upload(exclusionIndicesVec); exclusionIndices.upload(exclusionIndicesVec);
exclusionRowIndices->upload(exclusionRowIndicesVec); exclusionRowIndices.upload(exclusionRowIndicesVec);
// Record the exclusion data. // Record the exclusion data.
exclusions = CudaArray::create<tileflags>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions"); exclusions.initialize<tileflags>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
tileflags allFlags = (tileflags) -1; tileflags allFlags = (tileflags) -1;
vector<tileflags> exclusionVec(exclusions->getSize(), allFlags); vector<tileflags> exclusionVec(exclusions.getSize(), allFlags);
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) { for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/CudaContext::TileSize; int x = atom1/CudaContext::TileSize;
int offset1 = atom1-x*CudaContext::TileSize; int offset1 = atom1-x*CudaContext::TileSize;
...@@ -270,7 +237,7 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -270,7 +237,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
} }
} }
atomExclusions.clear(); // We won't use this again, so free the memory it used atomExclusions.clear(); // We won't use this again, so free the memory it used
exclusions->upload(exclusionVec); exclusions.upload(exclusionVec);
// Create data structures for the neighbor list. // Create data structures for the neighbor list.
...@@ -284,21 +251,21 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -284,21 +251,21 @@ void CudaNonbondedUtilities::initialize(const System& system) {
if (maxTiles < 1) if (maxTiles < 1)
maxTiles = 1; maxTiles = 1;
maxSinglePairs = 5*numAtoms; maxSinglePairs = 5*numAtoms;
interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles"); interactingTiles.initialize<int>(context, maxTiles, "interactingTiles");
interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms"); interactingAtoms.initialize<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
interactionCount = CudaArray::create<unsigned int>(context, 2, "interactionCount"); interactionCount.initialize<unsigned int>(context, 2, "interactionCount");
singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs"); singlePairs.initialize<int2>(context, maxSinglePairs, "singlePairs");
int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter"); blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox"); blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedBlocks = new CudaArray(context, numAtomBlocks, 2*elementSize, "sortedBlocks"); sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
sortedBlockCenter = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter"); sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox"); sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions"); oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList"); rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks); blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<unsigned int> count(2, 0); vector<unsigned int> count(2, 0);
interactionCount->upload(count); interactionCount.upload(count);
} }
// Record arguments for kernels. // Record arguments for kernels.
...@@ -306,24 +273,24 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -306,24 +273,24 @@ void CudaNonbondedUtilities::initialize(const System& system) {
forceArgs.push_back(&context.getForce().getDevicePointer()); forceArgs.push_back(&context.getForce().getDevicePointer());
forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer()); forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
forceArgs.push_back(&context.getPosq().getDevicePointer()); forceArgs.push_back(&context.getPosq().getDevicePointer());
forceArgs.push_back(&exclusions->getDevicePointer()); forceArgs.push_back(&exclusions.getDevicePointer());
forceArgs.push_back(&exclusionTiles->getDevicePointer()); forceArgs.push_back(&exclusionTiles.getDevicePointer());
forceArgs.push_back(&startTileIndex); forceArgs.push_back(&startTileIndex);
forceArgs.push_back(&numTiles); forceArgs.push_back(&numTiles);
if (useCutoff) { if (useCutoff) {
forceArgs.push_back(&interactingTiles->getDevicePointer()); forceArgs.push_back(&interactingTiles.getDevicePointer());
forceArgs.push_back(&interactionCount->getDevicePointer()); forceArgs.push_back(&interactionCount.getDevicePointer());
forceArgs.push_back(context.getPeriodicBoxSizePointer()); forceArgs.push_back(context.getPeriodicBoxSizePointer());
forceArgs.push_back(context.getInvPeriodicBoxSizePointer()); forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
forceArgs.push_back(context.getPeriodicBoxVecXPointer()); forceArgs.push_back(context.getPeriodicBoxVecXPointer());
forceArgs.push_back(context.getPeriodicBoxVecYPointer()); forceArgs.push_back(context.getPeriodicBoxVecYPointer());
forceArgs.push_back(context.getPeriodicBoxVecZPointer()); forceArgs.push_back(context.getPeriodicBoxVecZPointer());
forceArgs.push_back(&maxTiles); forceArgs.push_back(&maxTiles);
forceArgs.push_back(&blockCenter->getDevicePointer()); forceArgs.push_back(&blockCenter.getDevicePointer());
forceArgs.push_back(&blockBoundingBox->getDevicePointer()); forceArgs.push_back(&blockBoundingBox.getDevicePointer());
forceArgs.push_back(&interactingAtoms->getDevicePointer()); forceArgs.push_back(&interactingAtoms.getDevicePointer());
forceArgs.push_back(&maxSinglePairs); forceArgs.push_back(&maxSinglePairs);
forceArgs.push_back(&singlePairs->getDevicePointer()); forceArgs.push_back(&singlePairs.getDevicePointer());
} }
for (int i = 0; i < (int) parameters.size(); i++) for (int i = 0; i < (int) parameters.size(); i++)
forceArgs.push_back(&parameters[i].getMemory()); forceArgs.push_back(&parameters[i].getMemory());
...@@ -339,41 +306,41 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -339,41 +306,41 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findBlockBoundsArgs.push_back(context.getPeriodicBoxVecYPointer()); findBlockBoundsArgs.push_back(context.getPeriodicBoxVecYPointer());
findBlockBoundsArgs.push_back(context.getPeriodicBoxVecZPointer()); findBlockBoundsArgs.push_back(context.getPeriodicBoxVecZPointer());
findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer()); findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
findBlockBoundsArgs.push_back(&blockCenter->getDevicePointer()); findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer());
findBlockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer()); findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer());
findBlockBoundsArgs.push_back(&rebuildNeighborList->getDevicePointer()); findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer());
findBlockBoundsArgs.push_back(&sortedBlocks->getDevicePointer()); findBlockBoundsArgs.push_back(&sortedBlocks.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlocks->getDevicePointer()); sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer());
sortBoxDataArgs.push_back(&blockCenter->getDevicePointer()); sortBoxDataArgs.push_back(&blockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&blockBoundingBox->getDevicePointer()); sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockCenter->getDevicePointer()); sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockBoundingBox->getDevicePointer()); sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer()); sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
sortBoxDataArgs.push_back(&oldPositions->getDevicePointer()); sortBoxDataArgs.push_back(&oldPositions.getDevicePointer());
sortBoxDataArgs.push_back(&interactionCount->getDevicePointer()); sortBoxDataArgs.push_back(&interactionCount.getDevicePointer());
sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer()); sortBoxDataArgs.push_back(&rebuildNeighborList.getDevicePointer());
sortBoxDataArgs.push_back(&forceRebuildNeighborList); sortBoxDataArgs.push_back(&forceRebuildNeighborList);
findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer()); findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecYPointer()); findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecYPointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecZPointer()); findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecZPointer());
findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactionCount.getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactingTiles.getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactingAtoms.getDevicePointer());
findInteractingBlocksArgs.push_back(&singlePairs->getDevicePointer()); findInteractingBlocksArgs.push_back(&singlePairs.getDevicePointer());
findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer()); findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
findInteractingBlocksArgs.push_back(&maxTiles); findInteractingBlocksArgs.push_back(&maxTiles);
findInteractingBlocksArgs.push_back(&maxSinglePairs); findInteractingBlocksArgs.push_back(&maxSinglePairs);
findInteractingBlocksArgs.push_back(&startBlockIndex); findInteractingBlocksArgs.push_back(&startBlockIndex);
findInteractingBlocksArgs.push_back(&numBlocks); findInteractingBlocksArgs.push_back(&numBlocks);
findInteractingBlocksArgs.push_back(&sortedBlocks->getDevicePointer()); findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockCenter->getDevicePointer()); findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox->getDevicePointer()); findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
findInteractingBlocksArgs.push_back(&exclusionIndices->getDevicePointer()); findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&exclusionRowIndices->getDevicePointer()); findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&oldPositions->getDevicePointer()); findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
findInteractingBlocksArgs.push_back(&rebuildNeighborList->getDevicePointer()); findInteractingBlocksArgs.push_back(&rebuildNeighborList.getDevicePointer());
} }
} }
...@@ -406,12 +373,12 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) { ...@@ -406,12 +373,12 @@ void CudaNonbondedUtilities::prepareInteractions(int forceGroups) {
if (lastCutoff != kernels.cutoffDistance) if (lastCutoff != kernels.cutoffDistance)
forceRebuildNeighborList = true; forceRebuildNeighborList = true;
context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms()); context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
blockSorter->sort(*sortedBlocks); blockSorter->sort(sortedBlocks);
context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms()); context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256); context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
forceRebuildNeighborList = false; forceRebuildNeighborList = false;
lastCutoff = kernels.cutoffDistance; lastCutoff = kernels.cutoffDistance;
interactionCount->download(pinnedCountBuffer, false); interactionCount.download(pinnedCountBuffer, false);
cuEventRecord(downloadCountEvent, context.getCurrentStream()); cuEventRecord(downloadCountEvent, context.getCurrentStream());
} }
...@@ -445,27 +412,21 @@ bool CudaNonbondedUtilities::updateNeighborListSize() { ...@@ -445,27 +412,21 @@ bool CudaNonbondedUtilities::updateNeighborListSize() {
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2; int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
if (maxTiles > totalTiles) if (maxTiles > totalTiles)
maxTiles = totalTiles; maxTiles = totalTiles;
delete interactingTiles; interactingTiles.resize(maxTiles);
delete interactingAtoms; interactingAtoms.resize(CudaContext::TileSize*maxTiles);
interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails
interactingAtoms = NULL;
interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles");
interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
if (forceArgs.size() > 0) if (forceArgs.size() > 0)
forceArgs[7] = &interactingTiles->getDevicePointer(); forceArgs[7] = &interactingTiles.getDevicePointer();
findInteractingBlocksArgs[6] = &interactingTiles->getDevicePointer(); findInteractingBlocksArgs[6] = &interactingTiles.getDevicePointer();
if (forceArgs.size() > 0) if (forceArgs.size() > 0)
forceArgs[17] = &interactingAtoms->getDevicePointer(); forceArgs[17] = &interactingAtoms.getDevicePointer();
findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer(); findInteractingBlocksArgs[7] = &interactingAtoms.getDevicePointer();
} }
if (pinnedCountBuffer[1] > maxSinglePairs) { if (pinnedCountBuffer[1] > maxSinglePairs) {
maxSinglePairs = (int) (1.2*pinnedCountBuffer[1]); maxSinglePairs = (int) (1.2*pinnedCountBuffer[1]);
delete singlePairs; singlePairs.resize(maxSinglePairs);
singlePairs = NULL; // Avoid an error in the destructor if the following allocation fails
singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
if (forceArgs.size() > 0) if (forceArgs.size() > 0)
forceArgs[19] = &singlePairs->getDevicePointer(); forceArgs[19] = &singlePairs.getDevicePointer();
findInteractingBlocksArgs[8] = &singlePairs->getDevicePointer(); findInteractingBlocksArgs[8] = &singlePairs.getDevicePointer();
} }
forceRebuildNeighborList = true; forceRebuildNeighborList = true;
context.setForcesValid(false); context.setForcesValid(false);
...@@ -510,7 +471,7 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) { ...@@ -510,7 +471,7 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
defines["PADDING"] = context.doubleToString(padding); defines["PADDING"] = context.doubleToString(padding);
defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff); defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff); defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize()); defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());
if (usePeriodic) if (usePeriodic)
defines["USE_PERIODIC"] = "1"; defines["USE_PERIODIC"] = "1";
if (context.getBoxIsTriclinic()) if (context.getBoxIsTriclinic())
...@@ -735,7 +696,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -735,7 +696,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize); defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
int numExclusionTiles = exclusionTiles->getSize(); int numExclusionTiles = exclusionTiles.getSize();
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles); defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
int numContexts = context.getPlatformData().contexts.size(); int numContexts = context.getPlatformData().contexts.size();
int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts; int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. * * Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -31,8 +31,7 @@ ...@@ -31,8 +31,7 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), dataLength(length) {
dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
// Create kernels. // Create kernels.
map<string, string> replacements; map<string, string> replacements;
...@@ -76,26 +75,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) ...@@ -76,26 +75,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
// Create workspace arrays. // Create workspace arrays.
if (!isShortList) { if (!isShortList) {
dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange"); dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset"); bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement"); bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket"); offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
buckets = new CudaArray(context, length, trait->getDataSize(), "buckets"); buckets.initialize(context, length, trait->getDataSize(), "buckets");
} }
} }
CudaSort::~CudaSort() { CudaSort::~CudaSort() {
delete trait; delete trait;
if (dataRange != NULL)
delete dataRange;
if (bucketOfElement != NULL)
delete bucketOfElement;
if (offsetInBucket != NULL)
delete offsetInBucket;
if (bucketOffset != NULL)
delete bucketOffset;
if (buckets != NULL)
delete buckets;
} }
void CudaSort::sort(CudaArray& data) { void CudaSort::sort(CudaArray& data) {
...@@ -112,30 +101,30 @@ void CudaSort::sort(CudaArray& data) { ...@@ -112,30 +101,30 @@ void CudaSort::sort(CudaArray& data) {
else { else {
// Compute the range of data values. // Compute the range of data values.
unsigned int numBuckets = bucketOffset->getSize(); unsigned int numBuckets = bucketOffset.getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()}; void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize()); context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets. // Assign array elements to buckets.
void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(), void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(),
&bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()}; &bucketOffset.getDevicePointer(), &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize(), 128); context.executeKernel(assignElementsKernel, elementsArgs, data.getSize(), 128);
// Compute the position of each bucket. // Compute the position of each bucket.
void* computeArgs[] = {&numBuckets, &bucketOffset->getDevicePointer()}; void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int)); context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
// Copy the data into the buckets. // Copy the data into the buckets.
void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataLength, &bucketOffset->getDevicePointer(), void* copyArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(),
&bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()}; &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize()); context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
// Sort each bucket. // Sort each bucket.
void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()}; void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize()); context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
} }
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment