Commit 86aacbd8 authored by Peter Eastman
Browse files

Continuing to implement new CUDA platform: NonbondedForce

parent eb64fa2f
...@@ -18,7 +18,7 @@ IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug) ...@@ -18,7 +18,7 @@ IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug) ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME}) SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug) ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_LIBRARIES} ${PTHREADS_LIB}) TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_LIBRARIES} ${CUDA_CUFFT_LIBRARIES} ${PTHREADS_LIB})
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY") SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY")
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET}) INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
...@@ -66,7 +66,8 @@ void CudaBondedUtilities::addPrefixCode(const string& source) { ...@@ -66,7 +66,8 @@ void CudaBondedUtilities::addPrefixCode(const string& source) {
void CudaBondedUtilities::initialize(const System& system) { void CudaBondedUtilities::initialize(const System& system) {
int numForces = forceAtoms.size(); int numForces = forceAtoms.size();
if (numForces == 0) hasInteractions = (numForces > 0);
if (!hasInteractions)
return; return;
// Build the lists of atom indices. // Build the lists of atom indices.
...@@ -164,6 +165,8 @@ void CudaBondedUtilities::computeInteractions(int groups) { ...@@ -164,6 +165,8 @@ void CudaBondedUtilities::computeInteractions(int groups) {
for (int i = 0; i < (int) arguments.size(); i++) for (int i = 0; i < (int) arguments.size(); i++)
kernelArgs.push_back(&arguments[i]); kernelArgs.push_back(&arguments[i]);
} }
if (!hasInteractions)
return;
kernelArgs[3] = &groups; kernelArgs[3] = &groups;
context.executeKernel(kernel, &kernelArgs[0], maxBonds); context.executeKernel(kernel, &kernelArgs[0], maxBonds);
} }
...@@ -131,7 +131,7 @@ private: ...@@ -131,7 +131,7 @@ private:
std::vector<std::string> prefixCode; std::vector<std::string> prefixCode;
std::vector<void*> kernelArgs; std::vector<void*> kernelArgs;
int numForceBuffers, maxBonds; int numForceBuffers, maxBonds;
bool hasInitializedKernels; bool hasInitializedKernels, hasInteractions;
}; };
} // namespace OpenMM } // namespace OpenMM
......
...@@ -34,7 +34,7 @@ ...@@ -34,7 +34,7 @@
#include "CudaForceInfo.h" #include "CudaForceInfo.h"
#include "CudaIntegrationUtilities.h" #include "CudaIntegrationUtilities.h"
#include "CudaKernelSources.h" #include "CudaKernelSources.h"
//#include "CudaNonbondedUtilities.h" #include "CudaNonbondedUtilities.h"
#include "hilbert.h" #include "hilbert.h"
#include "openmm/OpenMMException.h" #include "openmm/OpenMMException.h"
#include "openmm/Platform.h" #include "openmm/Platform.h"
...@@ -68,7 +68,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -68,7 +68,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler), const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
velm(NULL), force(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL), expression(NULL), velm(NULL), force(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL), expression(NULL),
bonded(NULL), /*nonbonded(NULL),*/ thread(NULL) { bonded(NULL), nonbonded(NULL), thread(NULL) {
if (!hasInitializedCuda) { if (!hasInitializedCuda) {
CHECK_RESULT2(cuInit(0), "Error initializing CUDA"); CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
hasInitializedCuda = true; hasInitializedCuda = true;
...@@ -122,7 +122,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -122,7 +122,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
throw OpenMMException("No compatible CUDA device is available"); throw OpenMMException("No compatible CUDA device is available");
CHECK_RESULT(cuDeviceGet(&device, deviceIndex)); CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
this->deviceIndex = deviceIndex; this->deviceIndex = deviceIndex;
compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
defaultOptimizationOptions = "--use_fast_math"; defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST; unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync) if (useBlockingSync)
...@@ -139,13 +138,18 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -139,13 +138,18 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
int numThreadBlocksPerComputeUnit = 6; int numThreadBlocksPerComputeUnit = 6;
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors; numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
bonded = new CudaBondedUtilities(*this); bonded = new CudaBondedUtilities(*this);
// nonbonded = new CudaNonbondedUtilities(*this); nonbonded = new CudaNonbondedUtilities(*this);
int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) { if (useDoublePrecision) {
posq = CudaArray::create<double4>(paddedNumAtoms, "posq"); posq = CudaArray::create<double4>(paddedNumAtoms, "posq");
velm = CudaArray::create<double4>(paddedNumAtoms, "velm"); velm = CudaArray::create<double4>(paddedNumAtoms, "velm");
compilationDefines["USE_DOUBLE_PRECISION"] = "1";
compilationDefines["make_real2"] = "make_double2"; compilationDefines["make_real2"] = "make_double2";
compilationDefines["make_real3"] = "make_double3"; compilationDefines["make_real3"] = "make_double3";
compilationDefines["make_real4"] = "make_double4"; compilationDefines["make_real4"] = "make_double4";
energyBuffer = CudaArray::create<double>(numEnergyBuffers, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else { else {
posq = CudaArray::create<float4>(paddedNumAtoms, "posq"); posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
...@@ -153,6 +157,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -153,6 +157,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["make_real2"] = "make_float2"; compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3"; compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4"; compilationDefines["make_real4"] = "make_float4";
energyBuffer = CudaArray::create<float>(numEnergyBuffers, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
} }
posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0)); posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
...@@ -191,6 +198,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -191,6 +198,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
} }
CudaContext::~CudaContext() { CudaContext::~CudaContext() {
cuCtxSetCurrent(context);
for (int i = 0; i < (int) forces.size(); i++) for (int i = 0; i < (int) forces.size(); i++)
delete forces[i]; delete forces[i];
for (int i = 0; i < (int) reorderListeners.size(); i++) for (int i = 0; i < (int) reorderListeners.size(); i++)
...@@ -211,8 +219,8 @@ CudaContext::~CudaContext() { ...@@ -211,8 +219,8 @@ CudaContext::~CudaContext() {
delete expression; delete expression;
if (bonded != NULL) if (bonded != NULL)
delete bonded; delete bonded;
// if (nonbonded != NULL) if (nonbonded != NULL)
// delete nonbonded; delete nonbonded;
if (thread != NULL) if (thread != NULL)
delete thread; delete thread;
string errorMessage = "Error deleting Context"; string errorMessage = "Error deleting Context";
...@@ -221,17 +229,8 @@ CudaContext::~CudaContext() { ...@@ -221,17 +229,8 @@ CudaContext::~CudaContext() {
} }
void CudaContext::initialize() { void CudaContext::initialize() {
cuCtxSetCurrent(context);
string errorMessage = "Error initializing Context"; string errorMessage = "Error initializing Context";
if (useDoublePrecision) {
energyBuffer = CudaArray::create<double>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*4, numThreadBlocks*ThreadBlockSize);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
}
else {
energyBuffer = CudaArray::create<float>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*6, numThreadBlocks*ThreadBlockSize);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
}
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i); double mass = system.getParticleMass(i);
if (useDoublePrecision) if (useDoublePrecision)
...@@ -251,7 +250,7 @@ void CudaContext::initialize() { ...@@ -251,7 +250,7 @@ void CudaContext::initialize() {
atomIndexDevice->upload(atomIndex); atomIndexDevice->upload(atomIndex);
findMoleculeGroups(); findMoleculeGroups();
moleculesInvalid = false; moleculesInvalid = false;
// nonbonded->initialize(system); nonbonded->initialize(system);
} }
void CudaContext::addForce(CudaForceInfo* force) { void CudaContext::addForce(CudaForceInfo* force) {
...@@ -719,226 +718,226 @@ void CudaContext::invalidateMolecules() { ...@@ -719,226 +718,226 @@ void CudaContext::invalidateMolecules() {
moleculesInvalid = true; moleculesInvalid = true;
} }
//void OpenCLContext::validateMolecules() { void CudaContext::validateMolecules() {
// moleculesInvalid = false; moleculesInvalid = false;
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff()) if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
// return; return;
// bool valid = true; bool valid = true;
// for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) { for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
// MoleculeGroup& mol = moleculeGroups[group]; MoleculeGroup& mol = moleculeGroups[group];
// vector<int>& instances = mol.instances; vector<int>& instances = mol.instances;
// vector<int>& offsets = mol.offsets; vector<int>& offsets = mol.offsets;
// vector<int>& atoms = mol.atoms; vector<int>& atoms = mol.atoms;
// int numMolecules = instances.size(); int numMolecules = instances.size();
// Molecule& m1 = molecules[instances[0]]; Molecule& m1 = molecules[instances[0]];
// int offset1 = offsets[0]; int offset1 = offsets[0];
// for (int j = 1; valid && j < numMolecules; j++) { for (int j = 1; valid && j < numMolecules; j++) {
// // See if the atoms are identical. // See if the atoms are identical.
//
// Molecule& m2 = molecules[instances[j]]; Molecule& m2 = molecules[instances[j]];
// int offset2 = offsets[j]; int offset2 = offsets[j];
// for (int i = 0; i < (int) atoms.size() && valid; i++) { for (int i = 0; i < (int) atoms.size() && valid; i++) {
// for (int k = 0; k < (int) forces.size(); k++) for (int k = 0; k < (int) forces.size(); k++)
// if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2)) if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
// valid = false; valid = false;
// } }
//
// // See if the force groups are identical. // See if the force groups are identical.
//
// for (int i = 0; i < (int) forces.size() && valid; i++) { for (int i = 0; i < (int) forces.size() && valid; i++) {
// for (int k = 0; k < (int) m1.groups[i].size() && valid; k++) for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
// if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k])) if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
// valid = false; valid = false;
// } }
// } }
// } }
// if (valid) if (valid)
// return; return;
//
// // The list of which molecules are identical is no longer valid. We need to restore the // The list of which molecules are identical is no longer valid. We need to restore the
// // atoms to their original order, rebuild the list of identical molecules, and sort them // atoms to their original order, rebuild the list of identical molecules, and sort them
// // again. // again.
//
// vector<mm_float4> newPosq(numAtoms); vector<float4> oldPosq(paddedNumAtoms);
// vector<mm_float4> newVelm(numAtoms); vector<float4> newPosq(paddedNumAtoms);
// vector<mm_int4> newCellOffsets(numAtoms); vector<float4> oldVelm(paddedNumAtoms);
// posq->download(); vector<float4> newVelm(paddedNumAtoms);
// velm->download(); vector<int4> newCellOffsets(numAtoms);
// for (int i = 0; i < numAtoms; i++) { posq->download(oldPosq);
// int index = atomIndex->get(i); velm->download(oldVelm);
// newPosq[index] = posq->get(i); for (int i = 0; i < numAtoms; i++) {
// newVelm[index] = velm->get(i); int index = atomIndex[i];
// newCellOffsets[index] = posCellOffsets[i]; newPosq[index] = oldPosq[i];
// } newVelm[index] = oldVelm[i];
// for (int i = 0; i < numAtoms; i++) { newCellOffsets[index] = posCellOffsets[i];
// posq->set(i, newPosq[i]); }
// velm->set(i, newVelm[i]); for (int i = 0; i < numAtoms; i++) {
// atomIndex->set(i, i); atomIndex[i] = i;
// posCellOffsets[i] = newCellOffsets[i]; posCellOffsets[i] = newCellOffsets[i];
// } }
// posq->upload(); posq->upload(newPosq);
// velm->upload(); velm->upload(newVelm);
// atomIndex->upload(); atomIndexDevice->upload(atomIndex);
// findMoleculeGroups(); findMoleculeGroups();
// for (int i = 0; i < (int) reorderListeners.size(); i++) for (int i = 0; i < (int) reorderListeners.size(); i++)
// reorderListeners[i]->execute(); reorderListeners[i]->execute();
//} }
//
//void OpenCLContext::reorderAtoms(bool enforcePeriodic) { void CudaContext::reorderAtoms(bool enforcePeriodic) {
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff()) if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
// return; return;
// if (moleculesInvalid) if (moleculesInvalid)
// validateMolecules(); validateMolecules();
// atomsWereReordered = true; atomsWereReordered = true;
//
// // Find the range of positions and the number of bins along each axis. // Find the range of positions and the number of bins along each axis.
//
// posq->download(); vector<float4> oldPosq(paddedNumAtoms);
// velm->download(); vector<float4> oldVelm(paddedNumAtoms);
// float minx = posq->get(0).x, maxx = posq->get(0).x; posq->download(oldPosq);
// float miny = posq->get(0).y, maxy = posq->get(0).y; velm->download(oldVelm);
// float minz = posq->get(0).z, maxz = posq->get(0).z; float minx = oldPosq[0].x, maxx = oldPosq[0].x;
// if (nonbonded->getUsePeriodic()) { float miny = oldPosq[0].y, maxy = oldPosq[0].y;
// minx = miny = minz = 0.0; float minz = oldPosq[0].z, maxz = oldPosq[0].z;
// maxx = periodicBoxSize.x; if (nonbonded->getUsePeriodic()) {
// maxy = periodicBoxSize.y; minx = miny = minz = 0.0;
// maxz = periodicBoxSize.z; maxx = periodicBoxSize.x;
// } maxy = periodicBoxSize.y;
// else { maxz = periodicBoxSize.z;
// for (int i = 1; i < numAtoms; i++) { }
// const mm_float4& pos = posq->get(i); else {
// minx = min(minx, pos.x); for (int i = 1; i < numAtoms; i++) {
// maxx = max(maxx, pos.x); const float4& pos = oldPosq[i];
// miny = min(miny, pos.y); minx = min(minx, pos.x);
// maxy = max(maxy, pos.y); maxx = max(maxx, pos.x);
// minz = min(minz, pos.z); miny = min(miny, pos.y);
// maxz = max(maxz, pos.z); maxy = max(maxy, pos.y);
// } minz = min(minz, pos.z);
// } maxz = max(maxz, pos.z);
// }
// // Loop over each group of identical molecules and reorder them. }
//
// vector<int> originalIndex(numAtoms); // Loop over each group of identical molecules and reorder them.
// vector<mm_float4> newPosq(numAtoms);
// vector<mm_float4> newVelm(numAtoms); vector<int> originalIndex(numAtoms);
// vector<mm_int4> newCellOffsets(numAtoms); vector<float4> newPosq(paddedNumAtoms);
// for (int group = 0; group < (int) moleculeGroups.size(); group++) { vector<float4> newVelm(paddedNumAtoms);
// // Find the center of each molecule. vector<int4> newCellOffsets(numAtoms);
// for (int group = 0; group < (int) moleculeGroups.size(); group++) {
// MoleculeGroup& mol = moleculeGroups[group]; // Find the center of each molecule.
// int numMolecules = mol.offsets.size();
// vector<int>& atoms = mol.atoms; MoleculeGroup& mol = moleculeGroups[group];
// vector<mm_float4> molPos(numMolecules); int numMolecules = mol.offsets.size();
// float invNumAtoms = 1.0f/atoms.size(); vector<int>& atoms = mol.atoms;
// for (int i = 0; i < numMolecules; i++) { vector<float4> molPos(numMolecules);
// molPos[i].x = 0.0f; float invNumAtoms = 1.0f/atoms.size();
// molPos[i].y = 0.0f; for (int i = 0; i < numMolecules; i++) {
// molPos[i].z = 0.0f; molPos[i].x = 0.0f;
// for (int j = 0; j < (int)atoms.size(); j++) { molPos[i].y = 0.0f;
// int atom = atoms[j]+mol.offsets[i]; molPos[i].z = 0.0f;
// const mm_float4& pos = posq->get(atom); for (int j = 0; j < (int)atoms.size(); j++) {
// molPos[i].x += pos.x; int atom = atoms[j]+mol.offsets[i];
// molPos[i].y += pos.y; const float4& pos = oldPosq[atom];
// molPos[i].z += pos.z; molPos[i].x += pos.x;
// } molPos[i].y += pos.y;
// molPos[i].x *= invNumAtoms; molPos[i].z += pos.z;
// molPos[i].y *= invNumAtoms; }
// molPos[i].z *= invNumAtoms; molPos[i].x *= invNumAtoms;
// } molPos[i].y *= invNumAtoms;
// if (nonbonded->getUsePeriodic()) { molPos[i].z *= invNumAtoms;
// // Move each molecule position into the same box. }
// if (nonbonded->getUsePeriodic()) {
// for (int i = 0; i < numMolecules; i++) { // Move each molecule position into the same box.
// int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
// int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y); for (int i = 0; i < numMolecules; i++) {
// int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z); int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
// float dx = xcell*periodicBoxSize.x; int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y);
// float dy = ycell*periodicBoxSize.y; int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z);
// float dz = zcell*periodicBoxSize.z; float dx = xcell*periodicBoxSize.x;
// if (dx != 0.0f || dy != 0.0f || dz != 0.0f) { float dy = ycell*periodicBoxSize.y;
// molPos[i].x -= dx; float dz = zcell*periodicBoxSize.z;
// molPos[i].y -= dy; if (dx != 0.0f || dy != 0.0f || dz != 0.0f) {
// molPos[i].z -= dz; molPos[i].x -= dx;
// if (enforcePeriodic) { molPos[i].y -= dy;
// for (int j = 0; j < (int) atoms.size(); j++) { molPos[i].z -= dz;
// int atom = atoms[j]+mol.offsets[i]; if (enforcePeriodic) {
// mm_float4 p = posq->get(atom); for (int j = 0; j < (int) atoms.size(); j++) {
// p.x -= dx; int atom = atoms[j]+mol.offsets[i];
// p.y -= dy; float4 p = oldPosq[atom];
// p.z -= dz; p.x -= dx;
// posq->set(atom, p); p.y -= dy;
// posCellOffsets[atom].x -= xcell; p.z -= dz;
// posCellOffsets[atom].y -= ycell; oldPosq[atom] = p;
// posCellOffsets[atom].z -= zcell; posCellOffsets[atom].x -= xcell;
// } posCellOffsets[atom].y -= ycell;
// } posCellOffsets[atom].z -= zcell;
// } }
// } }
// } }
// }
// // Select a bin for each molecule, then sort them by bin. }
//
// bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve. // Select a bin for each molecule, then sort them by bin.
// float binWidth;
// if (useHilbert) bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
// binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0); float binWidth;
// else if (useHilbert)
// binWidth = (float)(0.2*nonbonded->getCutoffDistance()); binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
// float invBinWidth = 1.0f/binWidth; else
// int xbins = 1 + (int) ((maxx-minx)*invBinWidth); binWidth = (float)(0.2*nonbonded->getCutoffDistance());
// int ybins = 1 + (int) ((maxy-miny)*invBinWidth); float invBinWidth = 1.0f/binWidth;
// vector<pair<int, int> > molBins(numMolecules); int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
// bitmask_t coords[3]; int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
// for (int i = 0; i < numMolecules; i++) { vector<pair<int, int> > molBins(numMolecules);
// int x = (int) ((molPos[i].x-minx)*invBinWidth); bitmask_t coords[3];
// int y = (int) ((molPos[i].y-miny)*invBinWidth); for (int i = 0; i < numMolecules; i++) {
// int z = (int) ((molPos[i].z-minz)*invBinWidth); int x = (int) ((molPos[i].x-minx)*invBinWidth);
// int bin; int y = (int) ((molPos[i].y-miny)*invBinWidth);
// if (useHilbert) { int z = (int) ((molPos[i].z-minz)*invBinWidth);
// coords[0] = x; int bin;
// coords[1] = y; if (useHilbert) {
// coords[2] = z; coords[0] = x;
// bin = (int) hilbert_c2i(3, 8, coords); coords[1] = y;
// } coords[2] = z;
// else { bin = (int) hilbert_c2i(3, 8, coords);
// int yodd = y&1; }
// int zodd = z&1; else {
// bin = z*xbins*ybins; int yodd = y&1;
// bin += (zodd ? ybins-y : y)*xbins; int zodd = z&1;
// bin += (yodd ? xbins-x : x); bin = z*xbins*ybins;
// } bin += (zodd ? ybins-y : y)*xbins;
// molBins[i] = pair<int, int>(bin, i); bin += (yodd ? xbins-x : x);
// } }
// sort(molBins.begin(), molBins.end()); molBins[i] = pair<int, int>(bin, i);
// }
// // Reorder the atoms. sort(molBins.begin(), molBins.end());
//
// for (int i = 0; i < numMolecules; i++) { // Reorder the atoms.
// for (int j = 0; j < (int)atoms.size(); j++) {
// int oldIndex = mol.offsets[molBins[i].second]+atoms[j]; for (int i = 0; i < numMolecules; i++) {
// int newIndex = mol.offsets[i]+atoms[j]; for (int j = 0; j < (int)atoms.size(); j++) {
// originalIndex[newIndex] = atomIndex->get(oldIndex); int oldIndex = mol.offsets[molBins[i].second]+atoms[j];
// newPosq[newIndex] = posq->get(oldIndex); int newIndex = mol.offsets[i]+atoms[j];
// newVelm[newIndex] = velm->get(oldIndex); originalIndex[newIndex] = atomIndex[oldIndex];
// newCellOffsets[newIndex] = posCellOffsets[oldIndex]; newPosq[newIndex] = oldPosq[oldIndex];
// } newVelm[newIndex] = oldVelm[oldIndex];
// } newCellOffsets[newIndex] = posCellOffsets[oldIndex];
// } }
// }
// // Update the streams. }
//
// for (int i = 0; i < numAtoms; i++) { // Update the streams.
// posq->set(i, newPosq[i]);
// velm->set(i, newVelm[i]); for (int i = 0; i < numAtoms; i++) {
// atomIndex->set(i, originalIndex[i]); atomIndex[i] = originalIndex[i];
// posCellOffsets[i] = newCellOffsets[i]; posCellOffsets[i] = newCellOffsets[i];
// } }
// posq->upload(); posq->upload(newPosq);
// velm->upload(); velm->upload(newVelm);
// atomIndex->upload(); atomIndexDevice->upload(atomIndex);
// for (int i = 0; i < (int) reorderListeners.size(); i++) for (int i = 0; i < (int) reorderListeners.size(); i++)
// reorderListeners[i]->execute(); reorderListeners[i]->execute();
//} }
struct CudaContext::WorkThread::ThreadData { struct CudaContext::WorkThread::ThreadData {
ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting, bool& finished, ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting, bool& finished,
......
...@@ -324,6 +324,8 @@ public: ...@@ -324,6 +324,8 @@ public:
void setPeriodicBoxSize(double xsize, double ysize, double zsize) { void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
periodicBoxSize = make_double4(xsize, ysize, zsize, 0.0); periodicBoxSize = make_double4(xsize, ysize, zsize, 0.0);
invPeriodicBoxSize = make_double4(1.0/xsize, 1.0/ysize, 1.0/zsize, 0.0); invPeriodicBoxSize = make_double4(1.0/xsize, 1.0/ysize, 1.0/zsize, 0.0);
periodicBoxSizeFloat = make_float4((float) xsize, (float) ysize, (float) zsize, 0.0f);
invPeriodicBoxSizeFloat = make_float4(1.0f/(float) xsize, 1.0f/(float) ysize, 1.0f/(float) zsize, 0.0f);
} }
/** /**
* Get the inverse of the size of the periodic box. * Get the inverse of the size of the periodic box.
...@@ -331,6 +333,20 @@ public: ...@@ -331,6 +333,20 @@ public:
double4 getInvPeriodicBoxSize() const { double4 getInvPeriodicBoxSize() const {
return invPeriodicBoxSize; return invPeriodicBoxSize;
} }
/**
* Get a pointer to the size of the periodic box, represented as either a float4 or double4 depending on
* this context's precision. This value is suitable for passing to kernels as an argument.
*/
void* getPeriodicBoxSizePointer() {
return (useDoublePrecision ? reinterpret_cast<void*>(&periodicBoxSize) : reinterpret_cast<void*>(&periodicBoxSizeFloat));
}
/**
* Get a pointer to the inverse of the size of the periodic box, represented as either a float4 or double4 depending on
* this context's precision. This value is suitable for passing to kernels as an argument.
*/
void* getInvPeriodicBoxSizePointer() {
return (useDoublePrecision ? reinterpret_cast<void*>(&invPeriodicBoxSize) : reinterpret_cast<void*>(&invPeriodicBoxSizeFloat));
}
/** /**
* Get the CudaIntegrationUtilities for this context. * Get the CudaIntegrationUtilities for this context.
*/ */
...@@ -349,12 +365,12 @@ public: ...@@ -349,12 +365,12 @@ public:
CudaBondedUtilities& getBondedUtilities() { CudaBondedUtilities& getBondedUtilities() {
return *bonded; return *bonded;
} }
// /** /**
// * Get the CudaNonbondedUtilities for this context. * Get the CudaNonbondedUtilities for this context.
// */ */
// CudaNonbondedUtilities& getNonbondedUtilities() { CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded; return *nonbonded;
// } }
/** /**
* Get the thread used by this context for executing parallel computations. * Get the thread used by this context for executing parallel computations.
*/ */
...@@ -429,8 +445,8 @@ private: ...@@ -429,8 +445,8 @@ private:
int numThreadBlocks; int numThreadBlocks;
bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid; bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
std::string compiler, tempDir, gpuArchitecture; std::string compiler, tempDir, gpuArchitecture;
double4 periodicBoxSize; float4 periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
double4 invPeriodicBoxSize; double4 periodicBoxSize, invPeriodicBoxSize;
std::string defaultOptimizationOptions; std::string defaultOptimizationOptions;
std::map<std::string, std::string> compilationDefines; std::map<std::string, std::string> compilationDefines;
CUcontext context; CUcontext context;
...@@ -458,7 +474,7 @@ private: ...@@ -458,7 +474,7 @@ private:
CudaIntegrationUtilities* integration; CudaIntegrationUtilities* integration;
CudaExpressionUtilities* expression; CudaExpressionUtilities* expression;
CudaBondedUtilities* bonded; CudaBondedUtilities* bonded;
// CudaNonbondedUtilities* nonbonded; CudaNonbondedUtilities* nonbonded;
WorkThread* thread; WorkThread* thread;
}; };
......
...@@ -92,8 +92,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform ...@@ -92,8 +92,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
return new CudaCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomTorsionForceKernel::Name()) if (name == CalcCustomTorsionForceKernel::Name())
return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
// if (name == CalcNonbondedForceKernel::Name()) if (name == CalcNonbondedForceKernel::Name())
// return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem()); return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
// if (name == CalcCustomNonbondedForceKernel::Name()) // if (name == CalcCustomNonbondedForceKernel::Name())
// return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem()); // return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
// if (name == CalcGBSAOBCForceKernel::Name()) // if (name == CalcGBSAOBCForceKernel::Name())
......
...@@ -83,23 +83,23 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -83,23 +83,23 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) { void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
cuCtxSetCurrent(cu.getContext()); cuCtxSetCurrent(cu.getContext());
// CudaNonbondedUtilities& nb = cu.getNonbondedUtilities(); CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
// bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0); bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
// cu.setAtomsWereReordered(false); cu.setAtomsWereReordered(false);
// if (nb.getUseCutoff() && includeNonbonded && (cu.getMoleculesAreInvalid() || cu.getComputeForceCount()%100 == 0)) { if (nb.getUseCutoff() && includeNonbonded && (cu.getMoleculesAreInvalid() || cu.getComputeForceCount()%100 == 0)) {
// cu.reorderAtoms(!cu.getMoleculesAreInvalid()); cu.reorderAtoms(!cu.getMoleculesAreInvalid());
// nb.updateNeighborListSize(); nb.updateNeighborListSize();
// } }
cu.setComputeForceCount(cu.getComputeForceCount()+1); cu.setComputeForceCount(cu.getComputeForceCount()+1);
cu.clearAutoclearBuffers(); cu.clearAutoclearBuffers();
// if (includeNonbonded) if (includeNonbonded)
// nb.prepareInteractions(); nb.prepareInteractions();
} }
double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) { double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
cu.getBondedUtilities().computeInteractions(groups); cu.getBondedUtilities().computeInteractions(groups);
// if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0) if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0)
// cu.getNonbondedUtilities().computeInteractions(); cu.getNonbondedUtilities().computeInteractions();
cu.getIntegrationUtilities().distributeForcesFromVirtualSites(); cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
double sum = 0.0; double sum = 0.0;
if (includeEnergy) { if (includeEnergy) {
...@@ -334,8 +334,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) { ...@@ -334,8 +334,8 @@ void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
// hasInitializedKernel = true; // hasInitializedKernel = true;
// map<string, string> defines; // map<string, string> defines;
// defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); // defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
// cu::Program program = cu.createProgram(CudaKernelSources::constraints, defines); // CUmodule module = cu.createModule(CudaKernelSources::constraints, defines);
// applyDeltasKernel = cu::Kernel(program, "applyPositionDeltas"); // applyDeltasKernel = cu.getKernel(module, "applyPositionDeltas");
// applyDeltasKernel.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer()); // applyDeltasKernel.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer());
// applyDeltasKernel.setArg<cu::Buffer>(1, cu.getIntegrationUtilities().getPosDelta().getDevicePointer()); // applyDeltasKernel.setArg<cu::Buffer>(1, cu.getIntegrationUtilities().getPosDelta().getDevicePointer());
// } // }
...@@ -380,11 +380,13 @@ private: ...@@ -380,11 +380,13 @@ private:
}; };
/**
 * Destroy the harmonic bond kernel, releasing the params object it deletes here.
 */
CudaCalcHarmonicBondForceKernel::~CudaCalcHarmonicBondForceKernel() {
    // Bind this kernel's CUDA context to the calling thread before releasing
    // anything (presumably so the release happens in the owning context --
    // confirm against the params type's destructor).
    cuCtxSetCurrent(cu.getContext());
    if (params)
        delete params;
}
void CudaCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) { void CudaCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
cuCtxSetCurrent(cu.getContext());
int numContexts = cu.getPlatformData().contexts.size(); int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts; int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts; int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
...@@ -465,6 +467,7 @@ private: ...@@ -465,6 +467,7 @@ private:
}; };
CudaCalcCustomBondForceKernel::~CudaCalcCustomBondForceKernel() { CudaCalcCustomBondForceKernel::~CudaCalcCustomBondForceKernel() {
cuCtxSetCurrent(cu.getContext());
if (params != NULL) if (params != NULL)
delete params; delete params;
if (globals != NULL) if (globals != NULL)
...@@ -472,6 +475,7 @@ CudaCalcCustomBondForceKernel::~CudaCalcCustomBondForceKernel() { ...@@ -472,6 +475,7 @@ CudaCalcCustomBondForceKernel::~CudaCalcCustomBondForceKernel() {
} }
void CudaCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) { void CudaCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
cuCtxSetCurrent(cu.getContext());
int numContexts = cu.getPlatformData().contexts.size(); int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts; int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts; int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
...@@ -525,7 +529,7 @@ void CudaCalcCustomBondForceKernel::initialize(const System& system, const Custo ...@@ -525,7 +529,7 @@ void CudaCalcCustomBondForceKernel::initialize(const System& system, const Custo
} }
stringstream compute; stringstream compute;
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i]; CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType()); string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType());
compute<<buffer.getType()<<" bondParams"<<(i+1)<<" = "<<argName<<"[index];\n"; compute<<buffer.getType()<<" bondParams"<<(i+1)<<" = "<<argName<<"[index];\n";
} }
...@@ -605,11 +609,13 @@ private: ...@@ -605,11 +609,13 @@ private:
}; };
/**
 * Destroy the harmonic angle kernel, releasing the params object it deletes here.
 */
CudaCalcHarmonicAngleForceKernel::~CudaCalcHarmonicAngleForceKernel() {
    // Bind this kernel's CUDA context to the calling thread before releasing
    // anything (presumably so the release happens in the owning context --
    // confirm against the params type's destructor).
    cuCtxSetCurrent(cu.getContext());
    if (params)
        delete params;
}
void CudaCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) { void CudaCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
cuCtxSetCurrent(cu.getContext());
int numContexts = cu.getPlatformData().contexts.size(); int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts; int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts; int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
...@@ -692,6 +698,7 @@ private: ...@@ -692,6 +698,7 @@ private:
}; };
CudaCalcCustomAngleForceKernel::~CudaCalcCustomAngleForceKernel() { CudaCalcCustomAngleForceKernel::~CudaCalcCustomAngleForceKernel() {
cuCtxSetCurrent(cu.getContext());
if (params != NULL) if (params != NULL)
delete params; delete params;
if (globals != NULL) if (globals != NULL)
...@@ -699,6 +706,7 @@ CudaCalcCustomAngleForceKernel::~CudaCalcCustomAngleForceKernel() { ...@@ -699,6 +706,7 @@ CudaCalcCustomAngleForceKernel::~CudaCalcCustomAngleForceKernel() {
} }
void CudaCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) { void CudaCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
cuCtxSetCurrent(cu.getContext());
int numContexts = cu.getPlatformData().contexts.size(); int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts; int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts; int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
...@@ -752,7 +760,7 @@ void CudaCalcCustomAngleForceKernel::initialize(const System& system, const Cust ...@@ -752,7 +760,7 @@ void CudaCalcCustomAngleForceKernel::initialize(const System& system, const Cust
} }
stringstream compute; stringstream compute;
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i]; CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType()); string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType());
compute<<buffer.getType()<<" angleParams"<<(i+1)<<" = "<<argName<<"[index];\n"; compute<<buffer.getType()<<" angleParams"<<(i+1)<<" = "<<argName<<"[index];\n";
} }
...@@ -838,6 +846,7 @@ CudaCalcPeriodicTorsionForceKernel::~CudaCalcPeriodicTorsionForceKernel() { ...@@ -838,6 +846,7 @@ CudaCalcPeriodicTorsionForceKernel::~CudaCalcPeriodicTorsionForceKernel() {
} }
void CudaCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) { void CudaCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
cuCtxSetCurrent(cu.getContext());
int numContexts = cu.getPlatformData().contexts.size(); int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts; int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts; int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
...@@ -925,6 +934,7 @@ CudaCalcRBTorsionForceKernel::~CudaCalcRBTorsionForceKernel() { ...@@ -925,6 +934,7 @@ CudaCalcRBTorsionForceKernel::~CudaCalcRBTorsionForceKernel() {
} }
void CudaCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) { void CudaCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
cuCtxSetCurrent(cu.getContext());
int numContexts = cu.getPlatformData().contexts.size(); int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts; int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts; int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
...@@ -1024,6 +1034,7 @@ CudaCalcCMAPTorsionForceKernel::~CudaCalcCMAPTorsionForceKernel() { ...@@ -1024,6 +1034,7 @@ CudaCalcCMAPTorsionForceKernel::~CudaCalcCMAPTorsionForceKernel() {
} }
void CudaCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) { void CudaCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
cuCtxSetCurrent(cu.getContext());
int numContexts = cu.getPlatformData().contexts.size(); int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts; int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts; int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
...@@ -1110,6 +1121,7 @@ CudaCalcCustomTorsionForceKernel::~CudaCalcCustomTorsionForceKernel() { ...@@ -1110,6 +1121,7 @@ CudaCalcCustomTorsionForceKernel::~CudaCalcCustomTorsionForceKernel() {
} }
void CudaCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) { void CudaCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
cuCtxSetCurrent(cu.getContext());
int numContexts = cu.getPlatformData().contexts.size(); int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts; int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts; int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
...@@ -1163,7 +1175,7 @@ void CudaCalcCustomTorsionForceKernel::initialize(const System& system, const Cu ...@@ -1163,7 +1175,7 @@ void CudaCalcCustomTorsionForceKernel::initialize(const System& system, const Cu
} }
stringstream compute; stringstream compute;
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i]; CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType()); string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType());
compute<<buffer.getType()<<" torsionParams"<<(i+1)<<" = "<<argName<<"[index];\n"; compute<<buffer.getType()<<" torsionParams"<<(i+1)<<" = "<<argName<<"[index];\n";
} }
...@@ -1215,475 +1227,449 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -1215,475 +1227,449 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
cu.invalidateMolecules(); cu.invalidateMolecules();
} }
//class CudaNonbondedForceInfo : public CudaForceInfo { class CudaNonbondedForceInfo : public CudaForceInfo {
//public: public:
// CudaNonbondedForceInfo(int requiredBuffers, const NonbondedForce& force) : CudaForceInfo(requiredBuffers), force(force) { CudaNonbondedForceInfo(const NonbondedForce& force) : force(force) {
// } }
// bool areParticlesIdentical(int particle1, int particle2) { bool areParticlesIdentical(int particle1, int particle2) {
// double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2; double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2;
// force.getParticleParameters(particle1, charge1, sigma1, epsilon1); force.getParticleParameters(particle1, charge1, sigma1, epsilon1);
// force.getParticleParameters(particle2, charge2, sigma2, epsilon2); force.getParticleParameters(particle2, charge2, sigma2, epsilon2);
// return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2); return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2);
// } }
// int getNumParticleGroups() { int getNumParticleGroups() {
// return force.getNumExceptions(); return force.getNumExceptions();
// } }
// void getParticlesInGroup(int index, vector<int>& particles) { void getParticlesInGroup(int index, vector<int>& particles) {
// int particle1, particle2; int particle1, particle2;
// double chargeProd, sigma, epsilon; double chargeProd, sigma, epsilon;
// force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon); force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon);
// particles.resize(2); particles.resize(2);
// particles[0] = particle1; particles[0] = particle1;
// particles[1] = particle2; particles[1] = particle2;
// } }
// bool areGroupsIdentical(int group1, int group2) { bool areGroupsIdentical(int group1, int group2) {
// int particle1, particle2; int particle1, particle2;
// double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2; double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2;
// force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1); force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1);
// force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2); force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2);
// return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2); return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2);
// } }
//private: private:
// const NonbondedForce& force; const NonbondedForce& force;
//}; };
//
//CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
// if (sigmaEpsilon != NULL) cuCtxSetCurrent(cu.getContext());
// delete sigmaEpsilon; if (sigmaEpsilon != NULL)
// if (exceptionParams != NULL) delete sigmaEpsilon;
// delete exceptionParams; if (exceptionParams != NULL)
// if (cosSinSums != NULL) delete exceptionParams;
// delete cosSinSums; if (cosSinSums != NULL)
// if (pmeGrid != NULL) delete cosSinSums;
// delete pmeGrid; if (pmeGrid != NULL)
// if (pmeGrid2 != NULL) delete pmeGrid;
// delete pmeGrid2; if (pmeBsplineModuliX != NULL)
// if (pmeBsplineModuliX != NULL) delete pmeBsplineModuliX;
// delete pmeBsplineModuliX; if (pmeBsplineModuliY != NULL)
// if (pmeBsplineModuliY != NULL) delete pmeBsplineModuliY;
// delete pmeBsplineModuliY; if (pmeBsplineModuliZ != NULL)
// if (pmeBsplineModuliZ != NULL) delete pmeBsplineModuliZ;
// delete pmeBsplineModuliZ; if (pmeBsplineTheta != NULL)
// if (pmeBsplineTheta != NULL) delete pmeBsplineTheta;
// delete pmeBsplineTheta; if (pmeBsplineDTheta != NULL)
// if (pmeBsplineDTheta != NULL) delete pmeBsplineDTheta;
// delete pmeBsplineDTheta; if (pmeAtomRange != NULL)
// if (pmeAtomRange != NULL) delete pmeAtomRange;
// delete pmeAtomRange; if (pmeAtomGridIndex != NULL)
// if (pmeAtomGridIndex != NULL) delete pmeAtomGridIndex;
// delete pmeAtomGridIndex; if (sort != NULL)
// if (sort != NULL) delete sort;
// delete sort; if (hasInitializedFFT)
// if (fft != NULL) cufftDestroy(fft);
// delete fft; }
//}
/**
 * Find the smallest FFT dimension that is no less than the requested minimum
 * and whose only prime factors are 2, 3, 5, and 7.
 *
 * @param minimum  the smallest acceptable dimension
 * @return the first integer >= minimum of the form 2^a * 3^b * 5^c * 7^d,
 *         or 1 if minimum is less than 1
 */
static int findFFTDimension(int minimum) {
    if (minimum < 1)
        return 1;
    static const int primes[] = {2, 3, 5, 7};
    for (int candidate = minimum; ; candidate++) {
        // Divide out every factor of 2, 3, 5, and 7.  If nothing but 1 remains,
        // the candidate has no other prime factors and is accepted.
        int unfactored = candidate;
        for (int i = 0; i < 4; i++)
            while (unfactored%primes[i] == 0)
                unfactored /= primes[i];
        if (unfactored == 1)
            return candidate;
    }
}
// sigmaEpsilon = new CudaArray<mm_float2>(cu, numParticles, "sigmaEpsilon");
// CudaArray<mm_float4>& posq = cu.getPosq(); void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
// vector<mm_float2> sigmaEpsilonVector(numParticles); cuCtxSetCurrent(cu.getContext());
// vector<vector<int> > exclusionList(numParticles);
// double sumSquaredCharges = 0.0; // Identify which exceptions are 1-4 interactions.
// hasCoulomb = false;
// hasLJ = false; vector<pair<int, int> > exclusions;
// for (int i = 0; i < numParticles; i++) { vector<int> exceptions;
// double charge, sigma, epsilon; for (int i = 0; i < force.getNumExceptions(); i++) {
// force.getParticleParameters(i, charge, sigma, epsilon); int particle1, particle2;
// posq[i].w = (float) charge; double chargeProd, sigma, epsilon;
// sigmaEpsilonVector[i] = mm_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon))); force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
// exclusionList[i].push_back(i); exclusions.push_back(pair<int, int>(particle1, particle2));
// sumSquaredCharges += charge*charge; if (chargeProd != 0.0 || epsilon != 0.0)
// if (charge != 0.0) exceptions.push_back(i);
// hasCoulomb = true; }
// if (epsilon != 0.0)
// hasLJ = true; // Initialize nonbonded interactions.
// }
// for (int i = 0; i < (int) exclusions.size(); i++) { int numParticles = force.getNumParticles();
// exclusionList[exclusions[i].first].push_back(exclusions[i].second); sigmaEpsilon = CudaArray::create<float2>(numParticles, "sigmaEpsilon");
// exclusionList[exclusions[i].second].push_back(exclusions[i].first); CudaArray& posq = cu.getPosq();
// } float4* posqf = (float4*) cu.getPinnedBuffer();
// posq.upload(); double4* posqd = (double4*) cu.getPinnedBuffer();
// sigmaEpsilon->upload(sigmaEpsilonVector); vector<float2> sigmaEpsilonVector(numParticles);
// bool useCutoff = (force.getNonbondedMethod() != NonbondedForce::NoCutoff); vector<vector<int> > exclusionList(numParticles);
// bool usePeriodic = (force.getNonbondedMethod() != NonbondedForce::NoCutoff && force.getNonbondedMethod() != NonbondedForce::CutoffNonPeriodic); double sumSquaredCharges = 0.0;
// map<string, string> defines; hasCoulomb = false;
// defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0"); hasLJ = false;
// defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0"); for (int i = 0; i < numParticles; i++) {
// if (useCutoff) { double charge, sigma, epsilon;
// // Compute the reaction field constants. force.getParticleParameters(i, charge, sigma, epsilon);
// if (cu.getUseDoublePrecision())
// double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0); posqd[i] = make_double4(0, 0, 0, charge);
// double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0); else
// defines["REACTION_FIELD_K"] = cu.doubleToString(reactionFieldK); posqf[i] = make_float4(0, 0, 0, (float) charge);
// defines["REACTION_FIELD_C"] = cu.doubleToString(reactionFieldC); sigmaEpsilonVector[i] = make_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon)));
// } exclusionList[i].push_back(i);
// if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0) sumSquaredCharges += charge*charge;
// dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force); if (charge != 0.0)
// else hasCoulomb = true;
// dispersionCoefficient = 0.0; if (epsilon != 0.0)
// alpha = 0; hasLJ = true;
// if (force.getNonbondedMethod() == NonbondedForce::Ewald) { }
// // Compute the Ewald parameters. for (int i = 0; i < (int) exclusions.size(); i++) {
// exclusionList[exclusions[i].first].push_back(exclusions[i].second);
// int kmaxx, kmaxy, kmaxz; exclusionList[exclusions[i].second].push_back(exclusions[i].first);
// NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz); }
// defines["EWALD_ALPHA"] = cu.doubleToString(alpha); posq.upload(cu.getPinnedBuffer());
// defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI)); sigmaEpsilon->upload(sigmaEpsilonVector);
// defines["USE_EWALD"] = "1"; bool useCutoff = (force.getNonbondedMethod() != NonbondedForce::NoCutoff);
// ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0); bool usePeriodic = (force.getNonbondedMethod() != NonbondedForce::NoCutoff && force.getNonbondedMethod() != NonbondedForce::CutoffNonPeriodic);
// map<string, string> defines;
// // Create the reciprocal space kernels. defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
// defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
// map<string, string> replacements; if (useCutoff) {
// replacements["NUM_ATOMS"] = cu.intToString(numParticles); // Compute the reaction field constants.
// replacements["KMAX_X"] = cu.intToString(kmaxx);
// replacements["KMAX_Y"] = cu.intToString(kmaxy); double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
// replacements["KMAX_Z"] = cu.intToString(kmaxz); double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
// replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha)); defines["REACTION_FIELD_K"] = cu.doubleToString(reactionFieldK);
// cu::Program program = cu.createProgram(CudaKernelSources::ewald, replacements); defines["REACTION_FIELD_C"] = cu.doubleToString(reactionFieldC);
// ewaldSumsKernel = cu::Kernel(program, "calculateEwaldCosSinSums"); }
// ewaldForcesKernel = cu::Kernel(program, "calculateEwaldForces"); if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0)
// cosSinSums = new CudaArray<mm_float2>(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), "cosSinSums"); dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
// } else
// else if (force.getNonbondedMethod() == NonbondedForce::PME) { dispersionCoefficient = 0.0;
// // Compute the PME parameters. alpha = 0;
// if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
// int gridSizeX, gridSizeY, gridSizeZ; // Compute the Ewald parameters.
// NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
// gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX); int kmaxx, kmaxy, kmaxz;
// gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY); NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
// gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ); defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
// defines["EWALD_ALPHA"] = cu.doubleToString(alpha); defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
// defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI)); defines["USE_EWALD"] = "1";
// defines["USE_EWALD"] = "1"; ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
// ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
// pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder); // Create the reciprocal space kernels.
// pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
// pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha)); map<string, string> replacements;
// pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX); replacements["NUM_ATOMS"] = cu.intToString(numParticles);
// pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY); replacements["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
// pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ); replacements["KMAX_X"] = cu.intToString(kmaxx);
// pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0)); replacements["KMAX_Y"] = cu.intToString(kmaxy);
// replacements["KMAX_Z"] = cu.intToString(kmaxz);
// // Create required data structures. replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
// replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
// pmeGrid = new CudaArray<mm_float2>(cu, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid"); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::ewald, replacements);
// cu.addAutoclearBuffer(pmeGrid->getDevicePointer(), pmeGrid->getSize()*2); ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
// pmeGrid2 = new CudaArray<mm_float2>(cu, gridSizeX*gridSizeY*gridSizeZ, "pmeGrid2"); ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
// pmeBsplineModuliX = new CudaArray<cl_float>(cu, gridSizeX, "pmeBsplineModuliX"); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
// pmeBsplineModuliY = new CudaArray<cl_float>(cu, gridSizeY, "pmeBsplineModuliY"); cosSinSums = new CudaArray((2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
// pmeBsplineModuliZ = new CudaArray<cl_float>(cu, gridSizeZ, "pmeBsplineModuliZ"); }
// pmeBsplineTheta = new CudaArray<mm_float4>(cu, PmeOrder*numParticles, "pmeBsplineTheta"); else if (force.getNonbondedMethod() == NonbondedForce::PME) {
// bool deviceIsCpu = (cu.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); // Compute the PME parameters.
// if (deviceIsCpu)
// pmeBsplineDTheta = new CudaArray<mm_float4>(cu, PmeOrder*numParticles, "pmeBsplineDTheta"); int gridSizeX, gridSizeY, gridSizeZ;
// pmeAtomRange = new CudaArray<cl_int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
// pmeAtomGridIndex = new CudaArray<mm_int2>(cu, numParticles, "pmeAtomGridIndex"); gridSizeX = findFFTDimension(gridSizeX);
// sort = new CudaSort<SortTrait>(cu, cu.getNumAtoms()); gridSizeY = findFFTDimension(gridSizeY);
// fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ); gridSizeZ = findFFTDimension(gridSizeZ);
// defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
// // Initialize the b-spline moduli. defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
// defines["USE_EWALD"] = "1";
// int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ); ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
// vector<double> data(PmeOrder); pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
// vector<double> ddata(PmeOrder); pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
// vector<double> bsplines_data(maxSize); pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
// data[PmeOrder-1] = 0.0; pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
// data[1] = 0.0; pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
// data[0] = 1.0; pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
// for (int i = 3; i < PmeOrder; i++) { pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
// double div = 1.0/(i-1.0); pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
// data[i-1] = 0.0; CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
// for (int j = 1; j < (i-1); j++) pmeUpdateBsplinesKernel = cu.getKernel(module, "updateBsplines");
// data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]); pmeAtomRangeKernel = cu.getKernel(module, "findAtomRangeForGrid");
// data[0] = div*data[0]; pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
// } pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
// pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
// // Differentiate. pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
//
// ddata[0] = -data[0]; // Create required data structures.
// for (int i = 1; i < PmeOrder; i++)
// ddata[i] = data[i-1]-data[i]; int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
// double div = 1.0/(PmeOrder-1); pmeGrid = new CudaArray(gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
// data[PmeOrder-1] = 0.0; cu.addAutoclearBuffer(pmeGrid->getDevicePointer(), pmeGrid->getSize()*sizeof(float2));
// for (int i = 1; i < (PmeOrder-1); i++) pmeBsplineModuliX = new CudaArray(gridSizeX, elementSize, "pmeBsplineModuliX");
// data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]); pmeBsplineModuliY = new CudaArray(gridSizeY, elementSize, "pmeBsplineModuliY");
// data[0] = div*data[0]; pmeBsplineModuliZ = new CudaArray(gridSizeZ, elementSize, "pmeBsplineModuliZ");
// for (int i = 0; i < maxSize; i++) pmeBsplineTheta = new CudaArray(PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
// bsplines_data[i] = 0.0; pmeAtomRange = CudaArray::create<int>(gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
// for (int i = 1; i <= PmeOrder; i++) pmeAtomGridIndex = CudaArray::create<int2>(numParticles, "pmeAtomGridIndex");
// bsplines_data[i] = data[i-1]; sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
// cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, CUFFT_C2C);
// // Evaluate the actual bspline moduli for X/Y/Z. if (result != CUFFT_SUCCESS)
// throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
// for(int dim = 0; dim < 3; dim++) { hasInitializedFFT = true;
// int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
// vector<cl_float> moduli(ndata); // Initialize the b-spline moduli.
// for (int i = 0; i < ndata; i++) {
// double sc = 0.0; int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
// double ss = 0.0; vector<double> data(PmeOrder);
// for (int j = 0; j < ndata; j++) { vector<double> ddata(PmeOrder);
// double arg = (2.0*M_PI*i*j)/ndata; vector<double> bsplines_data(maxSize);
// sc += bsplines_data[j]*cos(arg); data[PmeOrder-1] = 0.0;
// ss += bsplines_data[j]*sin(arg); data[1] = 0.0;
// } data[0] = 1.0;
// moduli[i] = (float) (sc*sc+ss*ss); for (int i = 3; i < PmeOrder; i++) {
// } double div = 1.0/(i-1.0);
// for (int i = 0; i < ndata; i++) data[i-1] = 0.0;
// { for (int j = 1; j < (i-1); j++)
// if (moduli[i] < 1.0e-7) data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
// moduli[i] = (moduli[i-1]+moduli[i+1])*0.5f; data[0] = div*data[0];
// } }
// if (dim == 0)
// pmeBsplineModuliX->upload(moduli); // Differentiate.
// else if (dim == 1)
// pmeBsplineModuliY->upload(moduli); ddata[0] = -data[0];
// else for (int i = 1; i < PmeOrder; i++)
// pmeBsplineModuliZ->upload(moduli); ddata[i] = data[i-1]-data[i];
// } double div = 1.0/(PmeOrder-1);
// } data[PmeOrder-1] = 0.0;
// else for (int i = 1; i < (PmeOrder-1); i++)
// ewaldSelfEnergy = 0.0; data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
// data[0] = div*data[0];
// // Add the interaction to the default nonbonded kernel. for (int i = 0; i < maxSize; i++)
// bsplines_data[i] = 0.0;
// string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines); for (int i = 1; i <= PmeOrder; i++)
// cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup()); bsplines_data[i] = data[i-1];
// if (hasLJ)
// cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(cl_float2), sigmaEpsilon->getDevicePointer())); // Evaluate the actual bspline moduli for X/Y/Z.
//
// // Initialize the exceptions. for(int dim = 0; dim < 3; dim++) {
// int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
// int numContexts = cu.getPlatformData().contexts.size(); vector<double> moduli(ndata);
// int startIndex = cu.getContextIndex()*exceptions.size()/numContexts; for (int i = 0; i < ndata; i++) {
// int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts; double sc = 0.0;
// int numExceptions = endIndex-startIndex; double ss = 0.0;
// if (numExceptions > 0) { for (int j = 0; j < ndata; j++) {
// exceptionAtoms.resize(numExceptions); double arg = (2.0*M_PI*i*j)/ndata;
// vector<vector<int> > atoms(numExceptions, vector<int>(2)); sc += bsplines_data[j]*cos(arg);
// exceptionParams = new CudaArray<mm_float4>(cu, numExceptions, "exceptionParams"); ss += bsplines_data[j]*sin(arg);
// vector<mm_float4> exceptionParamsVector(numExceptions); }
// for (int i = 0; i < numExceptions; i++) { moduli[i] = sc*sc+ss*ss;
// double chargeProd, sigma, epsilon; }
// force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon); for (int i = 0; i < ndata; i++)
// exceptionParamsVector[i] = mm_float4((float) (ONE_4PI_EPS0*chargeProd), (float) sigma, (float) (4.0*epsilon), 0.0f); if (moduli[i] < 1.0e-7)
// exceptionAtoms[i] = make_pair(atoms[i][0], atoms[i][1]); moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
// } if (cu.getUseDoublePrecision()) {
// exceptionParams->upload(exceptionParamsVector); if (dim == 0)
// map<string, string> replacements; pmeBsplineModuliX->upload(moduli);
// replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams->getDevicePointer(), "float4"); else if (dim == 1)
// cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::nonbondedExceptions, replacements), force.getForceGroup()); pmeBsplineModuliY->upload(moduli);
// } else
// cu.addForce(new CudaNonbondedForceInfo(cu.getNonbondedUtilities().getNumForceBuffers(), force)); pmeBsplineModuliZ->upload(moduli);
//} }
// else {
//double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) { vector<float> modulif(ndata);
// bool deviceIsCpu = (cu.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); for (int i = 0; i < ndata; i++)
// if (!hasInitializedKernel) { modulif[i] = (float) moduli[i];
// hasInitializedKernel = true; if (dim == 0)
// if (cosSinSums != NULL) { pmeBsplineModuliX->upload(modulif);
// ewaldSumsKernel.setArg<cu::Buffer>(0, cu.getEnergyBuffer().getDevicePointer()); else if (dim == 1)
// ewaldSumsKernel.setArg<cu::Buffer>(1, cu.getPosq().getDevicePointer()); pmeBsplineModuliY->upload(modulif);
// ewaldSumsKernel.setArg<cu::Buffer>(2, cosSinSums->getDevicePointer()); else
// ewaldForcesKernel.setArg<cu::Buffer>(0, cu.getForceBuffers().getDevicePointer()); pmeBsplineModuliZ->upload(modulif);
// ewaldForcesKernel.setArg<cu::Buffer>(1, cu.getPosq().getDevicePointer()); }
// ewaldForcesKernel.setArg<cu::Buffer>(2, cosSinSums->getDevicePointer()); }
// } }
// if (pmeGrid != NULL) { else
// string file = (deviceIsCpu ? CudaKernelSources::pme_cpu : CudaKernelSources::pme); ewaldSelfEnergy = 0.0;
// cu::Program program = cu.createProgram(file, pmeDefines);
// pmeUpdateBsplinesKernel = cu::Kernel(program, "updateBsplines"); // Add the interaction to the default nonbonded kernel.
// pmeAtomRangeKernel = cu::Kernel(program, "findAtomRangeForGrid");
// if (!deviceIsCpu) string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
// pmeZIndexKernel = cu::Kernel(program, "recordZIndex"); cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
// pmeSpreadChargeKernel = cu::Kernel(program, "gridSpreadCharge"); if (hasLJ)
// pmeConvolutionKernel = cu::Kernel(program, "reciprocalConvolution"); cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon->getDevicePointer()));
// pmeInterpolateForceKernel = cu::Kernel(program, "gridInterpolateForce");
// pmeUpdateBsplinesKernel.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer()); // Initialize the exceptions.
// pmeUpdateBsplinesKernel.setArg<cu::Buffer>(1, pmeBsplineTheta->getDevicePointer());
// pmeUpdateBsplinesKernel.setArg(2, CudaContext::ThreadBlockSize*PmeOrder*sizeof(mm_float4), NULL); int numContexts = cu.getPlatformData().contexts.size();
// pmeUpdateBsplinesKernel.setArg<cu::Buffer>(3, pmeAtomGridIndex->getDevicePointer()); int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
// if (deviceIsCpu) int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
// pmeUpdateBsplinesKernel.setArg<cu::Buffer>(6, pmeBsplineDTheta->getDevicePointer()); int numExceptions = endIndex-startIndex;
// pmeAtomRangeKernel.setArg<cu::Buffer>(0, pmeAtomGridIndex->getDevicePointer()); if (numExceptions > 0) {
// pmeAtomRangeKernel.setArg<cu::Buffer>(1, pmeAtomRange->getDevicePointer()); exceptionAtoms.resize(numExceptions);
// pmeAtomRangeKernel.setArg<cu::Buffer>(2, cu.getPosq().getDevicePointer()); vector<vector<int> > atoms(numExceptions, vector<int>(2));
// if (!deviceIsCpu) { exceptionParams = CudaArray::create<float4>(numExceptions, "exceptionParams");
// pmeZIndexKernel.setArg<cu::Buffer>(0, pmeAtomGridIndex->getDevicePointer()); vector<float4> exceptionParamsVector(numExceptions);
// pmeZIndexKernel.setArg<cu::Buffer>(1, cu.getPosq().getDevicePointer()); for (int i = 0; i < numExceptions; i++) {
// } double chargeProd, sigma, epsilon;
// pmeSpreadChargeKernel.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer()); force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
// pmeSpreadChargeKernel.setArg<cu::Buffer>(1, pmeAtomGridIndex->getDevicePointer()); exceptionParamsVector[i] = make_float4((float) (ONE_4PI_EPS0*chargeProd), (float) sigma, (float) (4.0*epsilon), 0.0f);
// pmeSpreadChargeKernel.setArg<cu::Buffer>(2, pmeAtomRange->getDevicePointer()); exceptionAtoms[i] = make_pair(atoms[i][0], atoms[i][1]);
// pmeSpreadChargeKernel.setArg<cu::Buffer>(3, pmeGrid->getDevicePointer()); }
// pmeSpreadChargeKernel.setArg<cu::Buffer>(4, pmeBsplineTheta->getDevicePointer()); exceptionParams->upload(exceptionParamsVector);
// pmeConvolutionKernel.setArg<cu::Buffer>(0, pmeGrid2->getDevicePointer()); map<string, string> replacements;
// pmeConvolutionKernel.setArg<cu::Buffer>(1, cu.getEnergyBuffer().getDevicePointer()); replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams->getDevicePointer(), "float4");
// pmeConvolutionKernel.setArg<cu::Buffer>(2, pmeBsplineModuliX->getDevicePointer()); cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
// pmeConvolutionKernel.setArg<cu::Buffer>(3, pmeBsplineModuliY->getDevicePointer()); }
// pmeConvolutionKernel.setArg<cu::Buffer>(4, pmeBsplineModuliZ->getDevicePointer()); cu.addForce(new CudaNonbondedForceInfo(force));
// interpolateForceThreads = (cu.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() > 2*128*PmeOrder*sizeof(mm_float4) ? 128 : 64); }
// pmeInterpolateForceKernel.setArg<cu::Buffer>(0, cu.getPosq().getDevicePointer());
// pmeInterpolateForceKernel.setArg<cu::Buffer>(1, cu.getForceBuffers().getDevicePointer()); double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
// pmeInterpolateForceKernel.setArg<cu::Buffer>(2, pmeGrid->getDevicePointer()); if (cosSinSums != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
// if (deviceIsCpu) { void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
// pmeInterpolateForceKernel.setArg<cu::Buffer>(5, pmeBsplineTheta->getDevicePointer()); cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums->getSize());
// pmeInterpolateForceKernel.setArg<cu::Buffer>(6, pmeBsplineDTheta->getDevicePointer()); void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
// } cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
// else }
// pmeInterpolateForceKernel.setArg(5, 2*interpolateForceThreads*PmeOrder*sizeof(mm_float4), NULL); if (pmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
// if (cu.getSupports64BitGlobalAtomics()) { void* bsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
// pmeFinishSpreadChargeKernel = cu::Kernel(program, "finishSpreadCharge"); cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
// pmeFinishSpreadChargeKernel.setArg<cu::Buffer>(0, pmeGrid->getDevicePointer()); int bsplinesSharedSize = cu.ThreadBlockSize*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
// } cu.executeKernel(pmeUpdateBsplinesKernel, bsplinesArgs, cu.getNumAtoms(), cu.ThreadBlockSize, bsplinesSharedSize);
// } sort->sort(*pmeAtomGridIndex);
// } void* rangeArgs[] = {&pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(), &cu.getPosq().getDevicePointer(),
// if (cosSinSums != NULL && cu.getContextIndex() == 0 && includeReciprocal) { cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
// mm_float4 boxSize = cu.getPeriodicBoxSize(); cu.executeKernel(pmeAtomRangeKernel, rangeArgs, cu.getNumAtoms());
// mm_float4 recipBoxSize = mm_float4((float) (2*M_PI/boxSize.x), (float) (2*M_PI/boxSize.y), (float) (2*M_PI/boxSize.z), 0); void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid->getDevicePointer(), &pmeBsplineTheta->getDevicePointer(),
// float recipCoefficient = (float) (ONE_4PI_EPS0*4*M_PI/(boxSize.x*boxSize.y*boxSize.z)); cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
// ewaldSumsKernel.setArg<mm_float4>(3, recipBoxSize); cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
// ewaldSumsKernel.setArg<cl_float>(4, recipCoefficient); void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
// cu.executeKernel(ewaldSumsKernel, cosSinSums->getSize()); cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
// ewaldForcesKernel.setArg<mm_float4>(3, recipBoxSize); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
// ewaldForcesKernel.setArg<cl_float>(4, recipCoefficient); void* convolutionArgs[] = {&pmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(),
// cu.executeKernel(ewaldForcesKernel, cu.getNumAtoms()); &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
// } cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
// if (pmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) { cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
// mm_float4 boxSize = cu.getPeriodicBoxSize(); void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid->getDevicePointer(),
// mm_float4 invBoxSize = cu.getInvPeriodicBoxSize(); cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
// pmeUpdateBsplinesKernel.setArg<mm_float4>(4, boxSize); interpolateForceThreads = 64;
// pmeUpdateBsplinesKernel.setArg<mm_float4>(5, invBoxSize); int interpolateSharedSize = 2*interpolateForceThreads*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double3) : sizeof(float3));
// cu.executeKernel(pmeUpdateBsplinesKernel, cu.getNumAtoms()); cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), interpolateForceThreads, interpolateSharedSize);
// if (deviceIsCpu) { }
// pmeSpreadChargeKernel.setArg<mm_float4>(5, boxSize); double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
// pmeSpreadChargeKernel.setArg<mm_float4>(6, invBoxSize); if (dispersionCoefficient != 0.0 && includeDirect) {
// cu.executeKernel(pmeSpreadChargeKernel, 2*cu.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1); double4 boxSize = cu.getPeriodicBoxSize();
// } energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
// else { }
// sort->sort(*pmeAtomGridIndex); return energy;
// pmeAtomRangeKernel.setArg<mm_float4>(3, boxSize); }
// pmeAtomRangeKernel.setArg<mm_float4>(4, invBoxSize);
// cu.executeKernel(pmeAtomRangeKernel, cu.getNumAtoms()); void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
// if (cu.getSupports64BitGlobalAtomics()) { // Make sure the new parameters are acceptable.
// pmeSpreadChargeKernel.setArg<mm_float4>(5, boxSize);
// pmeSpreadChargeKernel.setArg<mm_float4>(6, invBoxSize); cuCtxSetCurrent(cu.getContext());
// cu.executeKernel(pmeSpreadChargeKernel, cu.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder); if (force.getNumParticles() != cu.getNumAtoms())
// cu.executeKernel(pmeFinishSpreadChargeKernel, pmeGrid->getSize()); throw OpenMMException("updateParametersInContext: The number of particles has changed");
// } if (!hasCoulomb || !hasLJ) {
// else { for (int i = 0; i < force.getNumParticles(); i++) {
// pmeZIndexKernel.setArg<mm_float4>(2, boxSize); double charge, sigma, epsilon;
// pmeZIndexKernel.setArg<mm_float4>(3, invBoxSize); force.getParticleParameters(i, charge, sigma, epsilon);
// cu.executeKernel(pmeZIndexKernel, cu.getNumAtoms()); if (!hasCoulomb && charge != 0.0)
// cu.executeKernel(pmeSpreadChargeKernel, cu.getNumAtoms()); throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Coulomb interactions, because all charges were originally 0");
// } if (!hasLJ && epsilon != 0.0)
// } throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0");
// fft->execFFT(*pmeGrid, *pmeGrid2, true); }
// pmeConvolutionKernel.setArg<mm_float4>(5, invBoxSize); }
// pmeConvolutionKernel.setArg<cl_float>(6, (float) (1.0/(M_PI*boxSize.x*boxSize.y*boxSize.z))); vector<int> exceptions;
// cu.executeKernel(pmeConvolutionKernel, cu.getNumAtoms()); for (int i = 0; i < force.getNumExceptions(); i++) {
// fft->execFFT(*pmeGrid2, *pmeGrid, false); int particle1, particle2;
// pmeInterpolateForceKernel.setArg<mm_float4>(3, boxSize); double chargeProd, sigma, epsilon;
// pmeInterpolateForceKernel.setArg<mm_float4>(4, invBoxSize); force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
// cu.executeKernel(pmeInterpolateForceKernel, cu.getNumAtoms(), interpolateForceThreads); if (exceptionAtoms.size() > exceptions.size() && make_pair(particle1, particle2) == exceptionAtoms[exceptions.size()])
// } exceptions.push_back(i);
// double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0); else if (chargeProd != 0.0 || epsilon != 0.0)
// if (dispersionCoefficient != 0.0 && includeDirect) { throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
// mm_float4 boxSize = cu.getPeriodicBoxSize(); }
// energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z); int numContexts = cu.getPlatformData().contexts.size();
// } int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
// return energy; int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
//} int numExceptions = endIndex-startIndex;
//
//void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) { // Record the per-particle parameters.
// // Make sure the new parameters are acceptable.
// CudaArray& posq = cu.getPosq();
// cuCtxSetCurrent(cu.getContext()); posq.download(cu.getPinnedBuffer());
// if (force.getNumParticles() != cu.getNumAtoms()) float4* posqf = (float4*) cu.getPinnedBuffer();
// throw OpenMMException("updateParametersInContext: The number of particles has changed"); double4* posqd = (double4*) cu.getPinnedBuffer();
// if (!hasCoulomb || !hasLJ) { vector<float2> sigmaEpsilonVector(force.getNumParticles());
// for (int i = 0; i < force.getNumParticles(); i++) { double sumSquaredCharges = 0.0;
// double charge, sigma, epsilon; const vector<int>& order = cu.getAtomIndex();
// force.getParticleParameters(i, charge, sigma, epsilon); for (int i = 0; i < force.getNumParticles(); i++) {
// if (!hasCoulomb && charge != 0.0) int index = order[i];
// throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Coulomb interactions, because all charges were originally 0"); double charge, sigma, epsilon;
// if (!hasLJ && epsilon != 0.0) force.getParticleParameters(index, charge, sigma, epsilon);
// throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0"); if (cu.getUseDoublePrecision())
// } posqd[i].w = charge;
// } else
// vector<int> exceptions; posqf[i].w = (float) charge;
// for (int i = 0; i < force.getNumExceptions(); i++) { sigmaEpsilonVector[index] = make_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon)));
// int particle1, particle2; sumSquaredCharges += charge*charge;
// double chargeProd, sigma, epsilon; }
// force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon); posq.upload(cu.getPinnedBuffer());
// if (exceptionAtoms.size() > exceptions.size() && make_pair(particle1, particle2) == exceptionAtoms[exceptions.size()]) sigmaEpsilon->upload(sigmaEpsilonVector);
// exceptions.push_back(i);
// else if (chargeProd != 0.0 || epsilon != 0.0) // Record the exceptions.
// throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
// } if (numExceptions > 0) {
// int numContexts = cu.getPlatformData().contexts.size(); vector<vector<int> > atoms(numExceptions, vector<int>(2));
// int startIndex = cu.getContextIndex()*exceptions.size()/numContexts; vector<float4> exceptionParamsVector(numExceptions);
// int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts; for (int i = 0; i < numExceptions; i++) {
// int numExceptions = endIndex-startIndex; double chargeProd, sigma, epsilon;
// force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
// // Record the per-particle parameters. exceptionParamsVector[i] = make_float4((float) (ONE_4PI_EPS0*chargeProd), (float) sigma, (float) (4.0*epsilon), 0.0f);
// }
// CudaArray<mm_float4>& posq = cu.getPosq(); exceptionParams->upload(exceptionParamsVector);
// posq.download(); }
// vector<mm_float2> sigmaEpsilonVector(force.getNumParticles());
// double sumSquaredCharges = 0.0; // Compute other values.
// CudaArray<cl_int>& order = cu.getAtomIndex();
// for (int i = 0; i < force.getNumParticles(); i++) { NonbondedForce::NonbondedMethod method = force.getNonbondedMethod();
// int index = order[i]; if (method == NonbondedForce::Ewald || method == NonbondedForce::PME)
// double charge, sigma, epsilon; ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
// force.getParticleParameters(index, charge, sigma, epsilon); if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (method == NonbondedForce::CutoffPeriodic || method == NonbondedForce::Ewald || method == NonbondedForce::PME))
// posq[i].w = (float) charge; dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
// sigmaEpsilonVector[index] = mm_float2((float) (0.5*sigma), (float) (2.0*sqrt(epsilon))); cu.invalidateMolecules();
// sumSquaredCharges += charge*charge; }
// }
// posq.upload();
// sigmaEpsilon->upload(sigmaEpsilonVector);
//
// // Record the exceptions.
//
// if (numExceptions > 0) {
// vector<vector<int> > atoms(numExceptions, vector<int>(2));
// vector<mm_float4> exceptionParamsVector(numExceptions);
// for (int i = 0; i < numExceptions; i++) {
// double chargeProd, sigma, epsilon;
// force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
// exceptionParamsVector[i] = mm_float4((float) (ONE_4PI_EPS0*chargeProd), (float) sigma, (float) (4.0*epsilon), 0.0f);
// }
// exceptionParams->upload(exceptionParamsVector);
// }
//
// // Compute other values.
//
// NonbondedForce::NonbondedMethod method = force.getNonbondedMethod();
// if (method == NonbondedForce::Ewald || method == NonbondedForce::PME)
// ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0);
// if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (method == NonbondedForce::CutoffPeriodic || method == NonbondedForce::Ewald || method == NonbondedForce::PME))
// dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
// cu.invalidateMolecules();
//}
//
//class CudaCustomNonbondedForceInfo : public CudaForceInfo { //class CudaCustomNonbondedForceInfo : public CudaForceInfo {
//public: //public:
// CudaCustomNonbondedForceInfo(int requiredBuffers, const CustomNonbondedForce& force) : CudaForceInfo(requiredBuffers), force(force) { // CudaCustomNonbondedForceInfo(int requiredBuffers, const CustomNonbondedForce& force) : CudaForceInfo(requiredBuffers), force(force) {
...@@ -1716,6 +1702,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -1716,6 +1702,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
//}; //};
// //
//CudaCalcCustomNonbondedForceKernel::~CudaCalcCustomNonbondedForceKernel() { //CudaCalcCustomNonbondedForceKernel::~CudaCalcCustomNonbondedForceKernel() {
// cuCtxSetCurrent(cu.getContext());
// if (params != NULL) // if (params != NULL)
// delete params; // delete params;
// if (globals != NULL) // if (globals != NULL)
...@@ -1727,6 +1714,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -1727,6 +1714,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
//} //}
// //
//void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) { //void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
// cuCtxSetCurrent(cu.getContext());
// int forceIndex; // int forceIndex;
// for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex) // for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
// ; // ;
...@@ -1887,6 +1875,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -1887,6 +1875,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
//}; //};
// //
//CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel() { //CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel() {
// cuCtxSetCurrent(cu.getContext());
// if (params != NULL) // if (params != NULL)
// delete params; // delete params;
// if (bornSum != NULL) // if (bornSum != NULL)
...@@ -1904,6 +1893,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -1904,6 +1893,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
//} //}
// //
//void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) { //void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
// cuCtxSetCurrent(cu.getContext());
// if (cu.getPlatformData().contexts.size() > 1) // if (cu.getPlatformData().contexts.size() > 1)
// throw OpenMMException("GBSAOBCForce does not support using multiple CUDA devices"); // throw OpenMMException("GBSAOBCForce does not support using multiple CUDA devices");
// CudaNonbondedUtilities& nb = cu.getNonbondedUtilities(); // CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
...@@ -1977,10 +1967,10 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -1977,10 +1967,10 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
// file = CudaKernelSources::gbsaObc_nvidia; // file = CudaKernelSources::gbsaObc_nvidia;
// else // else
// file = CudaKernelSources::gbsaObc_default; // file = CudaKernelSources::gbsaObc_default;
// cu::Program program = cu.createProgram(file, defines); // CUmodule module = cu.createModule(file, defines);
// bool useLong = (cu.getSupports64BitGlobalAtomics() && !deviceIsCpu); // bool useLong = (cu.getSupports64BitGlobalAtomics() && !deviceIsCpu);
// int index = 0; // int index = 0;
// computeBornSumKernel = cu::Kernel(program, "computeBornSum"); // computeBornSumKernel = cu.getKernel(module, "computeBornSum");
// computeBornSumKernel.setArg<cu::Buffer>(index++, (useLong ? longBornSum->getDevicePointer() : bornSum->getDevicePointer())); // computeBornSumKernel.setArg<cu::Buffer>(index++, (useLong ? longBornSum->getDevicePointer() : bornSum->getDevicePointer()));
// computeBornSumKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer()); // computeBornSumKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
// computeBornSumKernel.setArg<cu::Buffer>(index++, params->getDevicePointer()); // computeBornSumKernel.setArg<cu::Buffer>(index++, params->getDevicePointer());
...@@ -1998,7 +1988,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -1998,7 +1988,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getExclusionIndices().getDevicePointer()); // computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getExclusionIndices().getDevicePointer());
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getExclusionRowIndices().getDevicePointer()); // computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getExclusionRowIndices().getDevicePointer());
// } // }
// force1Kernel = cu::Kernel(program, "computeGBSAForce1"); // force1Kernel = cu.getKernel(module, "computeGBSAForce1");
// index = 0; // index = 0;
// force1Kernel.setArg<cu::Buffer>(index++, (useLong ? cu.getLongForceBuffer().getDevicePointer() : cu.getForceBuffers().getDevicePointer())); // force1Kernel.setArg<cu::Buffer>(index++, (useLong ? cu.getLongForceBuffer().getDevicePointer() : cu.getForceBuffers().getDevicePointer()));
// force1Kernel.setArg<cu::Buffer>(index++, (useLong ? longBornForce->getDevicePointer() : bornForce->getDevicePointer())); // force1Kernel.setArg<cu::Buffer>(index++, (useLong ? longBornForce->getDevicePointer() : bornForce->getDevicePointer()));
...@@ -2019,8 +2009,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -2019,8 +2009,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
// force1Kernel.setArg<cu::Buffer>(index++, nb.getExclusionIndices().getDevicePointer()); // force1Kernel.setArg<cu::Buffer>(index++, nb.getExclusionIndices().getDevicePointer());
// force1Kernel.setArg<cu::Buffer>(index++, nb.getExclusionRowIndices().getDevicePointer()); // force1Kernel.setArg<cu::Buffer>(index++, nb.getExclusionRowIndices().getDevicePointer());
// } // }
// program = cu.createProgram(CudaKernelSources::gbsaObcReductions, defines); // module = cu.createModule(CudaKernelSources::gbsaObcReductions, defines);
// reduceBornSumKernel = cu::Kernel(program, "reduceBornSum"); // reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
// reduceBornSumKernel.setArg<cl_int>(0, cu.getPaddedNumAtoms()); // reduceBornSumKernel.setArg<cl_int>(0, cu.getPaddedNumAtoms());
// reduceBornSumKernel.setArg<cl_int>(1, nb.getNumForceBuffers()); // reduceBornSumKernel.setArg<cl_int>(1, nb.getNumForceBuffers());
// reduceBornSumKernel.setArg<cl_float>(2, 1.0f); // reduceBornSumKernel.setArg<cl_float>(2, 1.0f);
...@@ -2030,7 +2020,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -2030,7 +2020,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
// reduceBornSumKernel.setArg<cu::Buffer>(6, params->getDevicePointer()); // reduceBornSumKernel.setArg<cu::Buffer>(6, params->getDevicePointer());
// reduceBornSumKernel.setArg<cu::Buffer>(7, bornRadii->getDevicePointer()); // reduceBornSumKernel.setArg<cu::Buffer>(7, bornRadii->getDevicePointer());
// reduceBornSumKernel.setArg<cu::Buffer>(8, obcChain->getDevicePointer()); // reduceBornSumKernel.setArg<cu::Buffer>(8, obcChain->getDevicePointer());
// reduceBornForceKernel = cu::Kernel(program, "reduceBornForce"); // reduceBornForceKernel = cu.getKernel(module, "reduceBornForce");
// index = 0; // index = 0;
// reduceBornForceKernel.setArg<cl_int>(index++, cu.getPaddedNumAtoms()); // reduceBornForceKernel.setArg<cl_int>(index++, cu.getPaddedNumAtoms());
// reduceBornForceKernel.setArg<cl_int>(index++, nb.getNumForceBuffers()); // reduceBornForceKernel.setArg<cl_int>(index++, nb.getNumForceBuffers());
...@@ -2127,6 +2117,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -2127,6 +2117,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
//}; //};
// //
//CudaCalcCustomGBForceKernel::~CudaCalcCustomGBForceKernel() { //CudaCalcCustomGBForceKernel::~CudaCalcCustomGBForceKernel() {
// cuCtxSetCurrent(cu.getContext());
// if (params != NULL) // if (params != NULL)
// delete params; // delete params;
// if (computedValues != NULL) // if (computedValues != NULL)
...@@ -2148,6 +2139,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -2148,6 +2139,7 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
//} //}
// //
//void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomGBForce& force) { //void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomGBForce& force) {
// cuCtxSetCurrent(cu.getContext());
// if (cu.getPlatformData().contexts.size() > 1) // if (cu.getPlatformData().contexts.size() > 1)
// throw OpenMMException("CustomGBForce does not support using multiple CUDA devices"); // throw OpenMMException("CustomGBForce does not support using multiple CUDA devices");
// bool useExclusionsForValue = false; // bool useExclusionsForValue = false;
...@@ -2360,8 +2352,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -2360,8 +2352,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
// file = CudaKernelSources::customGBValueN2_nvidia; // file = CudaKernelSources::customGBValueN2_nvidia;
// else // else
// file = CudaKernelSources::customGBValueN2_default; // file = CudaKernelSources::customGBValueN2_default;
// cu::Program program = cu.createProgram(cu.replaceStrings(file, replacements), defines); // CUmodule module = cu.createModule(cu.replaceStrings(file, replacements), defines);
// pairValueKernel = cu::Kernel(program, "computeN2Value"); // pairValueKernel = cu.getKernel(module, "computeN2Value");
// if (useExclusionsForValue) // if (useExclusionsForValue)
// cu.getNonbondedUtilities().requestExclusions(exclusionList); // cu.getNonbondedUtilities().requestExclusions(exclusionList);
// } // }
...@@ -2406,8 +2398,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -2406,8 +2398,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
// replacements["COMPUTE_VALUES"] = reductionSource.str(); // replacements["COMPUTE_VALUES"] = reductionSource.str();
// map<string, string> defines; // map<string, string> defines;
// defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); // defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
// cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customGBValuePerParticle, replacements), defines); // CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customGBValuePerParticle, replacements), defines);
// perParticleValueKernel = cu::Kernel(program, "computePerParticleValues"); // perParticleValueKernel = cu.getKernel(module, "computePerParticleValues");
// } // }
// { // {
// // Create the N2 energy kernel. // // Create the N2 energy kernel.
...@@ -2559,8 +2551,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -2559,8 +2551,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
// file = CudaKernelSources::customGBEnergyN2_nvidia; // file = CudaKernelSources::customGBEnergyN2_nvidia;
// else // else
// file = CudaKernelSources::customGBEnergyN2_default; // file = CudaKernelSources::customGBEnergyN2_default;
// cu::Program program = cu.createProgram(cu.replaceStrings(file, replacements), defines); // CUmodule module = cu.createModule(cu.replaceStrings(file, replacements), defines);
// pairEnergyKernel = cu::Kernel(program, "computeN2Energy"); // pairEnergyKernel = cu.getKernel(module, "computeN2Energy");
// } // }
// { // {
// // Create the kernel to reduce the derivatives and calculate per-particle energy terms. // // Create the kernel to reduce the derivatives and calculate per-particle energy terms.
...@@ -2654,8 +2646,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -2654,8 +2646,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
// map<string, string> defines; // map<string, string> defines;
// defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); // defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
// defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); // defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
// cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customGBEnergyPerParticle, replacements), defines); // CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customGBEnergyPerParticle, replacements), defines);
// perParticleEnergyKernel = cu::Kernel(program, "computePerParticleEnergy"); // perParticleEnergyKernel = cu.getKernel(module, "computePerParticleEnergy");
// } // }
// if (needParameterGradient) { // if (needParameterGradient) {
// // Create the kernel to compute chain rule terms for computed values that depend explicitly on particle coordinates. // // Create the kernel to compute chain rule terms for computed values that depend explicitly on particle coordinates.
...@@ -2719,8 +2711,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont ...@@ -2719,8 +2711,8 @@ void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& cont
// replacements["COMPUTE_FORCES"] = compute.str(); // replacements["COMPUTE_FORCES"] = compute.str();
// map<string, string> defines; // map<string, string> defines;
// defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); // defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
// cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customGBGradientChainRule, replacements), defines); // CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customGBGradientChainRule, replacements), defines);
// gradientChainRuleKernel = cu::Kernel(program, "computeGradientChainRuleTerms"); // gradientChainRuleKernel = cu.getKernel(module, "computeGradientChainRuleTerms");
// } // }
// { // {
// // Create the code to calculate chain rules terms as part of the default nonbonded kernel. // // Create the code to calculate chain rules terms as part of the default nonbonded kernel.
...@@ -3061,6 +3053,7 @@ private: ...@@ -3061,6 +3053,7 @@ private:
}; };
CudaCalcCustomExternalForceKernel::~CudaCalcCustomExternalForceKernel() { CudaCalcCustomExternalForceKernel::~CudaCalcCustomExternalForceKernel() {
cuCtxSetCurrent(cu.getContext());
if (params != NULL) if (params != NULL)
delete params; delete params;
if (globals != NULL) if (globals != NULL)
...@@ -3068,6 +3061,7 @@ CudaCalcCustomExternalForceKernel::~CudaCalcCustomExternalForceKernel() { ...@@ -3068,6 +3061,7 @@ CudaCalcCustomExternalForceKernel::~CudaCalcCustomExternalForceKernel() {
} }
void CudaCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) { void CudaCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
cuCtxSetCurrent(cu.getContext());
int numContexts = cu.getPlatformData().contexts.size(); int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumParticles()/numContexts; int startIndex = cu.getContextIndex()*force.getNumParticles()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumParticles()/numContexts; int endIndex = (cu.getContextIndex()+1)*force.getNumParticles()/numContexts;
...@@ -3127,7 +3121,7 @@ void CudaCalcCustomExternalForceKernel::initialize(const System& system, const C ...@@ -3127,7 +3121,7 @@ void CudaCalcCustomExternalForceKernel::initialize(const System& system, const C
} }
stringstream compute; stringstream compute;
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i]; CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType()); string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType());
compute<<buffer.getType()<<" particleParams"<<(i+1)<<" = "<<argName<<"[index];\n"; compute<<buffer.getType()<<" particleParams"<<(i+1)<<" = "<<argName<<"[index];\n";
} }
...@@ -3256,6 +3250,7 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con ...@@ -3256,6 +3250,7 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con
//}; //};
// //
//CudaCalcCustomHbondForceKernel::~CudaCalcCustomHbondForceKernel() { //CudaCalcCustomHbondForceKernel::~CudaCalcCustomHbondForceKernel() {
// cuCtxSetCurrent(cu.getContext());
// if (donorParams != NULL) // if (donorParams != NULL)
// delete donorParams; // delete donorParams;
// if (acceptorParams != NULL) // if (acceptorParams != NULL)
...@@ -3296,6 +3291,7 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con ...@@ -3296,6 +3291,7 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con
//void CudaCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) { //void CudaCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
// // Record the lists of donors and acceptors, and the parameters for each one. // // Record the lists of donors and acceptors, and the parameters for each one.
// //
// cuCtxSetCurrent(cu.getContext());
// int numContexts = cu.getPlatformData().contexts.size(); // int numContexts = cu.getPlatformData().contexts.size();
// int startIndex = cu.getContextIndex()*force.getNumDonors()/numContexts; // int startIndex = cu.getContextIndex()*force.getNumDonors()/numContexts;
// int endIndex = (cu.getContextIndex()+1)*force.getNumDonors()/numContexts; // int endIndex = (cu.getContextIndex()+1)*force.getNumDonors()/numContexts;
...@@ -3608,9 +3604,9 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con ...@@ -3608,9 +3604,9 @@ void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& con
// defines["USE_PERIODIC"] = "1"; // defines["USE_PERIODIC"] = "1";
// if (force.getNumExclusions() > 0) // if (force.getNumExclusions() > 0)
// defines["USE_EXCLUSIONS"] = "1"; // defines["USE_EXCLUSIONS"] = "1";
// cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customHbondForce, replacements), defines); // CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customHbondForce, replacements), defines);
// donorKernel = cu::Kernel(program, "computeDonorForces"); // donorKernel = cu.getKernel(module, "computeDonorForces");
// acceptorKernel = cu::Kernel(program, "computeAcceptorForces"); // acceptorKernel = cu.getKernel(module, "computeAcceptorForces");
//} //}
// //
//double CudaCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) { //double CudaCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
...@@ -3755,6 +3751,7 @@ private: ...@@ -3755,6 +3751,7 @@ private:
}; };
CudaCalcCustomCompoundBondForceKernel::~CudaCalcCustomCompoundBondForceKernel() { CudaCalcCustomCompoundBondForceKernel::~CudaCalcCustomCompoundBondForceKernel() {
cuCtxSetCurrent(cu.getContext());
if (params != NULL) if (params != NULL)
delete params; delete params;
if (globals != NULL) if (globals != NULL)
...@@ -3766,6 +3763,7 @@ CudaCalcCustomCompoundBondForceKernel::~CudaCalcCustomCompoundBondForceKernel() ...@@ -3766,6 +3763,7 @@ CudaCalcCustomCompoundBondForceKernel::~CudaCalcCustomCompoundBondForceKernel()
} }
void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) { void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) {
cuCtxSetCurrent(cu.getContext());
int numContexts = cu.getPlatformData().contexts.size(); int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts; int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts; int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
...@@ -3922,7 +3920,7 @@ void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, con ...@@ -3922,7 +3920,7 @@ void CudaCalcCustomCompoundBondForceKernel::initialize(const System& system, con
// Now evaluate the expressions. // Now evaluate the expressions.
for (int i = 0; i < (int) params->getBuffers().size(); i++) { for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i]; CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType()); string argName = cu.getBondedUtilities().addArgument(buffer.getMemory(), buffer.getType());
compute<<buffer.getType()<<" bondParams"<<(i+1)<<" = "<<argName<<"[index];\n"; compute<<buffer.getType()<<" bondParams"<<(i+1)<<" = "<<argName<<"[index];\n";
} }
...@@ -4051,6 +4049,7 @@ CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() { ...@@ -4051,6 +4049,7 @@ CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
} }
void CudaIntegrateVerletStepKernel::initialize(const System& system, const VerletIntegrator& integrator) { void CudaIntegrateVerletStepKernel::initialize(const System& system, const VerletIntegrator& integrator) {
cuCtxSetCurrent(cu.getContext());
cu.getPlatformData().initializeContexts(system); cu.getPlatformData().initializeContexts(system);
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
...@@ -4103,19 +4102,21 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4103,19 +4102,21 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
} }
//CudaIntegrateLangevinStepKernel::~CudaIntegrateLangevinStepKernel() { //CudaIntegrateLangevinStepKernel::~CudaIntegrateLangevinStepKernel() {
// cuCtxSetCurrent(cu.getContext());
// if (params != NULL) // if (params != NULL)
// delete params; // delete params;
//} //}
// //
//void CudaIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) { //void CudaIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
// cuCtxSetCurrent(cu.getContext());
// cu.getPlatformData().initializeContexts(system); // cu.getPlatformData().initializeContexts(system);
// cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); // cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
// map<string, string> defines; // map<string, string> defines;
// defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); // defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
// defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); // defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
// cu::Program program = cu.createProgram(CudaKernelSources::langevin, defines, ""); // CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
// kernel1 = cu::Kernel(program, "integrateLangevinPart1"); // kernel1 = cu.getKernel(module, "integrateLangevinPart1");
// kernel2 = cu::Kernel(program, "integrateLangevinPart2"); // kernel2 = cu.getKernel(module, "integrateLangevinPart2");
// params = new CudaArray<cl_float>(cu, 3, "langevinParams"); // params = new CudaArray<cl_float>(cu, 3, "langevinParams");
// prevStepSize = -1.0; // prevStepSize = -1.0;
//} //}
...@@ -4183,13 +4184,14 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4183,13 +4184,14 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
//} //}
// //
//void CudaIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) { //void CudaIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
// cuCtxSetCurrent(cu.getContext());
// cu.getPlatformData().initializeContexts(system); // cu.getPlatformData().initializeContexts(system);
// cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); // cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
// map<string, string> defines; // map<string, string> defines;
// defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); // defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
// cu::Program program = cu.createProgram(CudaKernelSources::brownian, defines, ""); // CUmodule module = cu.createModule(CudaKernelSources::brownian, defines, "");
// kernel1 = cu::Kernel(program, "integrateBrownianPart1"); // kernel1 = cu.getKernel(module, "integrateBrownianPart1");
// kernel2 = cu::Kernel(program, "integrateBrownianPart2"); // kernel2 = cu.getKernel(module, "integrateBrownianPart2");
// prevStepSize = -1.0; // prevStepSize = -1.0;
//} //}
// //
...@@ -4243,11 +4245,12 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4243,11 +4245,12 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
//} //}
// //
//void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, const VariableVerletIntegrator& integrator) { //void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, const VariableVerletIntegrator& integrator) {
// cuCtxSetCurrent(cu.getContext());
// cu.getPlatformData().initializeContexts(system); // cu.getPlatformData().initializeContexts(system);
// cu::Program program = cu.createProgram(CudaKernelSources::verlet, ""); // CUmodule module = cu.createModule(CudaKernelSources::verlet, "");
// kernel1 = cu::Kernel(program, "integrateVerletPart1"); // kernel1 = cu.getKernel(module, "integrateVerletPart1");
// kernel2 = cu::Kernel(program, "integrateVerletPart2"); // kernel2 = cu.getKernel(module, "integrateVerletPart2");
// selectSizeKernel = cu::Kernel(program, "selectVerletStepSize"); // selectSizeKernel = cu.getKernel(module, "selectVerletStepSize");
// blockSize = min(min(256, system.getNumParticles()), (int) cu.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>()); // blockSize = min(min(256, system.getNumParticles()), (int) cu.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
//} //}
// //
...@@ -4307,20 +4310,22 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4307,20 +4310,22 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
//} //}
// //
//CudaIntegrateVariableLangevinStepKernel::~CudaIntegrateVariableLangevinStepKernel() { //CudaIntegrateVariableLangevinStepKernel::~CudaIntegrateVariableLangevinStepKernel() {
// cuCtxSetCurrent(cu.getContext());
// if (params != NULL) // if (params != NULL)
// delete params; // delete params;
//} //}
// //
//void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) { //void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
// cuCtxSetCurrent(cu.getContext());
// cu.getPlatformData().initializeContexts(system); // cu.getPlatformData().initializeContexts(system);
// cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); // cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
// map<string, string> defines; // map<string, string> defines;
// defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); // defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
// defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); // defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
// cu::Program program = cu.createProgram(CudaKernelSources::langevin, defines, ""); // CUmodule module = cu.createModule(CudaKernelSources::langevin, defines, "");
// kernel1 = cu::Kernel(program, "integrateLangevinPart1"); // kernel1 = cu.getKernel(module, "integrateLangevinPart1");
// kernel2 = cu::Kernel(program, "integrateLangevinPart2"); // kernel2 = cu.getKernel(module, "integrateLangevinPart2");
// selectSizeKernel = cu::Kernel(program, "selectLangevinStepSize"); // selectSizeKernel = cu.getKernel(module, "selectLangevinStepSize");
// params = new CudaArray<cl_float>(cu, 3, "langevinParams"); // params = new CudaArray<cl_float>(cu, 3, "langevinParams");
// blockSize = min(256, system.getNumParticles()); // blockSize = min(256, system.getNumParticles());
// blockSize = max(blockSize, params->getSize()); // blockSize = max(blockSize, params->getSize());
...@@ -4428,6 +4433,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4428,6 +4433,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
//}; //};
// //
//CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() { //CudaIntegrateCustomStepKernel::~CudaIntegrateCustomStepKernel() {
// cuCtxSetCurrent(cu.getContext());
// if (globalValues != NULL) // if (globalValues != NULL)
// delete globalValues; // delete globalValues;
// if (contextParameterValues != NULL) // if (contextParameterValues != NULL)
...@@ -4445,6 +4451,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4445,6 +4451,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
//} //}
// //
//void CudaIntegrateCustomStepKernel::initialize(const System& system, const CustomIntegrator& integrator) { //void CudaIntegrateCustomStepKernel::initialize(const System& system, const CustomIntegrator& integrator) {
// cuCtxSetCurrent(cu.getContext());
// cu.getPlatformData().initializeContexts(system); // cu.getPlatformData().initializeContexts(system);
// cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed()); // cu.getIntegrationUtilities().initRandomNumberGenerator(integrator.getRandomNumberSeed());
// numGlobalVariables = integrator.getNumGlobalVariables(); // numGlobalVariables = integrator.getNumGlobalVariables();
...@@ -4565,8 +4572,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4565,8 +4572,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF; // seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
// } // }
// randomSeed->upload(seed); // randomSeed->upload(seed);
// cu::Program randomProgram = cu.createProgram(CudaKernelSources::customIntegrator, defines); // CUmodule randomProgram = cu.createModule(CudaKernelSources::customIntegrator, defines);
// randomKernel = cu::Kernel(randomProgram, "generateRandomNumbers"); // randomKernel = cu.getKernel(randomProgram, "generateRandomNumbers");
// randomKernel.setArg<cu::Buffer>(0, uniformRandoms->getDevicePointer()); // randomKernel.setArg<cu::Buffer>(0, uniformRandoms->getDevicePointer());
// randomKernel.setArg<cu::Buffer>(1, randomSeed->getDevicePointer()); // randomKernel.setArg<cu::Buffer>(1, randomSeed->getDevicePointer());
// //
...@@ -4721,8 +4728,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4721,8 +4728,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// defines["LOAD_POS_AS_DELTA"] = "1"; // defines["LOAD_POS_AS_DELTA"] = "1";
// else if (defines.find("LOAD_POS_AS_DELTA") != defines.end()) // else if (defines.find("LOAD_POS_AS_DELTA") != defines.end())
// defines.erase("LOAD_POS_AS_DELTA"); // defines.erase("LOAD_POS_AS_DELTA");
// cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customIntegratorPerDof, replacements), defines); // CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorPerDof, replacements), defines);
// cu::Kernel kernel = cu::Kernel(program, "computePerDof"); // cu::Kernel kernel = cu.getKernel(module, "computePerDof");
// kernels[step].push_back(kernel); // kernels[step].push_back(kernel);
// requiredGaussian[step] = numGaussian; // requiredGaussian[step] = numGaussian;
// requiredUniform[step] = numUniform; // requiredUniform[step] = numUniform;
...@@ -4744,8 +4751,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4744,8 +4751,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// if (stepType[step] == CustomIntegrator::ComputeSum) { // if (stepType[step] == CustomIntegrator::ComputeSum) {
// // Create a second kernel for this step that sums the values. // // Create a second kernel for this step that sums the values.
// //
// program = cu.createProgram(CudaKernelSources::customIntegrator, defines); // module = cu.createModule(CudaKernelSources::customIntegrator, defines);
// kernel = cu::Kernel(program, "computeSum"); // kernel = cu.getKernel(module, "computeSum");
// kernels[step].push_back(kernel); // kernels[step].push_back(kernel);
// index = 0; // index = 0;
// kernel.setArg<cu::Buffer>(index++, sumBuffer->getDevicePointer()); // kernel.setArg<cu::Buffer>(index++, sumBuffer->getDevicePointer());
...@@ -4776,8 +4783,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4776,8 +4783,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// compute << "{\n" << createGlobalComputation(variable[i], expression[i], integrator, energyName[i]) << "}\n"; // compute << "{\n" << createGlobalComputation(variable[i], expression[i], integrator, energyName[i]) << "}\n";
// map<string, string> replacements; // map<string, string> replacements;
// replacements["COMPUTE_STEP"] = compute.str(); // replacements["COMPUTE_STEP"] = compute.str();
// cu::Program program = cu.createProgram(cu.replaceStrings(CudaKernelSources::customIntegratorGlobal, replacements), defines); // CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::customIntegratorGlobal, replacements), defines);
// cu::Kernel kernel = cu::Kernel(program, "computeGlobal"); // cu::Kernel kernel = cu.getKernel(module, "computeGlobal");
// kernels[step].push_back(kernel); // kernels[step].push_back(kernel);
// int index = 0; // int index = 0;
// kernel.setArg<cu::Buffer>(index++, integration.getStepSize().getDevicePointer()); // kernel.setArg<cu::Buffer>(index++, integration.getStepSize().getDevicePointer());
...@@ -4789,8 +4796,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4789,8 +4796,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// else if (stepType[step] == CustomIntegrator::ConstrainPositions) { // else if (stepType[step] == CustomIntegrator::ConstrainPositions) {
// // Apply position constraints. // // Apply position constraints.
// //
// cu::Program program = cu.createProgram(CudaKernelSources::customIntegrator, defines); // CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines);
// cu::Kernel kernel = cu::Kernel(program, "applyPositionDeltas"); // cu::Kernel kernel = cu.getKernel(module, "applyPositionDeltas");
// kernels[step].push_back(kernel); // kernels[step].push_back(kernel);
// int index = 0; // int index = 0;
// kernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer()); // kernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
...@@ -4800,8 +4807,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4800,8 +4807,8 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
// //
// // Create the kernel for summing energy. // // Create the kernel for summing energy.
// //
// cu::Program program = cu.createProgram(CudaKernelSources::customIntegrator, defines); // CUmodule module = cu.createModule(CudaKernelSources::customIntegrator, defines);
// sumEnergyKernel = cu::Kernel(program, "computeSum"); // sumEnergyKernel = cu.getKernel(module, "computeSum");
// int index = 0; // int index = 0;
// sumEnergyKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer()); // sumEnergyKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
// sumEnergyKernel.setArg<cu::Buffer>(index++, energy->getDevicePointer()); // sumEnergyKernel.setArg<cu::Buffer>(index++, energy->getDevicePointer());
...@@ -4949,16 +4956,18 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4949,16 +4956,18 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
//} //}
// //
//CudaApplyAndersenThermostatKernel::~CudaApplyAndersenThermostatKernel() { //CudaApplyAndersenThermostatKernel::~CudaApplyAndersenThermostatKernel() {
// cuCtxSetCurrent(cu.getContext());
// if (atomGroups != NULL) // if (atomGroups != NULL)
// delete atomGroups; // delete atomGroups;
//} //}
// //
//void CudaApplyAndersenThermostatKernel::initialize(const System& system, const AndersenThermostat& thermostat) { //void CudaApplyAndersenThermostatKernel::initialize(const System& system, const AndersenThermostat& thermostat) {
// cuCtxSetCurrent(cu.getContext());
// randomSeed = thermostat.getRandomNumberSeed(); // randomSeed = thermostat.getRandomNumberSeed();
// map<string, string> defines; // map<string, string> defines;
// defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); // defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
// cu::Program program = cu.createProgram(CudaKernelSources::andersenThermostat, defines); // CUmodule module = cu.createModule(CudaKernelSources::andersenThermostat, defines);
// kernel = cu::Kernel(program, "applyAndersenThermostat"); // kernel = cu.getKernel(module, "applyAndersenThermostat");
// cu.getIntegrationUtilities().initRandomNumberGenerator(randomSeed); // cu.getIntegrationUtilities().initRandomNumberGenerator(randomSeed);
// //
// // Create the arrays with the group definitions. // // Create the arrays with the group definitions.
...@@ -4988,6 +4997,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4988,6 +4997,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
//} //}
// //
//CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() { //CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() {
// cuCtxSetCurrent(cu.getContext());
// if (savedPositions != NULL) // if (savedPositions != NULL)
// delete savedPositions; // delete savedPositions;
// if (moleculeAtoms != NULL) // if (moleculeAtoms != NULL)
...@@ -4997,9 +5007,10 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -4997,9 +5007,10 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
//} //}
// //
//void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const MonteCarloBarostat& thermostat) { //void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const MonteCarloBarostat& thermostat) {
// cuCtxSetCurrent(cu.getContext());
// savedPositions = new CudaArray<mm_float4>(cu, cu.getPaddedNumAtoms(), "savedPositions"); // savedPositions = new CudaArray<mm_float4>(cu, cu.getPaddedNumAtoms(), "savedPositions");
// cu::Program program = cu.createProgram(CudaKernelSources::monteCarloBarostat); // CUmodule module = cu.createModule(CudaKernelSources::monteCarloBarostat);
// kernel = cu::Kernel(program, "scalePositions"); // kernel = cu.getKernel(module, "scalePositions");
//} //}
// //
//void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, double scale) { //void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, double scale) {
...@@ -5045,6 +5056,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -5045,6 +5056,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
//} //}
void CudaCalcKineticEnergyKernel::initialize(const System& system) { void CudaCalcKineticEnergyKernel::initialize(const System& system) {
cuCtxSetCurrent(cu.getContext());
int numParticles = system.getNumParticles(); int numParticles = system.getNumParticles();
masses.resize(numParticles); masses.resize(numParticles);
for (int i = 0; i < numParticles; ++i) for (int i = 0; i < numParticles; ++i)
...@@ -5077,11 +5089,13 @@ double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) { ...@@ -5077,11 +5089,13 @@ double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) {
} }
//CudaRemoveCMMotionKernel::~CudaRemoveCMMotionKernel() { //CudaRemoveCMMotionKernel::~CudaRemoveCMMotionKernel() {
// cuCtxSetCurrent(cu.getContext());
// if (cmMomentum != NULL) // if (cmMomentum != NULL)
// delete cmMomentum; // delete cmMomentum;
//} //}
// //
//void CudaRemoveCMMotionKernel::initialize(const System& system, const CMMotionRemover& force) { //void CudaRemoveCMMotionKernel::initialize(const System& system, const CMMotionRemover& force) {
// cuCtxSetCurrent(cu.getContext());
// frequency = force.getFrequency(); // frequency = force.getFrequency();
// int numAtoms = cu.getNumAtoms(); // int numAtoms = cu.getNumAtoms();
// cmMomentum = new CudaArray<mm_float4>(cu, (numAtoms+CudaContext::ThreadBlockSize-1)/CudaContext::ThreadBlockSize, "cmMomentum"); // cmMomentum = new CudaArray<mm_float4>(cu, (numAtoms+CudaContext::ThreadBlockSize-1)/CudaContext::ThreadBlockSize, "cmMomentum");
...@@ -5090,13 +5104,13 @@ double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) { ...@@ -5090,13 +5104,13 @@ double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) {
// totalMass += system.getParticleMass(i); // totalMass += system.getParticleMass(i);
// map<string, string> defines; // map<string, string> defines;
// defines["INVERSE_TOTAL_MASS"] = cu.doubleToString(1.0/totalMass); // defines["INVERSE_TOTAL_MASS"] = cu.doubleToString(1.0/totalMass);
// cu::Program program = cu.createProgram(CudaKernelSources::removeCM, defines); // CUmodule module = cu.createModule(CudaKernelSources::removeCM, defines);
// kernel1 = cu::Kernel(program, "calcCenterOfMassMomentum"); // kernel1 = cu.getKernel(module, "calcCenterOfMassMomentum");
// kernel1.setArg<cl_int>(0, numAtoms); // kernel1.setArg<cl_int>(0, numAtoms);
// kernel1.setArg<cu::Buffer>(1, cu.getVelm().getDevicePointer()); // kernel1.setArg<cu::Buffer>(1, cu.getVelm().getDevicePointer());
// kernel1.setArg<cu::Buffer>(2, cmMomentum->getDevicePointer()); // kernel1.setArg<cu::Buffer>(2, cmMomentum->getDevicePointer());
// kernel1.setArg(3, CudaContext::ThreadBlockSize*sizeof(mm_float4), NULL); // kernel1.setArg(3, CudaContext::ThreadBlockSize*sizeof(mm_float4), NULL);
// kernel2 = cu::Kernel(program, "removeCenterOfMassMomentum"); // kernel2 = cu.getKernel(module, "removeCenterOfMassMomentum");
// kernel2.setArg<cl_int>(0, numAtoms); // kernel2.setArg<cl_int>(0, numAtoms);
// kernel2.setArg<cu::Buffer>(1, cu.getVelm().getDevicePointer()); // kernel2.setArg<cu::Buffer>(1, cu.getVelm().getDevicePointer());
// kernel2.setArg<cu::Buffer>(2, cmMomentum->getDevicePointer()); // kernel2.setArg<cu::Buffer>(2, cmMomentum->getDevicePointer());
......
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#include "CudaSort.h" #include "CudaSort.h"
#include "openmm/kernels.h" #include "openmm/kernels.h"
#include "openmm/System.h" #include "openmm/System.h"
#include <cufft.h>
namespace OpenMM { namespace OpenMM {
...@@ -542,87 +543,86 @@ private: ...@@ -542,87 +543,86 @@ private:
std::vector<float> globalParamValues; std::vector<float> globalParamValues;
}; };
///** /**
// * This kernel is invoked by NonbondedForce to calculate the forces acting on the system. * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
// */ */
//class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel { class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
//public: public:
// CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform), CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
// hasInitializedKernel(false), cu(cu), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL), cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
// pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
// pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) { pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
// } }
// ~CudaCalcNonbondedForceKernel(); ~CudaCalcNonbondedForceKernel();
// /** /**
// * Initialize the kernel. * Initialize the kernel.
// * *
// * @param system the System this kernel will be applied to * @param system the System this kernel will be applied to
// * @param force the NonbondedForce this kernel will be used for * @param force the NonbondedForce this kernel will be used for
// */ */
// void initialize(const System& system, const NonbondedForce& force); void initialize(const System& system, const NonbondedForce& force);
// /** /**
// * Execute the kernel to calculate the forces and/or energy. * Execute the kernel to calculate the forces and/or energy.
// * *
// * @param context the context in which to execute this kernel * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated * @param includeEnergy true if the energy should be calculated
// * @param includeDirect true if direct space interactions should be included * @param includeDirect true if direct space interactions should be included
// * @param includeReciprocal true if reciprocal space interactions should be included * @param includeReciprocal true if reciprocal space interactions should be included
// * @return the potential energy due to the force * @return the potential energy due to the force
// */ */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal); double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
// /** /**
// * Copy changed parameters over to a context. * Copy changed parameters over to a context.
// * *
// * @param context the context to copy parameters to * @param context the context to copy parameters to
// * @param force the NonbondedForce to copy the parameters from * @param force the NonbondedForce to copy the parameters from
// */ */
// void copyParametersToContext(ContextImpl& context, const NonbondedForce& force); void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
//private: private:
// struct SortTrait { class SortTrait : public CudaSort::SortTrait {
// typedef mm_int2 DataType; int getDataSize() const {return 8;}
// typedef cl_int KeyType; int getKeySize() const {return 4;}
// static const char* clDataType() {return "int2";} const char* getDataType() const {return "int2";}
// static const char* clKeyType() {return "int";} const char* getKeyType() const {return "int";}
// static const char* clMinKey() {return "INT_MIN";} const char* getMinKey() const {return "INT_MIN";}
// static const char* clMaxKey() {return "INT_MAX";} const char* getMaxKey() const {return "INT_MAX";}
// static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";} const char* getMaxValue() const {return "make_int2(INT_MAX, INT_MAX)";}
// static const char* clSortKey() {return "value.y";} const char* getSortKey() const {return "value.y";}
// }; };
// CudaContext& cu; CudaContext& cu;
// bool hasInitializedKernel; bool hasInitializedFFT;
// CudaArray<mm_float2>* sigmaEpsilon; CudaArray* sigmaEpsilon;
// CudaArray<mm_float4>* exceptionParams; CudaArray* exceptionParams;
// CudaArray<mm_float2>* cosSinSums; CudaArray* cosSinSums;
// CudaArray<mm_float2>* pmeGrid; CudaArray* pmeGrid;
// CudaArray<mm_float2>* pmeGrid2; CudaArray* pmeBsplineModuliX;
// CudaArray<cl_float>* pmeBsplineModuliX; CudaArray* pmeBsplineModuliY;
// CudaArray<cl_float>* pmeBsplineModuliY; CudaArray* pmeBsplineModuliZ;
// CudaArray<cl_float>* pmeBsplineModuliZ; CudaArray* pmeBsplineTheta;
// CudaArray<mm_float4>* pmeBsplineTheta; CudaArray* pmeBsplineDTheta;
// CudaArray<mm_float4>* pmeBsplineDTheta; CudaArray* pmeAtomRange;
// CudaArray<cl_int>* pmeAtomRange; CudaArray* pmeAtomGridIndex;
// CudaArray<mm_int2>* pmeAtomGridIndex; CudaSort* sort;
// CudaSort<SortTrait>* sort; cufftHandle fft;
// CudaFFT3D* fft; CUfunction ewaldSumsKernel;
// CUfunction ewaldSumsKernel; CUfunction ewaldForcesKernel;
// CUfunction ewaldForcesKernel; CUfunction pmeGridIndexKernel;
// CUfunction pmeGridIndexKernel; CUfunction pmeAtomRangeKernel;
// CUfunction pmeAtomRangeKernel; CUfunction pmeZIndexKernel;
// CUfunction pmeZIndexKernel; CUfunction pmeUpdateBsplinesKernel;
// CUfunction pmeUpdateBsplinesKernel; CUfunction pmeSpreadChargeKernel;
// CUfunction pmeSpreadChargeKernel; CUfunction pmeFinishSpreadChargeKernel;
// CUfunction pmeFinishSpreadChargeKernel; CUfunction pmeConvolutionKernel;
// CUfunction pmeConvolutionKernel; CUfunction pmeInterpolateForceKernel;
// CUfunction pmeInterpolateForceKernel; std::map<std::string, std::string> pmeDefines;
// std::map<std::string, std::string> pmeDefines; std::vector<std::pair<int, int> > exceptionAtoms;
// std::vector<std::pair<int, int> > exceptionAtoms; double ewaldSelfEnergy, dispersionCoefficient, alpha;
// double ewaldSelfEnergy, dispersionCoefficient, alpha; int interpolateForceThreads;
// int interpolateForceThreads; bool hasCoulomb, hasLJ;
// bool hasCoulomb, hasLJ; static const int PmeOrder = 5;
// static const int PmeOrder = 5; };
//};
//
///** ///**
// * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system. // * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
// */ // */
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/OpenMMException.h"
#include "CudaNonbondedUtilities.h"
#include "CudaArray.h"
#include "CudaKernelSources.h"
#include "CudaExpressionUtilities.h"
#include <map>
#include <set>
#include <utility>
using namespace OpenMM;
using namespace std;
// Evaluates a CUDA driver API call exactly once and throws an OpenMMException if the
// returned status is not CUDA_SUCCESS.  The expansion site must have a string named
// `errorMessage` and an object named `context` providing getErrorString() in scope.
// The do/while(0) wrapper makes the macro behave as a single statement, so it is safe
// inside unbraced if/else bodies; callers terminate it with a semicolon.
#define CHECK_RESULT(result) \
    do { \
        CUresult checkResultStatus = (result); \
        if (checkResultStatus != CUDA_SUCCESS) { \
            std::stringstream m; \
            m<<errorMessage<<": "<<context.getErrorString(checkResultStatus)<<" ("<<checkResultStatus<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
            throw OpenMMException(m.str());\
        } \
    } while (0)
/**
 * Create the utilities object for a context.  No device memory is allocated here; that
 * happens in initialize().  A cutoff of -1.0 is the sentinel meaning "no interaction has
 * been added yet".
 *
 * @param context    the CudaContext this object belongs to
 */
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false),
        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), pinnedInteractionCount(NULL), nonbondedForceGroup(0) {
    // usePeriodic is otherwise only assigned by addInteraction().  Give it a defined value
    // so it is never read uninitialized when no interaction is ever added.  It is assigned
    // in the body rather than the initializer list because the member declaration order
    // is not visible from this file.
    usePeriodic = false;

    // Decide how many thread blocks to use.

    string errorMessage = "Error initializing nonbonded utilities"; // consumed by CHECK_RESULT
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
    numForceThreadBlocks = 2*multiprocessors;
    forceThreadBlockSize = 256;
}
/**
 * Release every array owned by this object, plus the host buffer obtained from
 * cuMemHostAlloc() in initialize().
 */
CudaNonbondedUtilities::~CudaNonbondedUtilities() {
    // Applying delete to a null pointer does nothing, so the arrays need no explicit guards.
    delete exclusionIndices;
    delete exclusionRowIndices;
    delete exclusions;
    delete interactingTiles;
    delete interactionFlags;
    delete interactionCount;
    delete blockCenter;
    delete blockBoundingBox;

    // The host buffer only exists when a cutoff was in use, so it is still guarded.
    if (pinnedInteractionCount != NULL)
        cuMemFreeHost(pinnedInteractionCount);
}
/**
 * Register a nonbonded interaction to be evaluated by the default interaction kernel.
 * Every interaction after the first must agree with the earlier ones on cutoff usage,
 * periodicity, cutoff distance and force group; otherwise an OpenMMException is thrown.
 *
 * @param usesCutoff      whether a cutoff should be applied to this interaction
 * @param usesPeriodic    whether periodic boundary conditions should be applied
 * @param usesExclusions  whether this interaction uses exclusions
 * @param cutoffDistance  the cutoff distance for this interaction
 * @param exclusionList   for each atom, the atoms whose interactions with it are excluded
 * @param kernel          the code to evaluate the interaction
 * @param forceGroup      the force group in which the interaction is calculated
 */
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
    // A cutoff other than the -1.0 sentinel means an earlier interaction was already recorded.
    const bool alreadyConfigured = (cutoff != -1.0);
    if (alreadyConfigured) {
        if (usesCutoff != useCutoff)
            throw OpenMMException("All Forces must agree on whether to use a cutoff");
        if (usesPeriodic != usePeriodic)
            throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions");
        if (cutoffDistance != cutoff)
            throw OpenMMException("All Forces must use the same cutoff distance");
        if (forceGroup != nonbondedForceGroup)
            throw OpenMMException("All nonbonded forces must be in the same force group");
    }

    // This may throw, so it runs before any settings are modified.
    if (usesExclusions)
        requestExclusions(exclusionList);

    // Record the settings and append the interaction's source code.
    cutoff = cutoffDistance;
    useCutoff = usesCutoff;
    usePeriodic = usesPeriodic;
    nonbondedForceGroup = forceGroup;
    kernelSource += kernel;
    kernelSource += "\n";
}
/**
 * Append a per-atom parameter to the list handed to createInteractionKernel() when the
 * default force kernel is built.
 */
void CudaNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
    parameters.insert(parameters.end(), parameter);
}
/**
 * Append an extra (non per-atom) array to the list handed to createInteractionKernel()
 * when the default force kernel is built.
 */
void CudaNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
    arguments.insert(arguments.end(), parameter);
}
/**
 * Specify the exclusions an interaction depends on.  The first call records the list;
 * every later call must supply an identical list (same atoms, same per-atom entries in
 * the same order), otherwise an OpenMMException is thrown.
 *
 * @param exclusionList    for each atom, the atoms whose interactions with it are excluded
 */
void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclusionList) {
    if (!anyExclusions) {
        atomExclusions = exclusionList;
        anyExclusions = true;
        return;
    }

    // vector's comparison checks the sizes before the elements at every nesting level,
    // so a per-atom list of a different length can never be indexed past its end.
    if (exclusionList != atomExclusions)
        throw OpenMMException("All Forces must have identical exceptions");
}
void CudaNonbondedUtilities::initialize(const System& system) {
if (cutoff == -1.0)
return; // There are no nonbonded interactions in the System.
string errorMessage = "Error initializing nonbonded utilities";
if (atomExclusions.size() == 0) {
// No exclusions were specifically requested, so just mark every atom as not interacting with itself.
atomExclusions.resize(context.getNumAtoms());
for (int i = 0; i < (int) atomExclusions.size(); i++)
atomExclusions[i].push_back(i);
}
// Create the list of tiles.
numAtoms = context.getNumAtoms();
int numAtomBlocks = context.getNumAtomBlocks();
int totalTiles = numAtomBlocks*(numAtomBlocks+1)/2;
int numContexts = context.getPlatformData().contexts.size();
startTileIndex = context.getContextIndex()*totalTiles/numContexts;
int endTileIndex = (context.getContextIndex()+1)*totalTiles/numContexts;
numTiles = endTileIndex-startTileIndex;
// Build a list of indices for the tiles with exclusions.
set<pair<int, int> > tilesWithExclusions;
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/CudaContext::TileSize;
for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
int atom2 = atomExclusions[atom1][j];
int y = atom2/CudaContext::TileSize;
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
}
}
if (context.getPaddedNumAtoms() > context.getNumAtoms()) {
for (int i = 0; i < numAtomBlocks; ++i)
tilesWithExclusions.insert(make_pair(numAtomBlocks-1, i));
}
vector<unsigned int> exclusionRowIndicesVec(numAtomBlocks+1, 0);
vector<unsigned int> exclusionIndicesVec;
int currentRow = 0;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
while (iter->first != currentRow)
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndicesVec.push_back(iter->second);
}
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndices = CudaArray::create<unsigned int>(exclusionIndicesVec.size(), "exclusionIndices");
exclusionRowIndices = CudaArray::create<unsigned int>(exclusionRowIndicesVec.size(), "exclusionRowIndices");
exclusionIndices->upload(exclusionIndicesVec);
exclusionRowIndices->upload(exclusionRowIndicesVec);
// Record the exclusion data.
exclusions = CudaArray::create<unsigned int>(tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
vector<unsigned int> exclusionVec(exclusions->getSize());
for (int i = 0; i < exclusions->getSize(); ++i)
exclusionVec[i] = 0xFFFFFFFF;
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/CudaContext::TileSize;
int offset1 = atom1-x*CudaContext::TileSize;
for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
int atom2 = atomExclusions[atom1][j];
int y = atom2/CudaContext::TileSize;
int offset2 = atom2-y*CudaContext::TileSize;
if (x > y) {
int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
}
else {
int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
}
}
}
// Mark all interactions that involve a padding atom as being excluded.
for (int atom1 = context.getNumAtoms(); atom1 < context.getPaddedNumAtoms(); ++atom1) {
int x = atom1/CudaContext::TileSize;
int offset1 = atom1-x*CudaContext::TileSize;
for (int atom2 = 0; atom2 < context.getPaddedNumAtoms(); ++atom2) {
int y = atom2/CudaContext::TileSize;
int offset2 = atom2-y*CudaContext::TileSize;
if (x >= y) {
int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
}
if (y >= x) {
int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
}
}
}
atomExclusions.clear(); // We won't use this again, so free the memory it used
exclusions->upload(exclusionVec);
// Create data structures for the neighbor list.
if (useCutoff) {
// Select a size for the arrays that hold the neighbor list. This estimate is intentionally very
// high, because if it ever is too small, we have to fall back to the N^2 algorithm.
double4 boxSize = context.getPeriodicBoxSize();
maxTiles = (int) (numTiles*(cutoff/boxSize.x+cutoff/boxSize.y+cutoff/boxSize.z));
if (maxTiles > numTiles)
maxTiles = numTiles;
if (maxTiles < 1)
maxTiles = 1;
interactingTiles = CudaArray::create<ushort2>(maxTiles, "interactingTiles");
interactionFlags = CudaArray::create<unsigned int>(maxTiles, "interactionFlags");
interactionCount = CudaArray::create<unsigned int>(1, "interactionCount");
blockCenter = CudaArray::create<float4>(numAtomBlocks, "blockCenter");
blockBoundingBox = CudaArray::create<float4>(numAtomBlocks, "blockBoundingBox");
CHECK_RESULT(cuMemHostAlloc((void**) &pinnedInteractionCount, sizeof(unsigned int), 0));
pinnedInteractionCount[0] = 0;
interactionCount->upload(pinnedInteractionCount);
}
// Create kernels.
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
if (useCutoff) {
map<string, string> defines;
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
findBlockBoundsArgs.push_back(&numAtoms);
findBlockBoundsArgs.push_back(context.getPeriodicBoxSizePointer());
findBlockBoundsArgs.push_back(context.getInvPeriodicBoxSizePointer());
findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
findBlockBoundsArgs.push_back(&blockCenter->getDevicePointer());
findBlockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
findBlockBoundsArgs.push_back(&interactionCount->getDevicePointer());
findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(&blockCenter->getDevicePointer());
findInteractingBlocksArgs.push_back(&blockBoundingBox->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactionFlags->getDevicePointer());
findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
findInteractingBlocksArgs.push_back(&maxTiles);
findInteractingBlocksArgs.push_back(&startTileIndex);
findInteractingBlocksArgs.push_back(&numTiles);
findInteractionsWithinBlocksKernel = context.getKernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
findInteractionsWithinBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
findInteractionsWithinBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
findInteractionsWithinBlocksArgs.push_back(&context.getPosq().getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&interactingTiles->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&blockCenter->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&blockBoundingBox->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&interactionFlags->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&interactionCount->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&maxTiles);
}
}
/**
 * Locate the exclusion data for tile (x, y) in the row-start table built by initialize().
 *
 * @param x                    the row (block index) of the tile
 * @param y                    the column (block index) of the tile
 * @param exclusionIndices     column indices of all tiles that have exclusions, grouped by row
 * @param exclusionRowIndices  for each row, the offset in exclusionIndices where that row starts
 * @return the offset of the tile's first flag word in the exclusions array
 * @throws OpenMMException if the tile is not listed in row x
 */
int CudaNonbondedUtilities::findExclusionIndex(int x, int y, const vector<unsigned int>& exclusionIndices, const vector<unsigned int>& exclusionRowIndices) {
    const int rowEnd = exclusionRowIndices[x+1];
    int slot = exclusionRowIndices[x];
    while (slot < rowEnd) {
        if (exclusionIndices[slot] == y)
            return slot*CudaContext::TileSize; // each listed tile owns TileSize consecutive words
        ++slot;
    }
    throw OpenMMException("Internal error: exclusion in unexpected tile");
}
/**
 * Rebuild the neighbor list by running the three kernels created in initialize().
 * Does nothing when no cutoff is in use.
 *
 * @throws OpenMMException if periodic boundaries are used and any box dimension is
 *         smaller than 1.999999 times the cutoff
 */
void CudaNonbondedUtilities::prepareInteractions() {
    if (!useCutoff)
        return;
    if (usePeriodic) {
        const double minAllowedSize = 1.999999*cutoff;
        double4 box = context.getPeriodicBoxSize();
        const bool boxTooSmall = (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize);
        if (boxTooSmall)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
    }

    // Compute the neighbor list: block bounds first, then interacting block pairs, then
    // the interactions within those pairs.

    const int workSize = context.getNumAtoms();
    context.executeKernel(findBlockBoundsKernel, &findBlockBoundsArgs[0], workSize);
    context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], workSize);
    context.executeKernel(findInteractionsWithinBlocksKernel, &findInteractionsWithinBlocksArgs[0], workSize, 128);
}
/**
 * Launch the default force kernel.  Does nothing if no interaction was ever added
 * (cutoff still holds the -1.0 sentinel).
 */
void CudaNonbondedUtilities::computeInteractions() {
    if (cutoff == -1.0)
        return;
    context.executeKernel(forceKernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
/**
 * Check whether the most recent neighbor list build found more interacting tiles than the
 * arrays can hold and, if so, reallocate them with 20% headroom (capped at the total
 * number of tiles) and repoint the affected kernel arguments.  Does nothing when no
 * cutoff is in use.
 */
void CudaNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
        return;
    interactionCount->download(pinnedInteractionCount);
    if (pinnedInteractionCount[0] <= (unsigned int) maxTiles)
        return;

    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.

    maxTiles = (int) (1.2*pinnedInteractionCount[0]);
    // Named differently from the numTiles member (this context's share of the tiles)
    // to avoid shadowing it.
    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
    if (maxTiles > totalTiles)
        maxTiles = totalTiles;

    // Null each pointer between delete and re-creation so that, if the allocation fails
    // by throwing, the destructor does not delete the old array a second time.
    delete interactingTiles;
    interactingTiles = NULL;
    interactingTiles = CudaArray::create<ushort2>(maxTiles, "interactingTiles");
    delete interactionFlags;
    interactionFlags = NULL;
    interactionFlags = CudaArray::create<unsigned int>(maxTiles, "interactionFlags");

    // The indices below match the push_back order used in createInteractionKernel()
    // (force kernel, cutoff case) and in initialize() (neighbor list kernels).
    forceArgs[8] = &interactingTiles->getDevicePointer();
    forceArgs[13] = &interactionFlags->getDevicePointer();
    findInteractingBlocksArgs[5] = &interactingTiles->getDevicePointer();
    findInteractingBlocksArgs[6] = &interactionFlags->getDevicePointer();
    findInteractionsWithinBlocksArgs[3] = &interactingTiles->getDevicePointer();
    findInteractionsWithinBlocksArgs[6] = &interactionFlags->getDevicePointer();
}
void CudaNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
this->startTileIndex = startTileIndex;
this->numTiles = numTiles;
}
/**
 * Build a "computeNonbonded" kernel by substituting generated code fragments into the
 * nonbonded kernel template, and record the kernel's arguments in forceArgs.
 *
 * @param source         the code that evaluates a single interaction
 * @param params         the per-atom parameters the interaction depends on
 * @param arguments      additional arrays passed to the kernel
 * @param useExclusions  whether to define USE_EXCLUSIONS for the kernel
 * @param isSymmetric    whether to define USE_SYMMETRIC for the kernel
 * @return the compiled kernel
 *
 * NOTE(review): the arguments are appended to the forceArgs member, so calling this more
 * than once would keep growing that list - confirm it is only called from initialize().
 */
CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
    map<string, string> replacements;
    replacements["COMPUTE_INTERACTION"] = source;
    const string suffixes[] = {"x", "y", "z", "w"};

    // Fields of the per-atom local data structure.  Multi-component parameters are
    // flattened into one scalar field per component.
    stringstream localData;
    int localDataSize = 0;
    for (int i = 0; i < (int) params.size(); i++) {
        if (params[i].getNumComponents() == 1)
            localData<<params[i].getType()<<" "<<params[i].getName()<<";\n";
        else {
            for (int j = 0; j < params[i].getNumComponents(); ++j)
                localData<<params[i].getComponentType()<<" "<<params[i].getName()<<"_"<<suffixes[j]<<";\n";
        }
        localDataSize += params[i].getSize();
    }
    replacements["ATOM_PARAMETER_DATA"] = localData.str();

    // Extra kernel arguments: one array per parameter (prefixed "global_"), then one per
    // additional argument.
    stringstream args;
    for (int i = 0; i < (int) params.size(); i++) {
        args << ", const ";
        args << params[i].getType();
        args << "* __restrict__ global_";
        args << params[i].getName();
    }
    for (int i = 0; i < (int) arguments.size(); i++) {
        args << ", const ";
        args << arguments[i].getType();
        args << "* __restrict__ ";
        args << arguments[i].getName();
    }
    replacements["PARAMETER_ARGUMENTS"] = args.str();

    // Code that fills the local data from atom 1's already-loaded parameters.
    stringstream loadLocal1;
    for (int i = 0; i < (int) params.size(); i++) {
        if (params[i].getNumComponents() == 1) {
            loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<" = "<<params[i].getName()<<"1;\n";
        }
        else {
            for (int j = 0; j < params[i].getNumComponents(); ++j)
                loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = "<<params[i].getName()<<"1."<<suffixes[j]<<";\n";
        }
    }
    replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();

    // Code that fills the local data from the global parameter arrays.
    stringstream loadLocal2;
    for (int i = 0; i < (int) params.size(); i++) {
        if (params[i].getNumComponents() == 1) {
            loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
        }
        else {
            loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
            for (int j = 0; j < params[i].getNumComponents(); ++j)
                loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
        }
    }
    replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();

    // Code that loads atom 1's parameters from the global arrays.
    stringstream load1;
    for (int i = 0; i < (int) params.size(); i++) {
        load1 << params[i].getType();
        load1 << " ";
        load1 << params[i].getName();
        load1 << "1 = global_";
        load1 << params[i].getName();
        load1 << "[atom1];\n";
    }
    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();

    // Code that reassembles atom 2's parameters from the local data.
    stringstream load2j;
    for (int i = 0; i < (int) params.size(); i++) {
        if (params[i].getNumComponents() == 1) {
            load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = localData[atom2]."<<params[i].getName()<<";\n";
        }
        else {
            load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = make_"<<params[i].getType()<<"(";
            for (int j = 0; j < params[i].getNumComponents(); ++j) {
                if (j > 0)
                    load2j<<", ";
                load2j<<"localData[atom2]."<<params[i].getName()<<"_"<<suffixes[j];
            }
            load2j<<");\n";
        }
    }
    replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();

    // Preprocessor definitions controlling which code paths are compiled.
    map<string, string> defines;
    if (useCutoff)
        defines["USE_CUTOFF"] = "1";
    if (usePeriodic)
        defines["USE_PERIODIC"] = "1";
    if (useExclusions)
        defines["USE_EXCLUSIONS"] = "1";
    if (isSymmetric)
        defines["USE_SYMMETRIC"] = "1";
    defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
    defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
    CUmodule program = context.createModule(context.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::nonbonded, replacements), defines);
    CUfunction kernel = context.getKernel(program, "computeNonbonded");

    // Set arguments to the Kernel.  updateNeighborListSize() overwrites entries 8 and 13
    // by position, so the order of the push_back calls must not change.

    forceArgs.push_back(&context.getForce().getDevicePointer());
    forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
    forceArgs.push_back(&context.getPosq().getDevicePointer());
    forceArgs.push_back(&exclusions->getDevicePointer());
    forceArgs.push_back(&exclusionIndices->getDevicePointer());
    forceArgs.push_back(&exclusionRowIndices->getDevicePointer());
    forceArgs.push_back(&startTileIndex);
    forceArgs.push_back(&numTiles);
    if (useCutoff) {
        forceArgs.push_back(&interactingTiles->getDevicePointer());
        forceArgs.push_back(&interactionCount->getDevicePointer());
        forceArgs.push_back(context.getPeriodicBoxSizePointer());
        forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
        forceArgs.push_back(&maxTiles);
        forceArgs.push_back(&interactionFlags->getDevicePointer());
    }
    for (int i = 0; i < (int) params.size(); i++)
        forceArgs.push_back(&params[i].getMemory());
    for (int i = 0; i < (int) arguments.size(); i++)
        forceArgs.push_back(&arguments[i].getMemory());
    return kernel;
}
...@@ -38,7 +38,7 @@ namespace OpenMM { ...@@ -38,7 +38,7 @@ namespace OpenMM {
/** /**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two * This class provides a generic interface for calculating nonbonded interactions. It does this in two
* ways. First, it can be used to create Kernels that evaluate nonbonded interactions. Clients * ways. First, it can be used to create kernels that evaluate nonbonded interactions. Clients
* only need to provide the code for evaluating a single interaction and the list of parameters it depends on. * only need to provide the code for evaluating a single interaction and the list of parameters it depends on.
* A complete kernel is then synthesized using an appropriate algorithm to evaluate all interactions on all * A complete kernel is then synthesized using an appropriate algorithm to evaluate all interactions on all
* atoms. * atoms.
...@@ -64,209 +64,199 @@ namespace OpenMM { ...@@ -64,209 +64,199 @@ namespace OpenMM {
class OPENMM_EXPORT CudaNonbondedUtilities { class OPENMM_EXPORT CudaNonbondedUtilities {
public: public:
class ParameterInfo; class ParameterInfo;
// CudaNonbondedUtilities(CudaContext& context); CudaNonbondedUtilities(CudaContext& context);
// ~CudaNonbondedUtilities(); ~CudaNonbondedUtilities();
// /** /**
// * Add a nonbonded interaction to be evaluated by the default interaction kernel. * Add a nonbonded interaction to be evaluated by the default interaction kernel.
// * *
// * @param usesCutoff specifies whether a cutoff should be applied to this interaction * @param usesCutoff specifies whether a cutoff should be applied to this interaction
// * @param usesPeriodic specifies whether periodic boundary conditions should be applied to this interaction * @param usesPeriodic specifies whether periodic boundary conditions should be applied to this interaction
// * @param usesExclusions specifies whether this interaction uses exclusions. If this is true, it must have identical exclusions to every other interaction. * @param usesExclusions specifies whether this interaction uses exclusions. If this is true, it must have identical exclusions to every other interaction.
// * @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false) * @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
// * @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded * @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
// * @param kernel the code to evaluate the interaction * @param kernel the code to evaluate the interaction
// * @param forceGroup the force group in which the interaction should be calculated * @param forceGroup the force group in which the interaction should be calculated
// */ */
// void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup); void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
// /** /**
// * Add a per-atom parameter that the default interaction kernel may depend on. * Add a per-atom parameter that the default interaction kernel may depend on.
// */ */
// void addParameter(const ParameterInfo& parameter); void addParameter(const ParameterInfo& parameter);
// /** /**
// * Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel. * Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel.
// */ */
// void addArgument(const ParameterInfo& parameter); void addArgument(const ParameterInfo& parameter);
// /** /**
// * Specify the list of exclusions that an interaction outside the default kernel will depend on. * Specify the list of exclusions that an interaction outside the default kernel will depend on.
// * *
// * @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded * @param exclusionList for each atom, specifies the list of other atoms whose interactions should be excluded
// */ */
// void requestExclusions(const std::vector<std::vector<int> >& exclusionList); void requestExclusions(const std::vector<std::vector<int> >& exclusionList);
// /** /**
// * Initialize this object in preparation for a simulation. * Initialize this object in preparation for a simulation.
// */ */
// void initialize(const System& system); void initialize(const System& system);
// /** /**
// * Get the number of force buffers required for nonbonded forces. * Get the number of energy buffers required for nonbonded forces.
// */ */
// int getNumForceBuffers() { int getNumEnergyBuffers() {
// return numForceBuffers; return numForceThreadBlocks*forceThreadBlockSize;
// } }
// /** /**
// * Get the number of energy buffers required for nonbonded forces. * Get whether a cutoff is being used.
// */ */
// int getNumEnergyBuffers() { bool getUseCutoff() {
// return numForceThreadBlocks*forceThreadBlockSize; return useCutoff;
// } }
// /** /**
// * Get whether a cutoff is being used. * Get whether periodic boundary conditions are being used.
// */ */
// bool getUseCutoff() { bool getUsePeriodic() {
// return useCutoff; return usePeriodic;
// } }
// /** /**
// * Get whether periodic boundary conditions are being used. * Get the number of work groups used for computing nonbonded forces.
// */ */
// bool getUsePeriodic() { int getNumForceThreadBlocks() {
// return usePeriodic; return numForceThreadBlocks;
// } }
// /** /**
// * Get whether there is one force buffer per atom block. * Get the size of each work group used for computing nonbonded forces.
// */ */
// bool getForceBufferPerAtomBlock() { int getForceThreadBlockSize() {
// return forceBufferPerAtomBlock; return forceThreadBlockSize;
// } }
// /** /**
// * Get the number of work groups used for computing nonbonded forces. * Get the cutoff distance.
// */ */
// int getNumForceThreadBlocks() { double getCutoffDistance() {
// return numForceThreadBlocks; return cutoff;
// } }
// /** /**
// * Get the size of each work group used for computing nonbonded forces. * Get whether any interactions have been added.
// */ */
// int getForceThreadBlockSize() { bool getHasInteractions() {
// return forceThreadBlockSize; return cutoff != -1.0;
// } }
// /** /**
// * Get the cutoff distance. * Get the force group in which nonbonded interactions should be computed.
// */ */
// double getCutoffDistance() { int getForceGroup() {
// return cutoff; return nonbondedForceGroup;
// } }
// /** /**
// * Get whether any interactions have been added. * Prepare to compute interactions. This updates the neighbor list.
// */ */
// bool getHasInteractions() { void prepareInteractions();
// return cutoff != -1.0; /**
// } * Compute the nonbonded interactions.
// /** */
// * Get the force group in which nonbonded interactions should be computed. void computeInteractions();
// */ /**
// int getForceGroup() { * Check to see if the neighbor list arrays are large enough, and make them bigger if necessary.
// return nonbondedForceGroup; */
// } void updateNeighborListSize();
// /** /**
// * Prepare to compute interactions. This updates the neighbor list. * Get the array containing the center of each atom block.
// */ */
// void prepareInteractions(); CudaArray& getBlockCenters() {
// /** return *blockCenter;
// * Compute the nonbonded interactions. }
// */ /**
// void computeInteractions(); * Get the array containing the dimensions of each atom block.
// /** */
// * Check to see if the neighbor list arrays are large enough, and make them bigger if necessary. CudaArray& getBlockBoundingBoxes() {
// */ return *blockBoundingBox;
// void updateNeighborListSize(); }
// /** /**
// * Get the array containing the center of each atom block. * Get the array whose first element contains the number of tiles with interactions.
// */ */
// CudaArray<mm_float4>& getBlockCenters() { CudaArray& getInteractionCount() {
// return *blockCenter; return *interactionCount;
// } }
// /** /**
// * Get the array containing the dimensions of each atom block. * Get the array containing tiles with interactions.
// */ */
// CudaArray<mm_float4>& getBlockBoundingBoxes() { CudaArray& getInteractingTiles() {
// return *blockBoundingBox; return *interactingTiles;
// } }
// /** /**
// * Get the array whose first element contains the number of tiles with interactions. * Get the array containing flags for tiles with interactions.
// */ */
// CudaArray<cl_uint>& getInteractionCount() { CudaArray& getInteractionFlags() {
// return *interactionCount; return *interactionFlags;
// } }
// /** /**
// * Get the array containing tiles with interactions. * Get the array containing exclusion flags.
// */ */
// CudaArray<mm_ushort2>& getInteractingTiles() { CudaArray& getExclusions() {
// return *interactingTiles; return *exclusions;
// } }
// /** /**
// * Get the array containing flags for tiles with interactions. * Get the array containing the index into the exclusion array for each tile.
// */ */
// CudaArray<cl_uint>& getInteractionFlags() { CudaArray& getExclusionIndices() {
// return *interactionFlags; return *exclusionIndices;
// } }
// /** /**
// * Get the array containing exclusion flags. * Get the array listing where the exclusion data starts for each row.
// */ */
// CudaArray<cl_uint>& getExclusions() { CudaArray& getExclusionRowIndices() {
// return *exclusions; return *exclusionRowIndices;
// } }
// /** /**
// * Get the array containing the index into the exclusion array for each tile. * Get the index of the first tile this context is responsible for processing.
// */ */
// CudaArray<cl_uint>& getExclusionIndices() { int getStartTileIndex() const {
// return *exclusionIndices; return startTileIndex;
// } }
// /** /**
// * Get the array listing where the exclusion data starts for each row. * Get the total number of tiles this context is responsible for processing.
// */ */
// CudaArray<cl_uint>& getExclusionRowIndices() { int getNumTiles() const {
// return *exclusionRowIndices; return numTiles;
// } }
// /** /**
// * Get the index of the first tile this context is responsible for processing. * Set the range of tiles that should be processed by this context.
// */ */
// int getStartTileIndex() const { void setTileRange(int startTileIndex, int numTiles);
// return startTileIndex; /**
// } * Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
// /** * are assumed to be the same as those for the default interaction Kernel, since this kernel will use
// * Get the total number of tiles this context is responsible for processing. * the same neighbor list.
// */ *
// int getNumTiles() const { * @param source the source code for evaluating the force and energy
// return numTiles; * @param params the per-atom parameters this kernel may depend on
// } * @param arguments arrays (other than per-atom parameters) that should be passed as arguments to the kernel
// /** * @param useExclusions specifies whether exclusions are applied to this interaction
// * Set the range of tiles that should be processed by this context. * @param isSymmetric specifies whether the interaction is symmetric
// */ */
// void setTileRange(int startTileIndex, int numTiles); CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
// /**
// * Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
// * are assumed to be the same as those for the default interaction Kernel, since this kernel will use
// * the same neighbor list.
// *
// * @param source the source code for evaluating the force and energy
// * @param params the per-atom parameters this kernel may depend on
// * @param arguments arrays (other than per-atom parameters) that should be passed as arguments to the kernel
// * @param useExclusions specifies whether exclusions are applied to this interaction
// * @param isSymmetric specifies whether the interaction is symmetric
// */
// cl::Kernel createInteractionKernel(const std::string& source, const std::vector<ParameterInfo>& params, const std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const;
private: private:
// static int findExclusionIndex(int x, int y, const std::vector<cl_uint>& exclusionIndices, const std::vector<cl_uint>& exclusionRowIndices); static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
// CudaContext& context; CudaContext& context;
// cl::Kernel forceKernel; CUfunction forceKernel;
// cl::Kernel findBlockBoundsKernel; CUfunction findBlockBoundsKernel;
// cl::Kernel findInteractingBlocksKernel; CUfunction findInteractingBlocksKernel;
// cl::Kernel findInteractionsWithinBlocksKernel; CUfunction findInteractionsWithinBlocksKernel;
// CudaArray<cl_uint>* exclusions; CudaArray* exclusions;
// CudaArray<cl_uint>* exclusionIndices; CudaArray* exclusionIndices;
// CudaArray<cl_uint>* exclusionRowIndices; CudaArray* exclusionRowIndices;
// CudaArray<mm_ushort2>* interactingTiles; CudaArray* interactingTiles;
// CudaArray<cl_uint>* interactionFlags; CudaArray* interactionFlags;
// CudaArray<cl_uint>* interactionCount; CudaArray* interactionCount;
// CudaArray<mm_float4>* blockCenter; CudaArray* blockCenter;
// CudaArray<mm_float4>* blockBoundingBox; CudaArray* blockBoundingBox;
// std::vector<std::vector<int> > atomExclusions; unsigned int* pinnedInteractionCount;
// std::vector<ParameterInfo> parameters; std::vector<void*> forceArgs, findBlockBoundsArgs, findInteractingBlocksArgs, findInteractionsWithinBlocksArgs;
// std::vector<ParameterInfo> arguments; std::vector<std::vector<int> > atomExclusions;
// std::string kernelSource; std::vector<ParameterInfo> parameters;
// std::map<std::string, std::string> kernelDefines; std::vector<ParameterInfo> arguments;
// double cutoff; std::string kernelSource;
// bool useCutoff, usePeriodic, forceBufferPerAtomBlock, deviceIsCpu, anyExclusions; std::map<std::string, std::string> kernelDefines;
// int numForceBuffers, startTileIndex, numTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup; double cutoff;
bool useCutoff, usePeriodic, anyExclusions;
int startTileIndex, numTiles, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
}; };
/** /**
...@@ -309,7 +299,7 @@ public: ...@@ -309,7 +299,7 @@ public:
int getSize() const { int getSize() const {
return size; return size;
} }
CUdeviceptr getMemory() const { CUdeviceptr& getMemory() {
return memory; return memory;
} }
private: private:
......
...@@ -77,7 +77,7 @@ CudaParameterSet::~CudaParameterSet() { ...@@ -77,7 +77,7 @@ CudaParameterSet::~CudaParameterSet() {
CHECK_RESULT(cuMemFree(buffers[i].getMemory())); CHECK_RESULT(cuMemFree(buffers[i].getMemory()));
} }
void CudaParameterSet::getParameterValues(vector<vector<float> >& values) const { void CudaParameterSet::getParameterValues(vector<vector<float> >& values) {
values.resize(numObjects); values.resize(numObjects);
for (int i = 0; i < numObjects; i++) for (int i = 0; i < numObjects; i++)
values[i].resize(numParameters); values[i].resize(numParameters);
......
...@@ -71,7 +71,7 @@ public: ...@@ -71,7 +71,7 @@ public:
* *
* @param values on exit, values[i][j] contains the value of parameter j for object i * @param values on exit, values[i][j] contains the value of parameter j for object i
*/ */
void getParameterValues(std::vector<std::vector<float> >& values) const; void getParameterValues(std::vector<std::vector<float> >& values);
/** /**
* Set the values of all parameters. * Set the values of all parameters.
* *
...@@ -82,7 +82,7 @@ public: ...@@ -82,7 +82,7 @@ public:
* Get a set of CudaNonbondedUtilities::ParameterInfo objects which describe the Buffers * Get a set of CudaNonbondedUtilities::ParameterInfo objects which describe the Buffers
* containing the data. * containing the data.
*/ */
const std::vector<CudaNonbondedUtilities::ParameterInfo>& getBuffers() const { std::vector<CudaNonbondedUtilities::ParameterInfo>& getBuffers() {
return buffers; return buffers;
} }
/** /**
......
...@@ -41,7 +41,7 @@ namespace OpenMM { ...@@ -41,7 +41,7 @@ namespace OpenMM {
* sort and the key for sorting it. Here is an example of a trait class for * sort and the key for sorting it. Here is an example of a trait class for
* sorting floats: * sorting floats:
* *
* class SortTrait : public CudaSort::SortTrait { * class FloatTrait : public CudaSort::SortTrait {
* int getDataSize() const {return 4;} * int getDataSize() const {return 4;}
* int getKeySize() const {return 4;} * int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";} * const char* getDataType() const {return "float";}
......
// Pairwise Coulomb / Lennard-Jones interaction fragment.
//
// This snippet declares none of the following names and expects them to already be in scope
// at the point where it is included: isExcluded, atom1, atom2, r, r2, invR, posq1, posq2,
// sigmaEpsilon1, sigmaEpsilon2, tempEnergy and dEdR. It adds this pair's energy to tempEnergy
// and its force term (tempForce*invR*invR) to dEdR.
// NOTE(review): posq.w is presumably the atomic charge and 138.935456 the Coulomb constant
// in the project's unit system - confirm against the host code.
#if USE_EWALD
// Ewald branch. Excluded pairs of two distinct real atoms still need a correction term.
bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if (!isExcluded || needCorrection) {
real tempForce = 0.0f;
// Corrections are applied regardless of distance; ordinary pairs only inside the cutoff.
if (r2 < CUTOFF_SQUARED || needCorrection) {
const real alphaR = EWALD_ALPHA*r;
const real expAlphaRSqr = EXP(-alphaR*alphaR);
const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
// This approximation for erfc is from Abramowitz and Stegun (1964) p. 299. They cite the following as
// the original source: C. Hastings, Jr., Approximations for Digital Computers (1955). It has a maximum
// error of 3e-7.
real t = 1.0f+(0.0705230784f+(0.0422820123f+(0.0092705272f+(0.0001520143f+(0.0002765672f+0.0000430638f*alphaR)*alphaR)*alphaR)*alphaR)*alphaR)*alphaR;
// Three squarings give t^8; the t*t below raises that to t^16.
t *= t;
t *= t;
t *= t;
const real erfcAlphaR = RECIP(t*t);
if (needCorrection) {
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
tempForce = -prefactor*((1.0f-erfcAlphaR)-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += -prefactor*(1.0f-erfcAlphaR);
}
else {
#if HAS_LENNARD_JONES
// Combine per-atom parameters: the .x components are summed, the .y components multiplied.
real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
real sig2 = invR*sig;
sig2 *= sig2;
real sig6 = sig2*sig2*sig2;
real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
tempForce = epssig6*(12.0f*sig6 - 6.0f) + prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += epssig6*(sig6 - 1.0f) + prefactor*erfcAlphaR;
#else
tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += prefactor*erfcAlphaR;
#endif
}
}
dEdR += tempForce*invR*invR;
}
#else
// Non-Ewald branch. The force and energy expressions are always evaluated; the
// includeInteraction flag then selects between the computed value and zero.
{
#ifdef USE_CUTOFF
unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
#else
unsigned int includeInteraction = (!isExcluded);
#endif
real tempForce = 0.0f;
#if HAS_LENNARD_JONES
// Same parameter combination as in the Ewald branch: sum of .x, product of .y.
real sig = sigmaEpsilon1.x + sigmaEpsilon2.x;
real sig2 = invR*sig;
sig2 *= sig2;
real sig6 = sig2*sig2*sig2;
real epssig6 = sig6*(sigmaEpsilon1.y*sigmaEpsilon2.y);
tempForce = epssig6*(12.0f*sig6 - 6.0f);
tempEnergy += includeInteraction ? epssig6*(sig6 - 1) : 0;
#endif
#if HAS_COULOMB
#ifdef USE_CUTOFF
// With a cutoff, the Coulomb term is modified using the REACTION_FIELD_K and REACTION_FIELD_C constants.
const real prefactor = 138.935456f*posq1.w*posq2.w;
tempForce += prefactor*(invR - 2.0f*REACTION_FIELD_K*r2);
tempEnergy += includeInteraction ? prefactor*(invR + REACTION_FIELD_K*r2 - REACTION_FIELD_C) : 0;
#else
// Without a cutoff, the plain 1/r Coulomb term is used.
const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
tempForce += prefactor;
tempEnergy += includeInteraction ? prefactor : 0;
#endif
#endif
dEdR += includeInteraction ? tempForce*invR*invR : 0;
}
#endif
\ No newline at end of file
/** /**
* Convert a real4 to a real3 by removing its last element. * Convert a real4 to a real3 by removing its last element.
*/ */
__device__ real3 ccb_trim(real4 v) { inline __device__ real3 ccb_trim(real4 v) {
return make_real3(v.x, v.y, v.z); return make_real3(v.x, v.y, v.z);
} }
/** /**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude. * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
*/ */
__device__ real4 ccb_delta(real4 vec1, real4 vec2) { inline __device__ real4 ccb_delta(real4 vec1, real4 vec2) {
real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0); real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
result.w = result.x*result.x + result.y*result.y + result.z*result.z; result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result; return result;
...@@ -38,7 +38,7 @@ __device__ real ccb_computeAngle(real4 vec1, real4 vec2) { ...@@ -38,7 +38,7 @@ __device__ real ccb_computeAngle(real4 vec1, real4 vec2) {
/** /**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude. * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/ */
__device__ real4 ccb_computeCross(real4 vec1, real4 vec2) { inline __device__ real4 ccb_computeCross(real4 vec1, real4 vec2) {
real3 cp = cross(vec1, vec2); real3 cp = cross(vec1, vec2);
return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z); return make_real4(cp.x, cp.y, cp.z, cp.x*cp.x+cp.y*cp.y+cp.z*cp.z);
} }
/**
 * Multiply two complex numbers, each stored as a (real part, imaginary part) pair.
 *
 * @param a the first factor
 * @param b the second factor
 * @return the complex product a*b
 */
__device__ real2 multofReal2(real2 a, real2 b) {
    const real realPart = a.x*b.x - a.y*b.y;
    const real imagPart = a.x*b.y + a.y*b.x;
    return make_real2(realPart, imagPart);
}
/**
 * Precompute, for every wave vector, the charge-weighted cosine and sine sums over all atoms
 * which appear in each force term, and accumulate the reciprocal space energy.
 *
 * Wave vectors are addressed by a flat index over a (2*KMAX_X-1) x (2*KMAX_Y-1) x (2*KMAX_Z-1)
 * grid.  Indices below (KMAX_Y-1)*(2*KMAX_Z-1)+KMAX_Z are skipped, so each thread starts at the
 * first index at or beyond that bound which it reaches by stepping in units of the grid size.
 *
 * @param energyBuffer     per-thread energy accumulator; this thread's energy is added to its own slot
 * @param posq             atom positions (x, y, z) with the weight used for the sums in w
 * @param cosSinSum        on exit, element k holds the (cos, sin) sum for wave vector k
 * @param periodicBoxSize  the dimensions of the periodic box
 */
extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuffer, const real4* __restrict__ posq, real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
    const unsigned int numKx = 2*KMAX_X-1;
    const unsigned int numKy = 2*KMAX_Y-1;
    const unsigned int numKz = 2*KMAX_Z-1;
    const unsigned int numWaveVectors = numKx*numKy*numKz;
    const unsigned int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
    const unsigned int stride = blockDim.x*gridDim.x;
    real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z);
    real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);

    // Advance past the leading indices that are not processed.
    unsigned int k = threadIndex;
    while (k < (KMAX_Y-1)*numKz+KMAX_Z)
        k += stride;

    real threadEnergy = 0;
    for (; k < numWaveVectors; k += stride) {
        // Find the wave vector (kx, ky, kz) this index corresponds to.

        int rx = k/(numKy*numKz);
        int remainder = k - rx*numKy*numKz;
        int ry = remainder/numKz;
        int rz = remainder - ry*numKz - KMAX_Z + 1;
        ry += -KMAX_Y + 1;
        const real kx = rx*reciprocalBoxSize.x;
        const real ky = ry*reciprocalBoxSize.y;
        const real kz = rz*reciprocalBoxSize.z;

        // Compute the sum for this wave vector.

        real2 sum = make_real2(0);
        for (int atom = 0; atom < NUM_ATOMS; atom++) {
            const real4 atomPosq = posq[atom];
            const real phaseX = atomPosq.x*kx;
            const real phaseY = atomPosq.y*ky;
            const real phaseZ = atomPosq.z*kz;
            real2 factor = make_real2(cos(phaseX), sin(phaseX));
            factor = multofReal2(factor, make_real2(cos(phaseY), sin(phaseY)));
            factor = multofReal2(factor, make_real2(cos(phaseZ), sin(phaseZ)));
            sum += atomPosq.w*factor;
        }
        cosSinSum[k] = sum;

        // Compute the contribution to the energy.

        const real k2 = kx*kx + ky*ky + kz*kz;
        const real ak = EXP(k2*EXP_COEFFICIENT) / k2;
        threadEnergy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y);
    }
    energyBuffer[threadIndex] += threadEnergy;
}
/**
 * Compute the reciprocal space part of the Ewald force, using the precomputed sums from
 * calculateEwaldCosSinSums().
 *
 * Each thread handles atoms blockIdx.x*blockDim.x+threadIdx.x, then steps by the total number
 * of threads in the grid until NUM_ATOMS is reached.  For every atom it loops over the half
 * space of wave vectors: rx >= 0, excluding (0, 0, 0) and the vectors with rx == 0 that are the
 * mirror images of ones already visited.  This is controlled by lowry/lowrz, which start at
 * 0 and 1 and are then widened to 1-KMAX_Y and 1-KMAX_Z.
 *
 * @param forceBuffers     force accumulators, laid out as three consecutive arrays of
 *                         PADDED_NUM_ATOMS elements (x, y, z); forces are scaled by 0xFFFFFFFF
 *                         and added atomically as 64 bit integers
 * @param posq             atom positions (x, y, z) with the weight used for the sums in w
 * @param cosSinSum        the per wave vector (cos, sin) sums
 * @param periodicBoxSize  the dimensions of the periodic box
 */
extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__ forceBuffers, const real4* __restrict__ posq, const real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
    unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x;
    real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z);
    real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
    while (atom < NUM_ATOMS) {
        real3 force = make_real3(0);
        real4 apos = posq[atom];

        // Loop over all wave vectors.

        int lowry = 0;
        int lowrz = 1;
        for (int rx = 0; rx < KMAX_X; rx++) {
            real kx = rx*reciprocalBoxSize.x;

            // The x factor depends only on rx, so compute it once per rx.
            real phase = apos.x*kx;
            const real2 tab_x = make_real2(cos(phase), sin(phase));
            for (int ry = lowry; ry < KMAX_Y; ry++) {
                real ky = ry*reciprocalBoxSize.y;
                phase = apos.y*ky;
                const real2 tab_xy = multofReal2(tab_x, make_real2(cos(phase), sin(phase)));
                for (int rz = lowrz; rz < KMAX_Z; rz++) {
                    real kz = rz*reciprocalBoxSize.z;

                    // Compute the force contribution of this wave vector.

                    int index = rx*(KMAX_Y*2-1)*(KMAX_Z*2-1) + (ry+KMAX_Y-1)*(KMAX_Z*2-1) + (rz+KMAX_Z-1);
                    real k2 = kx*kx + ky*ky + kz*kz;
                    real ak = EXP(k2*EXP_COEFFICIENT)/k2;
                    phase = apos.z*kz;
                    real2 structureFactor = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
                    real2 sum = cosSinSum[index];
                    real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
                    force.x += dEdR*kx;
                    force.y += dEdR*ky;
                    force.z += dEdR*kz;
                }

                // Widen the z range after the first (rx, ry) pair.  This must happen outside the
                // rz loop: when KMAX_Z == 1 that loop is empty on the first pass, and resetting
                // lowrz inside it would leave every later wave vector skipped as well.
                lowrz = 1 - KMAX_Z;
            }

            // Widen the y range once the rx == 0 plane has been processed.
            lowry = 1 - KMAX_Y;
        }

        // Record the force on the atom.

        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
        atom += blockDim.x*gridDim.x;
    }
}
// Number of atoms in each atom block.
#define TILE_SIZE 32
// Stride used when threads walk the staging buffers in the neighbor list kernels.
#define GROUP_SIZE 64
// Number of GROUP_SIZE sized groups of entries held in the staging buffer.
#define BUFFER_GROUPS 4
// Total number of entries in the staging buffer.  Parenthesized so the macro expands safely
// inside larger expressions (e.g. n/BUFFER_SIZE).
#define BUFFER_SIZE (BUFFER_GROUPS*GROUP_SIZE)
/**
 * Find an axis aligned bounding box for the atoms in each block of TILE_SIZE consecutive atoms.
 *
 * For every block, the center of the box is written to blockCenter and half of its extent
 * along each axis to blockBoundingBox.  When USE_PERIODIC is defined, the first atom of a
 * block is wrapped into the periodic box and every other atom is shifted to the periodic
 * image closest to that first atom.  Thread 0 of block 0 also resets interactionCount[0] to 0.
 *
 * @param numAtoms            the number of atoms to process
 * @param periodicBoxSize     the dimensions of the periodic box
 * @param invPeriodicBoxSize  the reciprocals of the periodic box dimensions
 * @param posq                the atom positions
 * @param blockCenter         on exit, the center of each block's bounding box
 * @param blockBoundingBox    on exit, the half widths of each block's bounding box
 * @param interactionCount    counter whose first element is reset to zero
 */
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount) {
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index*TILE_SIZE < numAtoms; index += blockDim.x*gridDim.x) {
        const int base = index*TILE_SIZE;
        const int last = min(base+TILE_SIZE, numAtoms);

        // Seed the box with the first atom of the block.
        real4 pos = posq[base];
#ifdef USE_PERIODIC
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
        real4 firstPoint = pos;
#endif
        real4 lowerCorner = pos;
        real4 upperCorner = pos;

        // Grow the box to enclose the remaining atoms.
        for (int atom = base+1; atom < last; atom++) {
            pos = posq[atom];
#ifdef USE_PERIODIC
            pos.x -= floor((pos.x-firstPoint.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
            pos.y -= floor((pos.y-firstPoint.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
            pos.z -= floor((pos.z-firstPoint.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
            lowerCorner = make_real4(min(lowerCorner.x,pos.x), min(lowerCorner.y,pos.y), min(lowerCorner.z,pos.z), 0);
            upperCorner = make_real4(max(upperCorner.x,pos.x), max(upperCorner.y,pos.y), max(upperCorner.z,pos.z), 0);
        }
        blockBoundingBox[index] = 0.5f*(upperCorner-lowerCorner);
        blockCenter[index] = 0.5f*(upperCorner+lowerCorner);
    }

    // A single thread clears the interaction counter.
    if (blockIdx.x == 0 && threadIdx.x == 0)
        interactionCount[0] = 0;
}
/**
 * This is called by findBlocksWithInteractions(). It compacts the list of blocks and writes them
 * to global memory.
 *
 * The valid entries of buffer are packed, in order, into the front of temp. Thread 0 then
 * reserves space for them in interactingTiles by atomically advancing interactionCount, and
 * the packed entries are copied out if they fit within maxTiles. Every slot that was valid is
 * reset before returning.
 *
 * This routine contains __syncthreads() calls, so it must be reached by all threads of the block.
 * The loops stride by GROUP_SIZE.
 * NOTE(review): this presumably assumes blockDim.x == GROUP_SIZE - confirm against the launch code.
 *
 * @param buffer            candidate tiles (BUFFER_SIZE entries)
 * @param valid             nonzero for each entry of buffer that holds a tile
 * @param sum               scratch space that receives the prefix sum of the valid flags
 * @param temp              scratch space used for the scan and then for the packed tiles
 * @param baseIndex         receives the offset in interactingTiles reserved for this call
 * @param interactionCount  global counter; its first element is advanced by the number of valid entries
 * @param interactingTiles  global list the packed tiles are appended to
 * @param maxTiles          capacity of interactingTiles
 *
 * periodicBoxSize, invPeriodicBoxSize, posq, blockCenter and blockBoundingBox are not
 * referenced by the body.
 */
__device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, ushort2* temp, int* baseIndex,
unsigned int* interactionCount, ushort2* interactingTiles, real4 periodicBoxSize,
real4 invPeriodicBoxSize, const real4* posq, const real4* blockCenter, const real4* blockBoundingBox, unsigned int maxTiles) {
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
// Seed the scan with 1 for each valid entry and 0 otherwise.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
temp[i].x = (valid[i] ? 1 : 0);
__syncthreads();
// Inclusive scan with doubling offsets. The .x and .y components of temp act as two
// alternating buffers: each pass reads one component and writes the other.
int whichBuffer = 0;
for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
if (whichBuffer == 0)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
whichBuffer = 1-whichBuffer;
__syncthreads();
}
// Copy the result out of whichever component the final pass wrote.
if (whichBuffer == 0)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
sum[i] = temp[i].x;
else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
sum[i] = temp[i].y;
__syncthreads();
// The last element of an inclusive scan is the total number of valid entries.
int numValid = sum[BUFFER_SIZE-1];
__syncthreads();
// Compact the buffer.
// sum[i]-1 is the packed position of valid entry i. Each slot is reset as it is consumed.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
if (valid[i]) {
temp[sum[i]-1] = buffer[i];
sum[i] = valid[i];
valid[i] = false;
buffer[i] = make_ushort2(1, 1);
}
__syncthreads();
// Store it to global memory.
// The counter is advanced even when the entries do not fit, so a final count greater than
// maxTiles indicates that the tile list is incomplete.
if (threadIdx.x == 0)
*baseIndex = atomicAdd(interactionCount, numValid);
__syncthreads();
if (*baseIndex+numValid <= maxTiles)
for (int i = threadIdx.x; i < numValid; i += GROUP_SIZE)
interactingTiles[*baseIndex+i] = temp[i];
__syncthreads();
}
/**
 * Compare the bounding boxes for each pair of atom blocks, and record the pairs (tiles) whose
 * boxes come within the cutoff distance of each other.  Pairs that are farther apart are
 * treated as non-interacting and are dropped.
 *
 * Tiles in the range [startTileIndex, endTileIndex) are examined.  Accepted tiles are staged
 * in a shared buffer; whenever any thread has filled all of its BUFFER_GROUPS slots, and once
 * more at the end, the buffer is flushed with storeInteractionData().
 * NOTE(review): the buffer indexing presumably assumes blockDim.x == GROUP_SIZE - confirm
 * against the launch code.
 *
 * @param periodicBoxSize     the dimensions of the periodic box
 * @param invPeriodicBoxSize  the reciprocals of the periodic box dimensions
 * @param blockCenter         the center of each block's bounding box
 * @param blockBoundingBox    the half widths of each block's bounding box
 * @param interactionCount    global counter of tiles found
 * @param interactingTiles    on exit, the list of tiles found
 * @param interactionFlags    not referenced by this kernel
 * @param posq                the atom positions (only forwarded to storeInteractionData())
 * @param maxTiles            capacity of interactingTiles
 * @param startTileIndex      the first tile index to examine
 * @param endTileIndex        one past the last tile index to examine
 */
extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ blockCenter,
        const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount, ushort2* __restrict__ interactingTiles,
        unsigned int* __restrict__ interactionFlags, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startTileIndex,
        unsigned int endTileIndex) {
    __shared__ ushort2 buffer[BUFFER_SIZE];
    __shared__ int valid[BUFFER_SIZE];
    __shared__ short sum[BUFFER_SIZE];
    __shared__ ushort2 temp[BUFFER_SIZE];
    __shared__ int bufferFull;
    __shared__ int globalIndex;

    // Start with an empty staging buffer.

    int valuesInBuffer = 0;
    if (threadIdx.x == 0)
        bufferFull = false;
    for (int group = 0; group < BUFFER_GROUPS; ++group)
        valid[group*GROUP_SIZE+threadIdx.x] = false;
    __syncthreads();

    for (int baseIndex = startTileIndex+blockIdx.x*blockDim.x; baseIndex < endTileIndex; baseIndex += blockDim.x*gridDim.x) {
        // Identify the pair of blocks to compare.

        const int index = baseIndex+threadIdx.x;
        if (index < endTileIndex) {
            unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
            unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (index-y*NUM_BLOCKS+y*(y+1)/2);
            }

            // Find the distance between the bounding boxes of the two cells.

            real4 delta = blockCenter[x]-blockCenter[y];
            const real4 halfWidthX = blockBoundingBox[x];
            const real4 halfWidthY = blockBoundingBox[y];
#ifdef USE_PERIODIC
            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
            const real gapX = max(0.0f, fabs(delta.x)-halfWidthX.x-halfWidthY.x);
            const real gapY = max(0.0f, fabs(delta.y)-halfWidthX.y-halfWidthY.y);
            const real gapZ = max(0.0f, fabs(delta.z)-halfWidthX.z-halfWidthY.z);
            const real gapSquared = gapX*gapX+gapY*gapY+gapZ*gapZ;
            if (gapSquared < CUTOFF_SQUARED) {
                // Add this tile to the buffer.

                const int slot = valuesInBuffer*GROUP_SIZE+threadIdx.x;
                valid[slot] = true;
                buffer[slot] = make_ushort2(x, y);
                valuesInBuffer++;
                if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
                    bufferFull = true;
            }
        }
        __syncthreads();

        // Flush as soon as any thread has run out of slots.

        if (bufferFull) {
            storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
            valuesInBuffer = 0;
            if (threadIdx.x == 0)
                bufferFull = false;
            __syncthreads();
        }
    }

    // Write out whatever is left in the buffer.

    storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
}
/**
 * Compare each atom in one block to the bounding box of another block, and set
 * flags for which ones are interacting.
 *
 * The tiles listed in the tiles array are divided evenly between warps.  For a tile (x, y),
 * lane i of the warp tests atom y*TILE_SIZE+i against the bounding box of block x.  Bit i of
 * interactionFlags[tile] is set when that atom is within the cutoff of the box.  Diagonal
 * tiles (x == y), and tiles in which more than 12 atoms are within the cutoff, get all bits
 * set.  If the tile count exceeds maxTiles the tile list is incomplete, so the kernel does
 * nothing.
 *
 * @param periodicBoxSize     the dimensions of the periodic box
 * @param invPeriodicBoxSize  the reciprocals of the periodic box dimensions
 * @param posq                the atom positions
 * @param tiles               the list of tiles to process
 * @param blockCenter         the center of each block's bounding box
 * @param blockBoundingBox    the half widths of each block's bounding box
 * @param interactionFlags    on exit, one bit mask per tile
 * @param interactionCount    array whose first element is the number of tiles
 * @param maxTiles            capacity of the tiles array
 */
extern "C" __global__ void findInteractionsWithinBlocks(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, const ushort2* __restrict__ tiles, const real4* __restrict__ blockCenter,
        const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionFlags, const unsigned int* __restrict__ interactionCount, unsigned int maxTiles) {
    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    unsigned int numTiles = interactionCount[0];
    unsigned int pos = warp*numTiles/totalWarps;
    unsigned int end = (warp+1)*numTiles/totalWarps;
    unsigned int index = threadIdx.x & (TILE_SIZE - 1);
#if (__CUDA_ARCH__ < 200)
    // Scratch space for combining flags without __ballot; indexed by threadIdx.x, so this
    // path supports at most 128 threads per block.
    __shared__ unsigned int flags[128];
#endif
    if (numTiles > maxTiles)
        return;
    unsigned int lasty = 0xFFFFFFFF;
    real4 apos;
    while (pos < end) {
        // Extract the coordinates of this tile
        ushort2 tileIndices = tiles[pos];
        unsigned int x = tileIndices.x;
        unsigned int y = tileIndices.y;
        if (x == y) {
            if (index == 0)
                interactionFlags[pos] = 0xFFFFFFFF;
        }
        else {
            // Load the bounding box for x and the atom positions for y.

            real4 center = blockCenter[x];
            real4 boxSize = blockBoundingBox[x];
            if (y != lasty)
                apos = posq[y*TILE_SIZE+index];

            // Find the distance of the atom from the bounding box.

            real4 delta = apos-center;
#ifdef USE_PERIODIC
            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
            delta.x = max((real) 0, fabs(delta.x)-boxSize.x);
            delta.y = max((real) 0, fabs(delta.y)-boxSize.y);
            delta.z = max((real) 0, fabs(delta.z)-boxSize.z);

            // A set bit must mean "this atom is within the cutoff" on both code paths below,
            // so compute the predicate once and share it.
            const real dist2 = delta.x*delta.x+delta.y*delta.y+delta.z*delta.z;
            const bool withinCutoff = !(dist2 > CUTOFF_SQUARED);
#if (__CUDA_ARCH__ < 200)
            // No barrier separates these writes from the reads of neighbouring entries, so this
            // path relies on the lanes of a warp executing in lockstep.
            flags[threadIdx.x] = (withinCutoff ? 1u << index : 0);
            if (index % 4 == 0)
                flags[threadIdx.x] += flags[threadIdx.x+1]+flags[threadIdx.x+2]+flags[threadIdx.x+3];
            unsigned int allFlags = 0;
            if (index == 0)
                allFlags = flags[threadIdx.x]+flags[threadIdx.x+4]+flags[threadIdx.x+8]+flags[threadIdx.x+12]+flags[threadIdx.x+16]+flags[threadIdx.x+20]+flags[threadIdx.x+24]+flags[threadIdx.x+28];
#else
            unsigned int allFlags = __ballot(withinCutoff);
#endif

            // Sum the flags.

            if (index == 0) {
                // Count how many flags are set, and based on that decide whether to compute all interactions
                // or only a fraction of them.

                unsigned int bits = (allFlags&0x55555555) + ((allFlags>>1)&0x55555555);
                bits = (bits&0x33333333) + ((bits>>2)&0x33333333);
                bits = (bits&0x0F0F0F0F) + ((bits>>4)&0x0F0F0F0F);
                bits = (bits&0x00FF00FF) + ((bits>>8)&0x00FF00FF);
                bits = (bits&0x0000FFFF) + ((bits>>16)&0x0000FFFF);
                interactionFlags[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
            }
            lasty = y;
        }
        pos++;
    }
}
// Number of atoms in each atom block; tiles pair one block with another.
#define TILE_SIZE 32
// Number of TILE_SIZE wide groups of threads in each thread block.
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
/**
 * Per-atom record held in shared memory by computeNonbonded().
 *
 * x, y, z and q are filled from the x, y, z and w components of an atom's posq entry.
 * NOTE(review): fx, fy, fz are presumably the force accumulated on the atom - confirm in the
 * part of the kernel that uses them.
 * ATOM_PARAMETER_DATA expands to the declarations of any additional per-atom parameters.
 */
typedef struct {
real x, y, z;
real q;
real fx, fy, fz;
ATOM_PARAMETER_DATA
// An extra member is appended unless PARAMETER_SIZE_IS_EVEN is defined.
// NOTE(review): presumably this keeps the struct at an odd number of real-sized words to
// avoid shared memory bank conflicts - confirm.
#ifndef PARAMETER_SIZE_IS_EVEN
real padding;
#endif
} AtomData;
/**
 * Compute nonbonded interactions.
 *
 * The atoms are processed in TILE_SIZE x TILE_SIZE tiles. Every group of TILE_SIZE
 * consecutive threads (called a "warp" in the variable names below) is given a contiguous
 * range of tiles and walks through it one tile per iteration of the outer do/while loop.
 * Within a tile, thread tgx owns "row" atom x*TILE_SIZE+tgx and keeps its force in a
 * register; the "column" atoms y*TILE_SIZE+... are staged in the shared array localData.
 *
 * The actual pair interaction and the per-atom parameter handling are supplied by macros
 * that are defined outside this chunk (PARAMETER_ARGUMENTS, LOAD_ATOM1_PARAMETERS,
 * LOAD_ATOM2_PARAMETERS, LOAD_LOCAL_PARAMETERS_FROM_1, LOAD_LOCAL_PARAMETERS_FROM_GLOBAL,
 * COMPUTE_INTERACTION). NOTE(review): those macros presumably read and write the locals
 * declared right before them (atom1, atom2, posq1, posq2, delta, r2, r, invR, isExcluded,
 * dEdR / dEdR1 / dEdR2, tempEnergy), so those names should not be changed without
 * checking the macro definitions.
 *
 * Parameters:
 *   forceBuffers        64-bit accumulators updated with atomicAdd. Forces are scaled by
 *                       0xFFFFFFFF and converted to integers before being added. The three
 *                       components of atom a live at a, a+PADDED_NUM_ATOMS and
 *                       a+2*PADDED_NUM_ATOMS.
 *   energyBuffer        one entry per thread of the grid; this thread's energy is added to
 *                       the entry at its global thread index.
 *   posq                per-atom position in x/y/z plus a fourth value in w.
 *   exclusions          per-thread exclusion bit masks; a set bit means the pair is NOT
 *                       excluded (see isExcluded below).
 *   exclusionIndices    list of y tile indices that have exclusion data, searched per row.
 *   exclusionRowIndices entries x and x+1 bound the part of exclusionIndices that belongs
 *                       to tile row x.
 *   startTileIndex, numTileIndices
 *                       range of linear tile indices to process when tiles are enumerated
 *                       directly instead of being read from `tiles`.
 *   tiles, interactionCount, maxTiles, interactionFlags (USE_CUTOFF only)
 *                       interactionCount[0] is the number of entries in `tiles`. If it
 *                       does not exceed maxTiles the (x, y) pairs are read from `tiles`
 *                       and interactionFlags[pos] selects which columns of tile pos need
 *                       computing; otherwise the kernel falls back to enumerating tiles.
 *   periodicBoxSize, invPeriodicBoxSize (USE_CUTOFF only)
 *                       box lengths and their inverses, used under USE_PERIODIC to wrap
 *                       displacement vectors.
 *
 * NOTE(review): shared memory is written and then read by different threads of the same
 * TILE_SIZE-wide group without any barrier in between, so the code appears to rely on
 * those threads executing in lockstep - confirm this holds on the target hardware.
 */
extern "C" __global__ void computeNonbonded(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF
, const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
#endif
PARAMETER_ARGUMENTS) {
// Number of TILE_SIZE-wide thread groups in the whole grid, and the index of this thread's group.
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
// Split the work evenly: this group handles tile positions [pos, end).
// NOTE(review): warp*numTiles and (warp+1)*numTileIndices are evaluated in 32-bit
// unsigned arithmetic - confirm they cannot wrap for the largest supported systems.
#ifdef USE_CUTOFF
// If the tile list overflowed (numTiles > maxTiles) ignore it and enumerate tiles instead.
const unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
// Energy accumulated by this thread over all of its tiles.
real energy = 0.0f;
// One AtomData record per thread: the column atoms of the tile this thread's group is working on.
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
// Three scratch values per thread, used to sum column forces in the partial-tile path.
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
// Per group: start and end of the current tile row's entries in exclusionIndices.
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
// Per group: offset of the current tile's masks in `exclusions`, or -1 if it has none.
__shared__ int exclusionIndex[WARPS_PER_GROUP];
// The loop body runs at least once even when the range is empty; all work inside is
// guarded by (pos < end).
do {
// Extract the coordinates of this tile
// tgx: index of this thread inside its group; tbx: index of the group's first thread in the block.
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
// Force on this thread's row atom, accumulated in a register.
real3 force = make_real3(0);
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
// Tile coordinates come straight from the tile list.
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
// Convert the linear index pos into tile coordinates with y <= x < NUM_BLOCKS.
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
// Load the row atom owned by this thread.
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
// The first two threads of the group fetch the bounds of row x in exclusionIndices.
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
// The group's threads scan that range in strides of TILE_SIZE looking for column y.
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
// This first test can never succeed here because the enclosing block already requires pos < end.
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
// Row and column atoms are the same set, so the column data is copied from the row atom.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
// One mask word per thread; bit j refers to column j of the tile.
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
// A cleared low bit marks the pair as excluded.
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
// Shift each component of the displacement by a whole number of box lengths
// (the multiple nearest to delta/boxSize).
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
// atom2 switches from a shared-memory index to the global atom index before the interaction code runs.
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
// Only half of the pair energy is added here; every thread loops over all columns of
// the tile, so presumably each pair is visited from both of its atoms.
energy += 0.5f*tempEnergy;
// Only the row atom's force is updated on a diagonal tile.
#ifdef USE_SYMMETRIC
force -= delta*dEdR;
#else
force -= dEdR1;
#endif
#ifdef USE_EXCLUSIONS
// Move the next column's exclusion bit into the low position.
excl >>= 1;
#endif
}
}
else {
// This is an off-diagonal tile.
// Stage the column atoms (tile column y) in shared memory and clear their force accumulators.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_CUTOFF
// Bit j of flags says whether column j has to be computed. All bits set means the whole
// tile is computed, which is also what happens when the tile list is not in use.
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
// Here all threads of the group handle the same column j at the same time, so the
// column's force has to be summed across threads through tempBuffer.
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
bool isExcluded = false;
int atom2 = tbx+j;
int bufferIndex = 3*threadIdx.x;
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
real tempEnergy = 0.0f;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_CUTOFF
}
#endif
// Pairs beyond the cutoff keep the zero-initialised derivative, so they store zeros below.
#ifdef USE_SYMMETRIC
delta *= dEdR;
force -= delta;
tempBuffer[bufferIndex] = delta.x;
tempBuffer[bufferIndex+1] = delta.y;
tempBuffer[bufferIndex+2] = delta.z;
#else
force -= dEdR1;
tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+2] = dEdR2.z;
#endif
// Sum the forces on atom2.
// Step 1: every fourth thread adds the entries of the next three threads to its own.
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
// Step 2: the group's first thread adds the eight partial sums (12 values apart, i.e.
// four threads apart) to column atom j.
if (tgx == 0) {
localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
}
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
// Without exclusion data every bit is set (nothing excluded). The mask is rotated right
// by tgx so that its low bit lines up with this thread's starting column tj = tgx.
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
// Each thread starts at a different column and advances cyclically, so at any step the
// threads of the group address different localData entries.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
// Update the row atom in the register and the column atom in shared memory.
#ifdef USE_SYMMETRIC
delta *= dEdR;
force -= delta;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#else
force -= dEdR1;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
// Advance to the next column, wrapping around at TILE_SIZE.
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
// Write results.
// Forces are scaled by 0xFFFFFFFF, converted to 64-bit integers and added atomically.
if (pos < end) {
// Row atom force.
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
__threadfence_block();
}
if (pos < end && x != y) {
// Column atom force accumulated in shared memory (off-diagonal tiles only).
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0xFFFFFFFF)));
__threadfence_block();
}
pos++;
} while (pos < end);
// Add this thread's energy to its own slot of the energy buffer.
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
// Code fragment: interaction between one pair of atoms, parameterised by PARAMS[index].
// pos1, pos2, index, PARAMS and energy are not declared here; they are expected to be
// provided by the code this fragment is inserted into, which presumably also consumes
// force1 and force2.
// NOTE(review): the expressions below have the form of a 12-6 Lennard-Jones term plus a
// 1/r (Coulomb-like) term, with .x acting as the 1/r prefactor, .y as a length (sigma)
// and .z as the Lennard-Jones prefactor - confirm against the code that fills PARAMS.
real4 exceptionParams = PARAMS[index];
// Displacement from atom 1 to atom 2 and its squared length (no periodic wrapping here).
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
// sig2 = (y/r)^2 and sig6 = (y/r)^6.
real sig2 = invR*exceptionParams.y;
sig2 *= sig2;
real sig6 = sig2*sig2*sig2;
// Terms proportional to z: z*(12*sig6-6)*sig6 for the derivative, z*(sig6-1)*sig6 for the energy.
real dEdR = exceptionParams.z*(12.0f*sig6-6.0f)*sig6;
real tempEnergy = exceptionParams.z*(sig6-1.0f)*sig6;
// Add the x/r term to both, then divide the derivative term by r^2 so that multiplying
// by delta below yields a vector.
dEdR += exceptionParams.x*invR;
dEdR *= invR*invR;
tempEnergy += exceptionParams.x*invR;
energy += tempEnergy;
// The two atoms receive equal and opposite forces.
delta *= dEdR;
real3 force1 = -delta;
real3 force2 = delta;
/**
 * For every atom, record the grid cell it falls into and compute its PME_ORDER spline
 * coefficients along x, y and z.
 *
 * posq               per-atom position (x, y, z) and a fourth value (w).
 * pmeBsplineTheta    output; coefficient j of atom a is stored at a+j*NUM_ATOMS, with the
 *                    atom's posq.w copied into the w component.
 * pmeAtomGridIndex   output; entry a is (a, flattened index of the atom's grid cell).
 * periodicBoxSize, invPeriodicBoxSize
 *                    box lengths and their inverses.
 *
 * Requires PME_ORDER real3 values of dynamic shared memory per thread.
 */
extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4* __restrict__ pmeBsplineTheta, int2* __restrict__ pmeAtomGridIndex,
        real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    extern __shared__ real3 bsplinesCache[];
    // This thread's private slice of the shared scratch space.
    real3* const theta = bsplinesCache + threadIdx.x*PME_ORDER;
    const real3 finalScale = make_real3(RECIP(PME_ORDER-1));
    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
        // Bring the atom into the primary periodic box.
        real4 wrapped = posq[atom];
        wrapped.x -= floor(wrapped.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        wrapped.y -= floor(wrapped.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        wrapped.z -= floor(wrapped.z*invPeriodicBoxSize.z)*periodicBoxSize.z;

        // Position in units of grid cells, split into integer cell and fractional offset.
        const real3 gridPos = make_real3((wrapped.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                                         (wrapped.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                                         (wrapped.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
        const int wholeX = (int) gridPos.x;
        const int wholeY = (int) gridPos.y;
        const int wholeZ = (int) gridPos.z;
        const real3 frac = make_real3(gridPos.x-wholeX, gridPos.y-wholeY, gridPos.z-wholeZ);
        const int3 cell = make_int3(wholeX % GRID_SIZE_X,
                                    wholeY % GRID_SIZE_Y,
                                    wholeZ % GRID_SIZE_Z);
        pmeAtomGridIndex[atom] = make_int2(atom, cell.x*GRID_SIZE_Y*GRID_SIZE_Z+cell.y*GRID_SIZE_Z+cell.z);

        // Start from the two lowest-order coefficients and raise the order in place.
        const real3 oneMinusFrac = make_real3(1)-frac;
        theta[PME_ORDER-1] = make_real3(0);
        theta[1] = frac;
        theta[0] = oneMinusFrac;
        for (int order = 3; order < PME_ORDER; order++) {
            const real invOrder = RECIP(order-1);
            theta[order-1] = invOrder*frac*theta[order-2];
            for (int k = 1; k < (order-1); k++)
                theta[order-k-1] = invOrder*((frac+make_real3(k)) *theta[order-k-2] + (make_real3(order-k)-frac)*theta[order-k-1]);
            theta[0] = invOrder*oneMinusFrac*theta[0];
        }

        // Final step of the recurrence, up to PME_ORDER.
        theta[PME_ORDER-1] = finalScale*frac*theta[PME_ORDER-2];
        for (int k = 1; k < (PME_ORDER-1); k++)
            theta[PME_ORDER-k-1] = finalScale*((frac+make_real3(k))*theta[PME_ORDER-k-2] + (make_real3(PME_ORDER-k)-frac)*theta[PME_ORDER-k-1]);
        theta[0] = finalScale*oneMinusFrac*theta[0];

        // Storing the charge alongside the coefficients improves cache coherency in the
        // charge spreading kernel.
        for (int k = 0; k < PME_ORDER; k++) {
            const real3 coeff = theta[k];
            pmeBsplineTheta[atom+k*NUM_ATOMS] = make_real4(coeff.x, coeff.y, coeff.z, wrapped.w);
        }
    }
}
/**
 * For each grid point, find the range of sorted atoms associated with that point.
 *
 * pmeAtomGridIndex  per-atom (atom index, grid index) pairs; the loop below assumes the
 *                   entries are ordered by increasing grid index (.y).
 * pmeAtomRange      output; entry g receives the position in pmeAtomGridIndex of the first
 *                   entry whose grid index is >= g. Entries beyond the last occupied grid
 *                   point, up to and including index GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z,
 *                   are set to NUM_ATOMS, so the array must hold gridSize+1 values.
 * posq, periodicBoxSize, invPeriodicBoxSize
 *                   not used by this kernel.
 *
 * The atoms are divided into one contiguous slice per thread of the grid.
 */
extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    // The slice bounds are computed in 64-bit arithmetic: NUM_ATOMS multiplied by the
    // global thread index can exceed the range of a 32-bit integer for large systems.
    const long long globalThread = blockIdx.x*blockDim.x+threadIdx.x;
    const long long totalThreads = blockDim.x*gridDim.x;
    int start = (int) ((NUM_ATOMS*globalThread)/totalThreads);
    int end = (int) ((NUM_ATOMS*(globalThread+1))/totalThreads);
    // Grid index of the atom just before this slice, or -1 if the slice starts at atom 0.
    int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
    for (int i = start; i < end; ++i) {
        int2 atomData = pmeAtomGridIndex[i];
        int gridIndex = atomData.y;
        if (gridIndex != last) {
            // Atom i is the first one at gridIndex; it is also the start of the range for
            // every empty grid point that was skipped since the previous atom.
            for (int j = last+1; j <= gridIndex; ++j)
                pmeAtomRange[j] = i;
            last = gridIndex;
        }
    }

    // Fill in values beyond the last atom.

    if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) {
        int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
        for (int j = last+1; j <= gridSize; ++j)
            pmeAtomRange[j] = NUM_ATOMS;
    }
}
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
/**
 * Spread the atomic charges onto the PME grid.
 *
 * Each block handles atoms in batches of BUFFER_SIZE. Within a block, thread
 * (ix, iy, iz) = decomposition of threadIdx.x in base PME_ORDER is responsible for one
 * point of the PME_ORDER^3 stencil and adds charge*theta_x[ix]*theta_y[iy]*theta_z[iz]
 * to the corresponding grid point for every atom of the batch.
 *
 * posq             per-atom position (x, y, z) and charge (w).
 * pmeGrid          64-bit accumulators updated with atomicAdd. Contributions are scaled by
 *                  0xFFFFFFFF and converted to integers. With USE_DOUBLE_PRECISION the
 *                  accumulator of grid point g is element 2*g, otherwise element g.
 * pmeBsplineTheta  spline coefficients; coefficient j of atom a is read from a+j*NUM_ATOMS.
 * periodicBoxSize, invPeriodicBoxSize
 *                  box lengths and their inverses.
 *
 * The block must contain at least BUFFER_SIZE threads for every stencil point to be
 * covered; threads beyond BUFFER_SIZE add nothing to the grid but still take part in the
 * barriers. The trip counts of both loops depend only on blockIdx and compile-time
 * constants, so every thread of a block reaches every __syncthreads().
 */
extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, unsigned long long* __restrict__ pmeGrid, const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    int ix = threadIdx.x/(PME_ORDER*PME_ORDER);
    int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER;
    int iy = remainder/PME_ORDER;
    int iz = remainder-iy*PME_ORDER;
    __shared__ real4 theta[PME_ORDER];
    __shared__ real charge[BUFFER_SIZE];
    __shared__ int basex[BUFFER_SIZE];
    __shared__ int basey[BUFFER_SIZE];
    __shared__ int basez[BUFFER_SIZE];
    // Only threads that map to a stencil point contribute to the grid. The test is kept
    // out of the loop structure so that the barriers below are never skipped by part of
    // the block.
    const bool inStencil = (ix < PME_ORDER);
    for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) {
        // Load the next block of atoms into the buffers.

        if (threadIdx.x < BUFFER_SIZE) {
            int atomIndex = baseIndex+threadIdx.x;
            if (atomIndex < NUM_ATOMS) {
                real4 pos = posq[atomIndex];
                charge[threadIdx.x] = pos.w;
                pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
                pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
                pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
                basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
                basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
                basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
            }
        }
        __syncthreads();
        int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
        for (int index = 0; index < lastIndex; index++) {
            int atomIndex = index+baseIndex;
            // The first PME_ORDER threads fetch this atom's coefficients for the whole block.
            if (threadIdx.x < PME_ORDER)
                theta[threadIdx.x] = pmeBsplineTheta[atomIndex+threadIdx.x*NUM_ATOMS];
            __syncthreads();
            if (inStencil) {
                real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
                int x = basex[index]+ix;
                int y = basey[index]+iy;
                int z = basez[index]+iz;
                // Wrap stencil points that extend past the upper edge of the grid.
                x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
                y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
#ifdef USE_DOUBLE_PRECISION
                atomicAdd(&pmeGrid[2*(x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z)], static_cast<unsigned long long>((long long) (add*0xFFFFFFFF)));
#else
                atomicAdd(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], static_cast<unsigned long long>((long long) (add*0xFFFFFFFF)));
#endif
            }
            // Every thread must be done reading theta before the next iteration overwrites
            // it. After the last atom of a batch this barrier also protects charge and
            // basex/basey/basez from being reloaded while still in use.
            __syncthreads();
        }
    }
}
/**
 * Convert the fixed-point charge grid into real2 values in place.
 *
 * pmeGrid  on entry holds 64-bit integer accumulators (element 2*g for grid point g with
 *          USE_DOUBLE_PRECISION, element g otherwise). On exit, viewed as an array of
 *          real2, element g holds (accumulator*EPSILON_FACTOR/0xFFFFFFFF, 0).
 */
extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) {
    const unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    // Undoes the 0xFFFFFFFF fixed-point scaling and applies EPSILON_FACTOR in one multiply.
    const real fixedToReal = EPSILON_FACTOR/(real) 0xFFFFFFFF;
    real2* const realGrid = (real2*) pmeGrid;
    for (int point = blockIdx.x*blockDim.x+threadIdx.x; point < numGridPoints; point += blockDim.x*gridDim.x) {
#ifdef USE_DOUBLE_PRECISION
        const long long accumulated = pmeGrid[2*point];
#else
        const long long accumulated = pmeGrid[point];
#endif
        // The second component starts out as zero.
        realGrid[point] = make_real2((real) (accumulated*fixedToReal), 0);
    }
}
/**
 * Scale every element of the grid by a factor that depends on its grid coordinates, and
 * accumulate the corresponding energy.
 *
 * pmeGrid             grid of real2 values, updated in place; the element at (0, 0, 0) is
 *                     left untouched and contributes no energy.
 * energyBuffer        one entry per thread of the grid; half of this thread's accumulated
 *                     sum is added to the entry at its global thread index.
 * pmeBsplineModuliX/Y/Z
 *                     per-axis factors indexed by grid coordinate; their product enters
 *                     the denominator of the scale factor.
 * periodicBoxSize, invPeriodicBoxSize
 *                     box lengths and their inverses.
 */
extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, real* __restrict__ energyBuffer, const real* __restrict__ pmeBsplineModuliX,
        const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    const real prefactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
    real threadEnergy = 0;
    for (int point = blockIdx.x*blockDim.x+threadIdx.x; point < numGridPoints; point += blockDim.x*gridDim.x) {
        // Unflatten the index into (kx, ky, kz), with kz varying fastest.
        const int kx = point/(GRID_SIZE_Y*GRID_SIZE_Z);
        const int inPlane = point-kx*GRID_SIZE_Y*GRID_SIZE_Z;
        const int ky = inPlane/GRID_SIZE_Z;
        const int kz = inPlane-ky*GRID_SIZE_Z;
        if (kx != 0 || ky != 0 || kz != 0) {
            // Map the upper half of each axis to negative indices.
            const int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
            const int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
            const int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
            const real mhx = mx*invPeriodicBoxSize.x;
            const real mhy = my*invPeriodicBoxSize.y;
            const real mhz = mz*invPeriodicBoxSize.z;
            const real bx = pmeBsplineModuliX[kx];
            const real by = pmeBsplineModuliY[ky];
            const real bz = pmeBsplineModuliZ[kz];
            const real2 value = pmeGrid[point];
            const real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
            const real denom = m2*bx*by*bz;
            const real eterm = prefactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
            pmeGrid[point] = make_real2(value.x*eterm, value.y*eterm);
            threadEnergy += eterm*(value.x*value.x + value.y*value.y);
        }
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*threadEnergy;
}
/**
 * For every atom, combine the grid values around it with the atom's spline coefficients
 * and their differences, and add the resulting force to the force buffers.
 *
 * posq          per-atom position (x, y, z) and a fourth value (w) that scales the force.
 * forceBuffers  64-bit accumulators updated with atomicAdd. Forces are scaled by
 *               0xFFFFFFFF and converted to integers; the three components of atom a live
 *               at a, a+PADDED_NUM_ATOMS and a+2*PADDED_NUM_ATOMS.
 * pmeGrid       grid of real2 values; only the x component is read.
 * periodicBoxSize, invPeriodicBoxSize
 *               box lengths and their inverses.
 *
 * Requires 2*PME_ORDER real3 values of dynamic shared memory per thread: the first
 * blockDim.x*PME_ORDER entries hold the coefficients, the following blockDim.x*PME_ORDER
 * entries hold their differences.
 */
extern "C" __global__ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real2* __restrict__ pmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    extern __shared__ real3 bsplinesCache[];
    real3* const theta = bsplinesCache + threadIdx.x*PME_ORDER;
    real3* const dtheta = bsplinesCache + (threadIdx.x*PME_ORDER + blockDim.x*PME_ORDER);
    const real finalScale = RECIP(PME_ORDER-1);
    for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
        real4 sum = make_real4(0);

        // Bring the atom into the primary periodic box and locate its grid cell.
        real4 wrapped = posq[atom];
        wrapped.x -= floor(wrapped.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        wrapped.y -= floor(wrapped.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        wrapped.z -= floor(wrapped.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
        const real3 gridPos = make_real3((wrapped.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                                         (wrapped.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                                         (wrapped.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
        const int3 cell = make_int3(((int) gridPos.x) % GRID_SIZE_X,
                                    ((int) gridPos.y) % GRID_SIZE_Y,
                                    ((int) gridPos.z) % GRID_SIZE_Z);

        // Since we need the full set of thetas, it's faster to compute them here than load them
        // from global memory.

        const real3 frac = make_real3(gridPos.x-(int) gridPos.x, gridPos.y-(int) gridPos.y, gridPos.z-(int) gridPos.z);
        theta[PME_ORDER-1] = make_real3(0);
        theta[1] = frac;
        theta[0] = make_real3(1)-frac;
        for (int order = 3; order < PME_ORDER; order++) {
            const real invOrder = RECIP(order-1);
            theta[order-1] = invOrder*frac*theta[order-2];
            for (int k = 1; k < (order-1); k++)
                theta[order-k-1] = invOrder*((frac+make_real3(k))*theta[order-k-2] + (make_real3(order-k)-frac)*theta[order-k-1]);
            theta[0] = invOrder*(make_real3(1)-frac)*theta[0];
        }

        // Differences of the coefficients one order below the final one; these must be
        // taken before the last recurrence step overwrites theta.
        dtheta[0] = -theta[0];
        for (int k = 1; k < PME_ORDER; k++)
            dtheta[k] = theta[k-1]-theta[k];

        // Final step of the recurrence, up to PME_ORDER.
        theta[PME_ORDER-1] = finalScale*frac*theta[PME_ORDER-2];
        for (int k = 1; k < (PME_ORDER-1); k++)
            theta[PME_ORDER-k-1] = finalScale*((frac+make_real3(k))*theta[PME_ORDER-k-2] + (make_real3(PME_ORDER-k)-frac)*theta[PME_ORDER-k-1]);
        theta[0] = finalScale*(make_real3(1)-frac)*theta[0];

        // Compute the force on this atom.

        for (int ix = 0; ix < PME_ORDER; ix++) {
            int gx = cell.x+ix;
            gx -= (gx >= GRID_SIZE_X ? GRID_SIZE_X : 0);
            for (int iy = 0; iy < PME_ORDER; iy++) {
                int gy = cell.y+iy;
                gy -= (gy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                // Flattened index of the first element of this (gx, gy) row.
                const int rowStart = gx*GRID_SIZE_Y*GRID_SIZE_Z + gy*GRID_SIZE_Z;
                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int gz = cell.z+iz;
                    gz -= (gz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    const real gridValue = pmeGrid[rowStart + gz].x;
                    sum.x += dtheta[ix].x*theta[iy].y*theta[iz].z*gridValue;
                    sum.y += theta[ix].x*dtheta[iy].y*theta[iz].z*gridValue;
                    sum.z += theta[ix].x*theta[iy].y*dtheta[iz].z*gridValue;
                }
            }
        }

        // Scale, convert to fixed point and accumulate.
        const real q = wrapped.w*EPSILON_FACTOR;
        atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (-q*sum.x*GRID_SIZE_X*invPeriodicBoxSize.x*0xFFFFFFFF)));
        atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-q*sum.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0xFFFFFFFF)));
        atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-q*sum.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0xFFFFFFFF)));
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment