Commit 5a06df78 authored by tic20's avatar tic20
Browse files
parents 8dd60914 a9223eea
......@@ -30,7 +30,7 @@
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/internal/ThreadPool.h"
#include "windowsExportCuda.h"
#include "openmm/common/windowsExportCommon.h"
namespace OpenMM {
......@@ -40,7 +40,7 @@ class CudaContext;
* This Platform subclass uses CUDA implementations of the OpenMM kernels.
*/
class OPENMM_EXPORT_CUDA CudaPlatform : public Platform {
class OPENMM_EXPORT_COMMON CudaPlatform : public Platform {
public:
class PlatformData;
CudaPlatform();
......@@ -127,7 +127,7 @@ public:
}
};
class OPENMM_EXPORT_CUDA CudaPlatform::PlatformData {
class OPENMM_EXPORT_COMMON CudaPlatform::PlatformData {
public:
PlatformData(ContextImpl* context, const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
const std::string& cpuPmeProperty, const std::string& compilerProperty, const std::string& tempProperty, const std::string& hostCompilerProperty,
......
#ifndef OPENMM_CUDAPROGRAM_H_
#define OPENMM_CUDAPROGRAM_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/common/ComputeProgram.h"
#include "CudaContext.h"
namespace OpenMM {
/**
* This is the CUDA implementation of the ComputeProgramImpl interface.
* An instance pairs a CudaContext with a CUmodule and hands out
* ComputeKernel objects for kernels looked up by name.
*/
class CudaProgram : public ComputeProgramImpl {
public:
/**
* Create a new CudaProgram.
*
* @param context the context this program belongs to
* @param module the compiled module
*/
CudaProgram(CudaContext& context, CUmodule module);
/**
* Create a ComputeKernel for one of the kernels in this program.
*
* @param name the name of the kernel to get
* @return a ComputeKernel for the requested kernel
*/
ComputeKernel createKernel(const std::string& name);
private:
// Held by reference, so the context has to outlive this object.
// NOTE(review): presumably these are the two constructor arguments; the
// constructor definition is not part of this header - confirm.
CudaContext& context;
CUmodule module;
};
} // namespace OpenMM
#endif /*OPENMM_CUDAPROGRAM_H_*/
......@@ -28,7 +28,7 @@
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "windowsExportCuda.h"
#include "openmm/common/windowsExportCommon.h"
#include "CudaContext.h"
namespace OpenMM {
......@@ -66,7 +66,7 @@ namespace OpenMM {
* elements).
*/
class OPENMM_EXPORT_CUDA CudaSort {
class OPENMM_EXPORT_COMMON CudaSort {
public:
class SortTrait;
/**
......
#ifndef OPENMM_WINDOWSEXPORTCUDA_H_
#define OPENMM_WINDOWSEXPORTCUDA_H_
/*
 * Shared libraries are messy in Visual Studio. We have to distinguish three
 * cases:
 *   (1) this header is being used to build the OpenMM shared library
 *       (dllexport)
 *   (2) this header is being used by a *client* of the OpenMM shared
 *       library (dllimport)
 *   (3) we are building the OpenMM static library, or the client is
 *       being compiled with the expectation of linking with the
 *       OpenMM static library (nothing special needed)
 * In the CMake script for building this library, we define one of the symbols
 *     OPENMM_CUDA_BUILDING_{SHARED|STATIC}_LIBRARY
 * Client code normally has no special symbol defined, in which case we'll
 * assume it wants to use the shared library. However, if the client defines
 * the symbol OPENMM_USE_STATIC_LIBRARIES (or the CUDA-specific spelling
 * OPENMM_CUDA_USE_STATIC_LIBRARIES) we'll suppress the dllimport so
 * that the client code can be linked with static libraries. Note that
 * the client symbol is not library dependent, while the library symbols
 * affect only the OpenMM library, meaning that other libraries can
 * be clients of this one. However, we are assuming all-static or all-shared.
 */
#ifdef _MSC_VER
    // We don't want to hear about how sprintf is "unsafe".
    #pragma warning(disable:4996)
    // Keep MS VC++ quiet about lack of dll export of private members.
    #pragma warning(disable:4251)
    #if defined(OPENMM_CUDA_BUILDING_SHARED_LIBRARY)
        #define OPENMM_EXPORT_CUDA __declspec(dllexport)
    #elif defined(OPENMM_CUDA_BUILDING_STATIC_LIBRARY) || defined(OPENMM_CUDA_USE_STATIC_LIBRARIES) || defined(OPENMM_USE_STATIC_LIBRARIES)
        // Both client spellings are honoured: the library-independent symbol
        // described above and the CUDA-specific one that was checked before.
        #define OPENMM_EXPORT_CUDA
    #else
        #define OPENMM_EXPORT_CUDA __declspec(dllimport)   // i.e., a client of a shared library
    #endif
#else
    #define OPENMM_EXPORT_CUDA // Linux, Mac
#endif
#endif // OPENMM_WINDOWSEXPORTCUDA_H_
......@@ -4,17 +4,18 @@
INCLUDE(FindCUDA)
INCLUDE_DIRECTORIES(${CUDA_TOOLKIT_INCLUDE})
FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
FILE(GLOB CUDA_KERNELS ${KERNEL_SOURCE_DIR}/kernels/*.cu)
ADD_CUSTOM_COMMAND(OUTPUT ${KERNELS_CPP} ${KERNELS_H}
COMMAND ${CMAKE_COMMAND}
ARGS -D CUDA_SOURCE_DIR=${CUDA_SOURCE_DIR} -D CUDA_KERNELS_CPP=${CUDA_KERNELS_CPP} -D CUDA_KERNELS_H=${CUDA_KERNELS_H} -D CUDA_SOURCE_CLASS=${CUDA_SOURCE_CLASS} -P ${CMAKE_CURRENT_SOURCE_DIR}/../EncodeCUDAFiles.cmake
ARGS -D KERNEL_SOURCE_DIR=${KERNEL_SOURCE_DIR} -D KERNELS_CPP=${KERNELS_CPP} -D KERNELS_H=${KERNELS_H} -D KERNEL_SOURCE_CLASS=${KERNEL_SOURCE_CLASS} -D KERNEL_FILE_EXTENSION=cu -P ${CMAKE_SOURCE_DIR}/cmake_modules/EncodeKernelFiles.cmake
DEPENDS ${CUDA_KERNELS}
)
SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE)
SET_SOURCE_FILES_PROPERTIES(${KERNELS_CPP} ${KERNELS_H} ${COMMON_KERNELS_CPP} PROPERTIES GENERATED TRUE)
ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
ADD_DEPENDENCIES(${SHARED_TARGET} CommonKernels)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${CUDA_CUDA_LIBRARY} ${CUDA_cufft_LIBRARY} ${PTHREADS_LIB})
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CUDA_BUILDING_SHARED_LIBRARY")
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_COMMON_BUILDING_SHARED_LIBRARY")
IF (APPLE)
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
ELSE (APPLE)
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012-2018 Stanford University and the Authors. *
* Portions copyright (c) 2012-2019 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -51,10 +51,10 @@ CudaArray::~CudaArray() {
}
}
void CudaArray::initialize(CudaContext& context, int size, int elementSize, const std::string& name) {
void CudaArray::initialize(ComputeContext& context, int size, int elementSize, const std::string& name) {
if (this->pointer != 0)
throw OpenMMException("CudaArray has already been initialized");
this->context = &context;
this->context = &dynamic_cast<CudaContext&>(context);
this->size = size;
this->elementSize = elementSize;
this->name = name;
......@@ -82,6 +82,10 @@ void CudaArray::resize(int size) {
initialize(*context, size, elementSize, name);
}
/**
 * Get the context this array was bound to by initialize().
 */
ComputeContext& CudaArray::getContext() {
    auto& owner = *context;
    return owner;
}
void CudaArray::upload(const void* data, bool blocking) {
if (pointer == 0)
throw OpenMMException("CudaArray has not been initialized");
......@@ -112,12 +116,13 @@ void CudaArray::download(void* data, bool blocking) const {
}
}
void CudaArray::copyTo(CudaArray& dest) const {
void CudaArray::copyTo(ArrayInterface& dest) const {
if (pointer == 0)
throw OpenMMException("CudaArray has not been initialized");
if (dest.getSize() != size || dest.getElementSize() != elementSize)
throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, context->getCurrentStream());
CudaArray& cuDest = context->unwrap(dest);
CUresult result = cuMemcpyDtoDAsync(cuDest.getDevicePointer(), pointer, size*elementSize, context->getCurrentStream());
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2018 Stanford University and the Authors. *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -25,6 +25,7 @@
* -------------------------------------------------------------------------- */
#include "CudaBondedUtilities.h"
#include "CudaContext.h"
#include "CudaExpressionUtilities.h"
#include "CudaKernelSources.h"
#include "openmm/OpenMMException.h"
......@@ -52,6 +53,10 @@ string CudaBondedUtilities::addArgument(CUdeviceptr data, const string& type) {
return "customArg"+context.intToString(arguments.size());
}
/**
 * Convenience overload: unwrap the array to its CudaArray, then register its
 * device pointer through the CUdeviceptr overload.
 *
 * @param data   the array to pass to the kernel
 * @param type   the type string forwarded unchanged to the CUdeviceptr overload
 * @return the value returned by the CUdeviceptr overload
 */
string CudaBondedUtilities::addArgument(ArrayInterface& data, const string& type) {
    CUdeviceptr devicePointer = context.unwrap(data).getDevicePointer();
    return addArgument(devicePointer, type);
}
string CudaBondedUtilities::addEnergyParameterDerivative(const string& param) {
// See if the parameter has already been added.
......
......@@ -31,14 +31,14 @@
#include "CudaContext.h"
#include "CudaArray.h"
#include "CudaBondedUtilities.h"
#include "CudaForceInfo.h"
#include "CudaEvent.h"
#include "CudaIntegrationUtilities.h"
#include "CudaKernels.h"
#include "CudaKernelSources.h"
#include "CudaNonbondedUtilities.h"
#include "CudaProgram.h"
#include "openmm/common/ComputeArray.h"
#include "SHA1.h"
#include "hilbert.h"
#include "openmm/OpenMMException.h"
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
......@@ -106,9 +106,9 @@ static int executeInWindows(const string &command) {
#endif
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : system(system), currentStream(0),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasAssignedPosqCharges(false),
hasCompilerKernel(false), isNvccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData, CudaContext* originalContext) : ComputeContext(system), currentStream(0),
platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
hasCompilerKernel(false), isNvccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL) {
// Determine what compiler to use.
this->compiler = "\""+compiler+"\"";
......@@ -218,7 +218,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
}
int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
CHECK_RESULT(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
CHECK_RESULT(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
int numThreadBlocksPerComputeUnit = (major == 6 ? 4 : 6);
if (cudaDriverVersion < 7000) {
// This is a workaround to support GTX 980 with CUDA 6.5. It reports
......@@ -257,7 +258,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
if (computeCapability >= 7.0) {
if (cudaDriverVersion >= 9000) {
compilationDefines["SYNC_WARPS"] = "__syncwarp();";
compilationDefines["SHFL(var, srcLane)"] = "__shfl_sync(0xffffffff, var, srcLane);";
compilationDefines["BALLOT(var)"] = "__ballot_sync(0xffffffff, var);";
......@@ -300,7 +301,8 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["make_mixed3"] = "make_float3";
compilationDefines["make_mixed4"] = "make_float4";
}
posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
force.initialize<long long>(*this, paddedNumAtoms*3, "force");
posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
atomIndex.resize(paddedNumAtoms);
for (int i = 0; i < paddedNumAtoms; ++i)
......@@ -396,10 +398,6 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
"pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
}
// Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread();
// Create utilities objects.
bonded = new CudaBondedUtilities(*this);
......@@ -428,8 +426,6 @@ CudaContext::~CudaContext() {
delete bonded;
if (nonbonded != NULL)
delete nonbonded;
if (thread != NULL)
delete thread;
string errorMessage = "Error deleting Context";
if (contextIsValid && !isLinkedContext) {
cuProfilerStop();
......@@ -469,7 +465,6 @@ void CudaContext::initialize() {
}
velm.upload(pinnedBuffer);
bonded->initialize(system);
force.initialize<long long>(*this, paddedNumAtoms*3, "force");
addAutoclearBuffer(force.getDevicePointer(), force.getSize()*force.getElementSize());
addAutoclearBuffer(energyBuffer.getDevicePointer(), energyBuffer.getSize()*energyBuffer.getElementSize());
int numEnergyParamDerivs = energyParamDerivNames.size();
......@@ -484,12 +479,8 @@ void CudaContext::initialize() {
nonbonded->initialize(system);
}
void CudaContext::addForce(CudaForceInfo* force) {
forces.push_back(force);
}
vector<CudaForceInfo*>& CudaContext::getForceInfos() {
return forces;
void CudaContext::initializeContexts() {
getPlatformData().initializeContexts(system);
}
void CudaContext::setAsCurrent() {
......@@ -497,38 +488,6 @@ void CudaContext::setAsCurrent() {
cuCtxSetCurrent(context);
}
/**
 * Replace whole-symbol occurrences of each key in a piece of source text.
 *
 * A match is only replaced when it is not embedded in a longer identifier,
 * i.e. the characters immediately before and after it (if any) are not
 * letters, digits or '_'. Keys are processed in the iteration order of the
 * map, each one against the result of the previous replacements.
 *
 * @param input         the text to process
 * @param replacements  maps each symbol to the text that should replace it;
 *                      entries with an empty key are ignored
 * @return a copy of the input with all replacements applied
 */
string CudaContext::replaceStrings(const string& input, const std::map<std::string, std::string>& replacements) const {
    // Stateless predicate instead of a lazily filled function-static set, so
    // concurrent callers share no mutable state.
    auto isSymbolChar = [](char c) {
        return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
    };
    string result = input;
    for (auto& pair : replacements) {
        const string& key = pair.first;
        const string& value = pair.second;
        if (key.empty())
            continue; // An empty key matches at every position and could never terminate.
        // Use the string's own index type: storing find() in an int truncates
        // positions and relies on npos surviving a round trip through int.
        string::size_type index = 0;
        while ((index = result.find(key, index)) != string::npos) {
            const string::size_type end = index+key.size();
            const bool freeBefore = (index == 0 || !isSymbolChar(result[index-1]));
            const bool freeAfter = (end == result.size() || !isSymbolChar(result[end]));
            if (freeBefore && freeAfter) {
                // We have found a complete symbol, not part of a longer symbol.
                result.replace(index, key.size(), value);
                index += value.size(); // Skip the inserted text so it is never rescanned.
            }
            else
                index++;
        }
    }
    return result;
}
/**
 * Compile a module without any extra preprocessor definitions. This simply
 * forwards to the overload that takes a map of defines, passing an empty map.
 */
CUmodule CudaContext::createModule(const string source, const char* optimizationFlags) {
    const map<string, string> noDefines;
    return createModule(source, noDefines, optimizationFlags);
}
......@@ -572,6 +531,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
src << "typedef float4 mixed4;\n";
}
src << "typedef unsigned int tileflags;\n";
src << CudaKernelSources::common << endl;
for (auto& pair : defines) {
src << "#define " << pair.first;
if (!pair.second.empty())
......@@ -716,19 +676,29 @@ void CudaContext::restoreDefaultStream() {
setCurrentStream(0);
}
string CudaContext::doubleToString(double value) const {
stringstream s;
s.precision(useDoublePrecision ? 16 : 8);
s << scientific << value;
if (!useDoublePrecision)
s << "f";
return s.str();
CudaArray* CudaContext::createArray() {
return new CudaArray();
}
string CudaContext::intToString(int value) const {
stringstream s;
s << value;
return s.str();
ComputeEvent CudaContext::createEvent() {
return shared_ptr<ComputeEventImpl>(new CudaEvent(*this));
}
/**
 * Compile source code into a ComputeProgram. The text of
 * CudaKernelSources::vectorOps is prepended to the source before it is handed
 * to createModule(), and the resulting module is wrapped in a CudaProgram.
 *
 * @param source   the source code to compile
 * @param defines  preprocessor definitions passed through to createModule()
 */
ComputeProgram CudaContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) {
    const std::string fullSource = CudaKernelSources::vectorOps+source;
    CUmodule compiled = createModule(fullSource, defines);
    return shared_ptr<ComputeProgramImpl>(new CudaProgram(*this, compiled));
}
/**
 * Convert a generic array reference to the CudaArray behind it. If the
 * argument is a ComputeArray wrapper, the array it wraps is examined instead
 * of the wrapper itself.
 *
 * @throws OpenMMException if the (unwrapped) array is not a CudaArray
 */
CudaArray& CudaContext::unwrap(ArrayInterface& array) const {
    ArrayInterface* target = &array;
    if (ComputeArray* wrapper = dynamic_cast<ComputeArray*>(target))
        target = &wrapper->getArray();
    CudaArray* cuarray = dynamic_cast<CudaArray*>(target);
    if (cuarray == NULL)
        throw OpenMMException("Array argument is not an CudaArray");
    return *cuarray;
}
std::string CudaContext::getErrorString(CUresult result) {
......@@ -763,8 +733,8 @@ int CudaContext::computeThreadBlockSize(double memory, bool preferShared) const
return threads;
}
void CudaContext::clearBuffer(CudaArray& array) {
clearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
void CudaContext::clearBuffer(ArrayInterface& array) {
clearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
}
void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
......@@ -773,8 +743,8 @@ void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
executeKernel(clearBufferKernel, args, words, 128);
}
void CudaContext::addAutoclearBuffer(CudaArray& array) {
addAutoclearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
void CudaContext::addAutoclearBuffer(ArrayInterface& array) {
addAutoclearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
}
void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
......@@ -855,523 +825,6 @@ bool CudaContext::requestPosqCharges() {
return allow;
}
/**
 * This class ensures that atom reordering doesn't break virtual sites.
 *
 * Every virtual site in the System becomes one particle group consisting of
 * the site itself followed by the particles it depends on. Two groups are
 * reported as identical only if the sites have the same dynamic type and the
 * same recorded weights.
 */
class CudaContext::VirtualSiteInfo : public CudaForceInfo {
public:
    VirtualSiteInfo(const System& system) {
        for (int i = 0; i < system.getNumParticles(); i++) {
            if (!system.isVirtualSite(i))
                continue;
            const VirtualSite& vsite = system.getVirtualSite(i);
            siteTypes.push_back(&typeid(vsite));

            // The group is the virtual site followed by the particles that define it.
            vector<int> particles;
            particles.push_back(i);
            for (int j = 0; j < vsite.getNumParticles(); j++)
                particles.push_back(vsite.getParticle(j));
            siteParticles.push_back(particles);

            // Record the weights of the site types handled here. Any other
            // site type is left with an empty weight list.
            vector<double> weights;
            if (const TwoParticleAverageSite* site = dynamic_cast<const TwoParticleAverageSite*>(&vsite)) {
                // A two particle average.
                weights.push_back(site->getWeight(0));
                weights.push_back(site->getWeight(1));
            }
            else if (const ThreeParticleAverageSite* site = dynamic_cast<const ThreeParticleAverageSite*>(&vsite)) {
                // A three particle average.
                weights.push_back(site->getWeight(0));
                weights.push_back(site->getWeight(1));
                weights.push_back(site->getWeight(2));
            }
            else if (const OutOfPlaneSite* site = dynamic_cast<const OutOfPlaneSite*>(&vsite)) {
                // An out of plane site.
                weights.push_back(site->getWeight12());
                weights.push_back(site->getWeight13());
                weights.push_back(site->getWeightCross());
            }
            siteWeights.push_back(weights);
        }
    }
    int getNumParticleGroups() {
        return siteTypes.size();
    }
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        particles = siteParticles[index];
    }
    bool areGroupsIdentical(int group1, int group2) {
        // Compare the type_info objects, not their addresses: the language
        // does not guarantee that typeid yields the same object every time
        // it is evaluated for the same type.
        if (*siteTypes[group1] != *siteTypes[group2])
            return false;
        // Same number of weights and every weight equal.
        return siteWeights[group1] == siteWeights[group2];
    }
private:
    vector<const type_info*> siteTypes;    // dynamic type of each virtual site
    vector<vector<int> > siteParticles;    // site index followed by its defining particles
    vector<vector<double> > siteWeights;   // recorded weights (possibly empty) per site
};
/**
 * Identify the molecules in the system and sort them into groups of identical
 * molecules, storing the result in moleculeGroups.
 *
 * On the first call (moleculeGroups is still empty) this also registers a
 * VirtualSiteInfo and builds the per-molecule description (atoms, constraints
 * and force groups) in `molecules`. Later calls reuse that description and
 * only redo the grouping.
 */
void CudaContext::findMoleculeGroups() {
    // The first time this is called, we need to identify all the molecules in the system.

    if (moleculeGroups.size() == 0) {
        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.

        addForce(new VirtualSiteInfo(system));

        // First make a list of every other atom to which each atom is connected by a constraint or force group.

        vector<vector<int> > atomBonds(system.getNumParticles());
        for (int i = 0; i < system.getNumConstraints(); i++) {
            int particle1, particle2;
            double distance;
            system.getConstraintParameters(i, particle1, particle2, distance);
            atomBonds[particle1].push_back(particle2);
            atomBonds[particle2].push_back(particle1);
        }
        for (auto force : forces) {
            for (int j = 0; j < force->getNumParticleGroups(); j++) {
                vector<int> particles;
                force->getParticlesInGroup(j, particles);
                for (int k = 0; k < (int) particles.size(); k++)
                    for (int m = 0; m < (int) particles.size(); m++)
                        if (k != m)
                            atomBonds[particles[k]].push_back(particles[m]);
            }
        }

        // Now identify atoms by which molecule they belong to.

        vector<vector<int> > atomIndices = ContextImpl::findMolecules(numAtoms, atomBonds);
        int numMolecules = atomIndices.size();
        vector<int> atomMolecule(numAtoms);
        for (int i = 0; i < (int) atomIndices.size(); i++)
            for (int j = 0; j < (int) atomIndices[i].size(); j++)
                atomMolecule[atomIndices[i][j]] = i;

        // Construct a description of each molecule.

        molecules.resize(numMolecules);
        for (int i = 0; i < numMolecules; i++) {
            molecules[i].atoms = atomIndices[i];
            molecules[i].groups.resize(forces.size());
        }
        for (int i = 0; i < system.getNumConstraints(); i++) {
            int particle1, particle2;
            double distance;
            system.getConstraintParameters(i, particle1, particle2, distance);
            molecules[atomMolecule[particle1]].constraints.push_back(i);
        }
        for (int i = 0; i < (int) forces.size(); i++)
            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
                vector<int> particles;
                forces[i]->getParticlesInGroup(j, particles);
                if (particles.size() > 0)
                    molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
            }
    }

    // Sort them into groups of identical molecules.

    vector<Molecule> uniqueMolecules;
    vector<vector<int> > moleculeInstances;
    vector<vector<int> > moleculeOffsets;
    for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
        Molecule& mol = molecules[molIndex];

        // See if it is identical to another molecule.

        bool isNew = true;
        for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
            Molecule& mol2 = uniqueMolecules[j];
            bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());

            // See if the atoms are identical.

            int atomOffset = mol2.atoms[0]-mol.atoms[0];
            for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
                if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
                    identical = false;
                for (int k = 0; k < (int) forces.size(); k++)
                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
                        identical = false;
            }

            // See if the constraints are identical.

            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
                int c1particle1, c1particle2, c2particle1, c2particle2;
                double distance1, distance2;
                system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
                system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
                if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
                    identical = false;
            }

            // See if the force groups are identical.

            for (int i = 0; i < (int) forces.size() && identical; i++) {
                if (mol.groups[i].size() != mol2.groups[i].size())
                    identical = false;
                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++) {
                    if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
                        identical = false;
                    vector<int> p1, p2;
                    forces[i]->getParticlesInGroup(mol.groups[i][k], p1);
                    forces[i]->getParticlesInGroup(mol2.groups[i][k], p2);
                    // Groups of one force need not all have the same number of
                    // particles, so reject a size mismatch before indexing p2,
                    // and stop as soon as the molecules are known to differ.
                    if (p1.size() != p2.size())
                        identical = false;
                    for (int m = 0; m < (int) p1.size() && identical; m++)
                        if (p1[m] != p2[m]-atomOffset)
                            identical = false;
                }
            }
            if (identical) {
                moleculeInstances[j].push_back(molIndex);
                moleculeOffsets[j].push_back(mol.atoms[0]);
                isNew = false;
            }
        }
        if (isNew) {
            uniqueMolecules.push_back(mol);
            moleculeInstances.push_back(vector<int>());
            moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
            moleculeOffsets.push_back(vector<int>());
            moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
        }
    }

    // Publish the groups. Atom indices are stored relative to the first atom
    // of the representative molecule, with one offset per instance.

    moleculeGroups.resize(moleculeInstances.size());
    for (int i = 0; i < (int) moleculeInstances.size(); i++)
    {
        moleculeGroups[i].instances = moleculeInstances[i];
        moleculeGroups[i].offsets = moleculeOffsets[i];
        vector<int>& atoms = uniqueMolecules[i].atoms;
        moleculeGroups[i].atoms.resize(atoms.size());
        for (int j = 0; j < (int) atoms.size(); j++)
            moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
    }
}
void CudaContext::invalidateMolecules() {
for (int i = 0; i < forces.size(); i++)
if (invalidateMolecules(forces[i]))
return;
}
/**
 * Check whether the molecules currently grouped as identical are still
 * identical as far as one force is concerned and, if not, rebuild the groups.
 *
 * When a mismatch is found the atoms are put back into their original order,
 * findMoleculeGroups() is rerun, the reorder listeners are executed and
 * reorderAtoms() is called.
 *
 * @param force  the force whose particles/groups should be rechecked
 * @return true if the groups were invalid and have been rebuilt; false if
 *         nothing had to change (or reordering does not apply: no atoms, no
 *         nonbonded utilities, or no cutoff in use)
 */
bool CudaContext::invalidateMolecules(CudaForceInfo* force) {
    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
        return false;
    bool valid = true;
    int forceIndex = -1;
    for (int i = 0; i < forces.size(); i++)
        if (forces[i] == force)
            forceIndex = i;
    // NOTE(review): `valid` is a plain bool shared by every invocation of the
    // lambda below; if the pool runs them concurrently these are
    // unsynchronized accesses - confirm whether it should be atomic.
    getPlatformData().threads.execute([&] (ThreadPool& threads, int threadIndex) {
        for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
            MoleculeGroup& mol = moleculeGroups[group];
            vector<int>& instances = mol.instances;
            vector<int>& offsets = mol.offsets;
            vector<int>& atoms = mol.atoms;
            int numMolecules = instances.size();
            Molecule& m1 = molecules[instances[0]];
            int offset1 = offsets[0];
            // Each thread checks its own slice of the instances; instance 0 is
            // the reference every other instance is compared against.
            int numThreads = threads.getNumThreads();
            int start = max(1, threadIndex*numMolecules/numThreads);
            int end = (threadIndex+1)*numMolecules/numThreads;
            for (int j = start; j < end; j++) {
                // See if the atoms are identical.

                Molecule& m2 = molecules[instances[j]];
                int offset2 = offsets[j];
                for (int i = 0; i < (int) atoms.size() && valid; i++) {
                    if (!force->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
                        valid = false;
                }

                // See if the force groups are identical.

                if (valid && forceIndex > -1) {
                    for (int k = 0; k < (int) m1.groups[forceIndex].size() && valid; k++)
                        if (!force->areGroupsIdentical(m1.groups[forceIndex][k], m2.groups[forceIndex][k]))
                            valid = false;
                }
            }
        }
    });
    getPlatformData().threads.waitForThreads();
    if (valid)
        return false;

    // The list of which molecules are identical is no longer valid.  We need to restore the
    // atoms to their original order, rebuild the list of identical molecules, and sort them
    // again.

    vector<int4> newCellOffsets(numAtoms);
    if (useDoublePrecision) {
        vector<double4> oldPosq(paddedNumAtoms);
        vector<double4> newPosq(paddedNumAtoms, make_double4(0, 0, 0, 0));
        vector<double4> oldVelm(paddedNumAtoms);
        vector<double4> newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0));
        posq.download(oldPosq);
        velm.download(oldVelm);
        for (int i = 0; i < numAtoms; i++) {
            int index = atomIndex[i];
            newPosq[index] = oldPosq[i];
            newVelm[index] = oldVelm[i];
            newCellOffsets[index] = posCellOffsets[i];
        }
        posq.upload(newPosq);
        velm.upload(newVelm);
    }
    else if (useMixedPrecision) {
        vector<float4> oldPosq(paddedNumAtoms);
        vector<float4> newPosq(paddedNumAtoms, make_float4(0, 0, 0, 0));
        vector<float4> oldPosqCorrection(paddedNumAtoms);
        vector<float4> newPosqCorrection(paddedNumAtoms, make_float4(0, 0, 0, 0));
        vector<double4> oldVelm(paddedNumAtoms);
        vector<double4> newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0));
        posq.download(oldPosq);
        // The correction must be fetched as well; otherwise the zero-filled
        // host vector is permuted and uploaded, wiping the correction stored
        // on the device.
        posqCorrection.download(oldPosqCorrection);
        velm.download(oldVelm);
        for (int i = 0; i < numAtoms; i++) {
            int index = atomIndex[i];
            newPosq[index] = oldPosq[i];
            newPosqCorrection[index] = oldPosqCorrection[i];
            newVelm[index] = oldVelm[i];
            newCellOffsets[index] = posCellOffsets[i];
        }
        posq.upload(newPosq);
        posqCorrection.upload(newPosqCorrection);
        velm.upload(newVelm);
    }
    else {
        vector<float4> oldPosq(paddedNumAtoms);
        vector<float4> newPosq(paddedNumAtoms, make_float4(0, 0, 0, 0));
        vector<float4> oldVelm(paddedNumAtoms);
        vector<float4> newVelm(paddedNumAtoms, make_float4(0, 0, 0, 0));
        posq.download(oldPosq);
        velm.download(oldVelm);
        for (int i = 0; i < numAtoms; i++) {
            int index = atomIndex[i];
            newPosq[index] = oldPosq[i];
            newVelm[index] = oldVelm[i];
            newCellOffsets[index] = posCellOffsets[i];
        }
        posq.upload(newPosq);
        velm.upload(newVelm);
    }

    // Atoms are back in their original order, so the index map is the identity.
    for (int i = 0; i < numAtoms; i++) {
        atomIndex[i] = i;
        posCellOffsets[i] = newCellOffsets[i];
    }
    atomIndexDevice.upload(atomIndex);
    findMoleculeGroups();
    for (auto listener : reorderListeners)
        listener->execute();
    reorderAtoms();
    return true;
}
/**
 * Reorder the atoms if reordering applies and is due.
 *
 * Nothing is reordered when there are no atoms, no nonbonded utilities, no
 * cutoff in use, or fewer than 250 calls have passed since the last reorder;
 * in that case only the call counter is advanced. atomsWereReordered records
 * whether this call performed a reorder.
 */
void CudaContext::reorderAtoms() {
    const bool reorderNow = (numAtoms != 0 && nonbonded != NULL && nonbonded->getUseCutoff() && stepsSinceReorder >= 250);
    atomsWereReordered = reorderNow;
    if (!reorderNow) {
        stepsSinceReorder++;
        return;
    }
    stepsSinceReorder = 0;

    // Pick the implementation matching the precision mode in use.
    if (useDoublePrecision) {
        reorderAtomsImpl<double, double4, double, double4>();
        return;
    }
    if (useMixedPrecision) {
        reorderAtomsImpl<float, float4, double, double4>();
        return;
    }
    reorderAtomsImpl<float, float4, float, float4>();
}
template <class Real, class Real4, class Mixed, class Mixed4>
void CudaContext::reorderAtomsImpl() {
// Find the range of positions and the number of bins along each axis.
Real4 padding = {0, 0, 0, 0};
vector<Real4> oldPosq(paddedNumAtoms, padding);
vector<Real4> oldPosqCorrection(paddedNumAtoms, padding);
Mixed4 paddingMixed = {0, 0, 0, 0};
vector<Mixed4> oldVelm(paddedNumAtoms, paddingMixed);
posq.download(oldPosq);
velm.download(oldVelm);
if (useMixedPrecision)
posqCorrection.download(oldPosqCorrection);
Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
if (nonbonded->getUsePeriodic()) {
minx = miny = minz = 0.0;
maxx = periodicBoxSize.x;
maxy = periodicBoxSize.y;
maxz = periodicBoxSize.z;
}
else {
for (int i = 1; i < numAtoms; i++) {
const Real4& pos = oldPosq[i];
minx = min(minx, pos.x);
maxx = max(maxx, pos.x);
miny = min(miny, pos.y);
maxy = max(maxy, pos.y);
minz = min(minz, pos.z);
maxz = max(maxz, pos.z);
}
}
// Loop over each group of identical molecules and reorder them.
vector<int> originalIndex(numAtoms);
vector<Real4> newPosq(paddedNumAtoms);
vector<Real4> newPosqCorrection(paddedNumAtoms);
vector<Mixed4> newVelm(paddedNumAtoms);
vector<int4> newCellOffsets(numAtoms);
for (auto& mol : moleculeGroups) {
// Find the center of each molecule.
int numMolecules = mol.offsets.size();
vector<int>& atoms = mol.atoms;
vector<Real4> molPos(numMolecules);
Real invNumAtoms = (Real) (1.0/atoms.size());
for (int i = 0; i < numMolecules; i++) {
molPos[i].x = 0.0f;
molPos[i].y = 0.0f;
molPos[i].z = 0.0f;
for (int j = 0; j < (int)atoms.size(); j++) {
int atom = atoms[j]+mol.offsets[i];
const Real4& pos = oldPosq[atom];
molPos[i].x += pos.x;
molPos[i].y += pos.y;
molPos[i].z += pos.z;
}
molPos[i].x *= invNumAtoms;
molPos[i].y *= invNumAtoms;
molPos[i].z *= invNumAtoms;
if (molPos[i].x != molPos[i].x)
throw OpenMMException("Particle coordinate is nan");
}
if (nonbonded->getUsePeriodic()) {
// Move each molecule position into the same box.
for (int i = 0; i < numMolecules; i++) {
Real4 center = molPos[i];
int zcell = (int) floor(center.z*invPeriodicBoxSize.z);
center.x -= zcell*periodicBoxVecZ.x;
center.y -= zcell*periodicBoxVecZ.y;
center.z -= zcell*periodicBoxVecZ.z;
int ycell = (int) floor(center.y*invPeriodicBoxSize.y);
center.x -= ycell*periodicBoxVecY.x;
center.y -= ycell*periodicBoxVecY.y;
int xcell = (int) floor(center.x*invPeriodicBoxSize.x);
center.x -= xcell*periodicBoxVecX.x;
if (xcell != 0 || ycell != 0 || zcell != 0) {
Real dx = molPos[i].x-center.x;
Real dy = molPos[i].y-center.y;
Real dz = molPos[i].z-center.z;
molPos[i] = center;
for (int j = 0; j < (int) atoms.size(); j++) {
int atom = atoms[j]+mol.offsets[i];
Real4 p = oldPosq[atom];
p.x -= dx;
p.y -= dy;
p.z -= dz;
oldPosq[atom] = p;
posCellOffsets[atom].x -= xcell;
posCellOffsets[atom].y -= ycell;
posCellOffsets[atom].z -= zcell;
}
}
}
}
// Select a bin for each molecule, then sort them by bin.
bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
Real binWidth;
if (useHilbert)
binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
else
binWidth = (Real) (0.2*nonbonded->getMaxCutoffDistance());
Real invBinWidth = (Real) (1.0/binWidth);
int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
vector<pair<int, int> > molBins(numMolecules);
bitmask_t coords[3];
for (int i = 0; i < numMolecules; i++) {
int x = (int) ((molPos[i].x-minx)*invBinWidth);
int y = (int) ((molPos[i].y-miny)*invBinWidth);
int z = (int) ((molPos[i].z-minz)*invBinWidth);
int bin;
if (useHilbert) {
coords[0] = x;
coords[1] = y;
coords[2] = z;
bin = (int) hilbert_c2i(3, 8, coords);
}
else {
int yodd = y&1;
int zodd = z&1;
bin = z*xbins*ybins;
bin += (zodd ? ybins-y : y)*xbins;
bin += (yodd ? xbins-x : x);
}
molBins[i] = pair<int, int>(bin, i);
}
sort(molBins.begin(), molBins.end());
// Reorder the atoms.
for (int i = 0; i < numMolecules; i++) {
for (int atom : atoms) {
int oldIndex = mol.offsets[molBins[i].second]+atom;
int newIndex = mol.offsets[i]+atom;
originalIndex[newIndex] = atomIndex[oldIndex];
newPosq[newIndex] = oldPosq[oldIndex];
if (useMixedPrecision)
newPosqCorrection[newIndex] = oldPosqCorrection[oldIndex];
newVelm[newIndex] = oldVelm[oldIndex];
newCellOffsets[newIndex] = posCellOffsets[oldIndex];
}
}
}
// Update the streams.
for (int i = 0; i < numAtoms; i++) {
atomIndex[i] = originalIndex[i];
posCellOffsets[i] = newCellOffsets[i];
}
posq.upload(newPosq);
if (useMixedPrecision)
posqCorrection.upload(newPosqCorrection);
velm.upload(newVelm);
atomIndexDevice.upload(atomIndex);
for (auto listener : reorderListeners)
listener->execute();
}
/**
 * Register a listener by appending it to reorderListeners.
 *
 * The atom reordering code in this file calls execute() on every entry of
 * reorderListeners after it has uploaded the reordered arrays.
 *
 * @param listener  the listener to append
 */
// NOTE(review): ownership of the pointer is not visible here -- confirm who deletes it.
void CudaContext::addReorderListener(ReorderListener* listener) {
reorderListeners.push_back(listener);
}
/**
 * Register a pre-computation by appending it to preComputations.
 *
 * @param computation  the computation object to append
 */
// NOTE(review): presumably these run before each force evaluation; the code that
// consumes preComputations is outside this chunk -- confirm.
void CudaContext::addPreComputation(ForcePreComputation* computation) {
preComputations.push_back(computation);
}
/**
 * Register a post-computation by appending it to postComputations.
 *
 * @param computation  the computation object to append
 */
// NOTE(review): presumably these run after each force evaluation; the code that
// consumes postComputations is outside this chunk -- confirm.
void CudaContext::addPostComputation(ForcePostComputation* computation) {
postComputations.push_back(computation);
}
void CudaContext::addEnergyParameterDerivative(const string& param) {
// See if this parameter has already been registered.
......@@ -1381,90 +834,10 @@ void CudaContext::addEnergyParameterDerivative(const string& param) {
energyParamDerivNames.push_back(param);
}
/**
 * Bundle of references handed to the worker thread's entry point.  Every field
 * aliases state owned by the WorkThread that created it, so the worker and the
 * owning object operate on the same queue, flags, mutex and condition variables.
 */
struct CudaContext::WorkThread::ThreadData {
    // Queue of pending tasks shared with the owning WorkThread.
    std::queue<CudaContext::WorkTask*>& tasks;
    // Set to true by the worker when it finds the queue empty.
    bool& waiting;
    // Set to true by the owner to ask the worker to exit.
    bool& finished;
    // Mutex protecting the queue and the two flags.
    pthread_mutex_t& queueLock;
    // Signalled when a task is added or shutdown is requested.
    pthread_cond_t& waitForTaskCondition;
    // Signalled by the worker when it has no more work.
    pthread_cond_t& queueEmptyCondition;

    ThreadData(std::queue<CudaContext::WorkTask*>& taskQueue,
               bool& waitingFlag,
               bool& finishedFlag,
               pthread_mutex_t& lock,
               pthread_cond_t& taskAvailable,
               pthread_cond_t& queueDrained)
        : tasks(taskQueue),
          waiting(waitingFlag),
          finished(finishedFlag),
          queueLock(lock),
          waitForTaskCondition(taskAvailable),
          queueEmptyCondition(queueDrained) {
    }
};
/**
 * Entry point of the worker thread created by CudaContext::WorkThread.
 *
 * Repeatedly removes the task at the front of the queue, executes it and deletes
 * it.  When the queue is empty the thread marks itself as waiting, signals
 * queueEmptyCondition, and sleeps on waitForTaskCondition.  The loop ends once
 * `finished` has been set and the queue has been drained.
 *
 * All accesses to the shared queue and to the `waiting` / `finished` flags are made
 * while holding queueLock; the lock is released only while a task is executing.
 * (The previous version tested `finished` and `tasks.size()` in the outer loop
 * condition, and wrote the final `waiting = true`, without holding the lock, which
 * raced with addTask(), flush() and the destructor.)
 *
 * @param args  pointer to a heap-allocated ThreadData; this function takes
 *              ownership of it and deletes it before returning
 * @return always 0
 */
static void* threadBody(void* args) {
    CudaContext::WorkThread::ThreadData& data = *reinterpret_cast<CudaContext::WorkThread::ThreadData*>(args);
    pthread_mutex_lock(&data.queueLock);
    while (true) {
        // Sleep until there is something to do or we are told to shut down.
        while (data.tasks.empty() && !data.finished) {
            data.waiting = true;
            pthread_cond_signal(&data.queueEmptyCondition);
            pthread_cond_wait(&data.waitForTaskCondition, &data.queueLock);
        }
        if (data.tasks.empty())
            break; // finished was set and every queued task has been processed.
        data.waiting = false;
        CudaContext::WorkTask* task = data.tasks.front();
        data.tasks.pop();

        // Run the task without holding the lock so new tasks can be queued meanwhile.
        pthread_mutex_unlock(&data.queueLock);
        task->execute();
        delete task;
        pthread_mutex_lock(&data.queueLock);
    }
    // Wake anyone blocked in flush() before exiting.
    data.waiting = true;
    pthread_cond_signal(&data.queueEmptyCondition);
    pthread_mutex_unlock(&data.queueLock);
    delete &data;
    return 0;
}
/**
 * Create the synchronization primitives and start the worker thread.
 *
 * The thread runs threadBody() and is handed a heap-allocated ThreadData holding
 * references to this object's queue, flags, mutex and condition variables.  On
 * success the thread owns that ThreadData and deletes it when it exits.
 *
 * @throws OpenMMException if the worker thread cannot be created.  In that case the
 *         ThreadData and the primitives are released first, because the destructor
 *         will not run for a partially constructed object.
 */
CudaContext::WorkThread::WorkThread() : waiting(true), finished(false) {
    pthread_mutex_init(&queueLock, NULL);
    pthread_cond_init(&waitForTaskCondition, NULL);
    pthread_cond_init(&queueEmptyCondition, NULL);
    ThreadData* data = new ThreadData(tasks, waiting, finished, queueLock, waitForTaskCondition, queueEmptyCondition);
    if (pthread_create(&thread, NULL, threadBody, data) != 0) {
        // No thread was started, so nothing else will ever free these.
        delete data;
        pthread_cond_destroy(&queueEmptyCondition);
        pthread_cond_destroy(&waitForTaskCondition);
        pthread_mutex_destroy(&queueLock);
        throw OpenMMException("Error creating worker thread for CudaContext");
    }
}
/**
 * Shut down the worker thread and release the synchronization primitives.
 *
 * Sets `finished` under the queue lock, wakes the worker, joins it, and only then
 * destroys the mutex and condition variables.
 */
CudaContext::WorkThread::~WorkThread() {
// Set the shutdown flag while holding the lock, then wake the worker in case
// it is blocked in pthread_cond_wait on waitForTaskCondition.
pthread_mutex_lock(&queueLock);
finished = true;
pthread_cond_broadcast(&waitForTaskCondition);
pthread_mutex_unlock(&queueLock);
// Wait for the worker to return before destroying the primitives it uses.
pthread_join(thread, NULL);
pthread_mutex_destroy(&queueLock);
pthread_cond_destroy(&waitForTaskCondition);
pthread_cond_destroy(&queueEmptyCondition);
/**
 * Call cuStreamSynchronize() on the stream returned by getCurrentStream(), so the
 * calling host thread blocks until that call returns.
 */
void CudaContext::flushQueue() {
// NOTE(review): the CUresult returned by cuStreamSynchronize is discarded here, so
// a failure reported by the driver at this point is not surfaced to the caller.
cuStreamSynchronize(getCurrentStream());
}
/**
 * Append a task to the queue and wake the worker thread.
 *
 * The worker thread body in this file calls execute() on each task it removes from
 * the queue and then deletes it.
 *
 * @param task  the task to enqueue
 */
void CudaContext::WorkThread::addTask(CudaContext::WorkTask* task) {
pthread_mutex_lock(&queueLock);
tasks.push(task);
// Clear the flag immediately so that a flush() issued right after this call
// blocks until the worker has drained the queue again.
waiting = false;
pthread_cond_signal(&waitForTaskCondition);
pthread_mutex_unlock(&queueLock);
}
bool CudaContext::WorkThread::isWaiting() {
return waiting;
}
bool CudaContext::WorkThread::isFinished() {
return finished;
}
/**
 * Block the calling thread until the `waiting` flag is true.
 *
 * addTask() clears the flag when it queues a task, and the worker thread body sets
 * it and signals queueEmptyCondition when it finds the queue empty.
 */
void CudaContext::WorkThread::flush() {
pthread_mutex_lock(&queueLock);
// Re-test the flag after every wakeup; pthread_cond_wait releases queueLock while
// blocked and re-acquires it before returning.
while (!waiting)
pthread_cond_wait(&queueEmptyCondition, &queueLock);
pthread_mutex_unlock(&queueLock);
}
vector<int> CudaContext::getDevicePrecedence() {
int numDevices;
CUdevice thisDevice;
......@@ -1475,7 +848,8 @@ vector<int> CudaContext::getDevicePrecedence() {
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&thisDevice, i));
int major, minor, clock, multiprocessors, speed;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
CHECK_RESULT(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, thisDevice));
CHECK_RESULT(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, thisDevice));
if (major == 1 && minor < 2)
continue;
......
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaEvent.h"
#include "openmm/OpenMMException.h"
using namespace OpenMM;
/**
 * Create a CUDA event with timing disabled.
 *
 * @param context  the context this event is associated with
 * @throws OpenMMException if cuEventCreate() does not return CUDA_SUCCESS
 */
CudaEvent::CudaEvent(CudaContext& context) : context(context), eventCreated(false) {
    const CUresult status = cuEventCreate(&event, CU_EVENT_DISABLE_TIMING);
    // Remember whether there is an event for the destructor to release.
    eventCreated = (status == CUDA_SUCCESS);
    if (!eventCreated)
        throw OpenMMException("Error creating CUDA event:"+CudaContext::getErrorString(status));
}
/**
 * Destroy the underlying CUDA event, if one was successfully created.
 */
CudaEvent::~CudaEvent() {
    if (!eventCreated)
        return; // Nothing was created, so there is nothing to release.
    cuEventDestroy(event);
}
void CudaEvent::enqueue() {
cuEventRecord(event, 0);
}
void CudaEvent::wait() {
cuEventSynchronize(event);
}
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -25,18 +25,7 @@
* -------------------------------------------------------------------------- */
#include "CudaIntegrationUtilities.h"
#include "CudaArray.h"
#include "CudaKernelSources.h"
#include "openmm/internal/OSRngSeed.h"
#include "openmm/HarmonicAngleForce.h"
#include "openmm/VirtualSite.h"
#include "quern.h"
#include "CudaExpressionUtilities.h"
#include "ReferenceCCMAAlgorithm.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <map>
#include "CudaContext.h"
using namespace OpenMM;
using namespace std;
......@@ -45,539 +34,39 @@ using namespace std;
#define CHECK_RESULT2(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<prefix<<": "<<context.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
m<<prefix<<": "<<dynamic_cast<CudaContext&>(context).getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
/**
 * A candidate SHAKE cluster: one central atom plus up to three peripheral atoms,
 * all constrained to the central atom at the same distance and all having the same
 * inverse mass.
 *
 * Every member is given a defined value by both constructors.  Instances are stored
 * in and copied through a std::map, and copying indeterminate values is best
 * avoided.  The initializer lists follow the member declaration order.
 */
struct CudaIntegrationUtilities::ShakeCluster {
    int centralID;            // index of the central atom (-1 until assigned)
    int peripheralID[3];      // indices of the peripheral atoms; only the first `size` entries are meaningful
    int size;                 // number of peripheral atoms recorded so far
    bool valid;               // false once the cluster is known to be unusable
    double distance;          // constraint distance shared by all peripheral atoms
    double centralInvMass, peripheralInvMass;

    ShakeCluster() : centralID(-1), size(0), valid(true), distance(0.0), centralInvMass(0.0), peripheralInvMass(0.0) {
        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
    }

    ShakeCluster(int centralID, double invMass) : centralID(centralID), size(0), valid(true), distance(0.0), centralInvMass(invMass), peripheralInvMass(0.0) {
        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
    }

    /**
     * Add a peripheral atom.  The cluster is marked invalid instead if it already
     * holds three peripheral atoms, or if the new atom's distance or inverse mass
     * differs from the previously recorded one by more than a relative 1e-8.
     */
    void addAtom(int id, double dist, double invMass) {
        if (size == 3 || (size > 0 && abs(dist-distance)/distance > 1e-8) || (size > 0 && abs(invMass-peripheralInvMass)/peripheralInvMass > 1e-8))
            valid = false;
        else {
            peripheralID[size++] = id;
            distance = dist;
            peripheralInvMass = invMass;
        }
    }

    /**
     * Mark this cluster invalid, flag all of its atoms in invalidForShake, and
     * recursively invalidate any still-valid cluster whose central atom is one of
     * this cluster's peripheral atoms.
     */
    void markInvalid(map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
    {
        valid = false;
        invalidForShake[centralID] = true;
        for (int i = 0; i < size; i++) {
            invalidForShake[peripheralID[i]] = true;
            map<int, ShakeCluster>::iterator otherCluster = allClusters.find(peripheralID[i]);
            // The `valid` test also terminates the recursion on cyclic references.
            if (otherCluster != allClusters.end() && otherCluster->second.valid)
                otherCluster->second.markInvalid(allClusters, invalidForShake);
        }
    }
};
struct CudaIntegrationUtilities::ConstraintOrderer : public binary_function<int, int, bool> {
const vector<int>& atom1;
const vector<int>& atom2;
const vector<int>& constraints;
ConstraintOrderer(const vector<int>& atom1, const vector<int>& atom2, const vector<int>& constraints) : atom1(atom1), atom2(atom2), constraints(constraints) {
}
bool operator()(int x, int y) {
int ix = constraints[x];
int iy = constraints[y];
if (atom1[ix] != atom1[iy])
return atom1[ix] < atom1[iy];
return atom2[ix] < atom2[iy];
}
};
CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const System& system) : context(context),
randomPos(0), ccmaConvergedMemory(NULL) {
// Create workspace arrays.
lastStepSize = make_double2(0.0, 0.0);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
posDelta.initialize<double4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<double4> deltas(posDelta.getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
posDelta.upload(deltas);
stepSize.initialize<double2>(context, 1, "stepSize");
stepSize.upload(&lastStepSize);
}
else {
posDelta.initialize<float4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<float4> deltas(posDelta.getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f));
posDelta.upload(deltas);
stepSize.initialize<float2>(context, 1, "stepSize");
float2 lastStepSizeFloat = make_float2(0.0f, 0.0f);
stepSize.upload(&lastStepSizeFloat);
}
// Record the set of constraints and how many constraints each atom is involved in.
vector<int> atom1;
vector<int> atom2;
vector<double> distance;
vector<int> constraintCount(context.getNumAtoms(), 0);
for (int i = 0; i < system.getNumConstraints(); i++) {
int p1, p2;
double d;
system.getConstraintParameters(i, p1, p2, d);
if (system.getParticleMass(p1) != 0 || system.getParticleMass(p2) != 0) {
atom1.push_back(p1);
atom2.push_back(p2);
distance.push_back(d);
constraintCount[p1]++;
constraintCount[p2]++;
}
}
// Identify clusters of three atoms that can be treated with SETTLE. First, for every
// atom that might be part of such a cluster, make a list of the two other atoms it is
// connected to.
int numAtoms = system.getNumParticles();
vector<map<int, float> > settleConstraints(numAtoms);
for (int i = 0; i < (int)atom1.size(); i++) {
if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
settleConstraints[atom1[i]][atom2[i]] = (float) distance[i];
settleConstraints[atom2[i]][atom1[i]] = (float) distance[i];
}
}
// Now remove the ones that don't actually form closed loops of three atoms.
vector<int> settleClusters;
for (int i = 0; i < (int)settleConstraints.size(); i++) {
if (settleConstraints[i].size() == 2) {
int partner1 = settleConstraints[i].begin()->first;
int partner2 = (++settleConstraints[i].begin())->first;
if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
settleConstraints[i].clear();
else if (i < partner1 && i < partner2)
settleClusters.push_back(i);
}
else
settleConstraints[i].clear();
}
// Record the SETTLE clusters.
vector<bool> isShakeAtom(numAtoms, false);
if (settleClusters.size() > 0) {
vector<int4> atoms;
vector<float2> params;
for (int i = 0; i < (int) settleClusters.size(); i++) {
int atom1 = settleClusters[i];
int atom2 = settleConstraints[atom1].begin()->first;
int atom3 = (++settleConstraints[atom1].begin())->first;
float dist12 = settleConstraints[atom1].find(atom2)->second;
float dist13 = settleConstraints[atom1].find(atom3)->second;
float dist23 = settleConstraints[atom2].find(atom3)->second;
if (dist12 == dist13) {
// atom1 is the central atom
atoms.push_back(make_int4(atom1, atom2, atom3, 0));
params.push_back(make_float2(dist12, dist23));
}
else if (dist12 == dist23) {
// atom2 is the central atom
atoms.push_back(make_int4(atom2, atom1, atom3, 0));
params.push_back(make_float2(dist12, dist13));
}
else if (dist13 == dist23) {
// atom3 is the central atom
atoms.push_back(make_int4(atom3, atom1, atom2, 0));
params.push_back(make_float2(dist13, dist12));
}
else
continue; // We can't handle this with SETTLE
isShakeAtom[atom1] = true;
isShakeAtom[atom2] = true;
isShakeAtom[atom3] = true;
}
if (atoms.size() > 0) {
settleAtoms.initialize<int4>(context, atoms.size(), "settleAtoms");
settleParams.initialize<float2>(context, params.size(), "settleParams");
settleAtoms.upload(atoms);
settleParams.upload(params);
}
}
// Find clusters consisting of a central atom with up to three peripheral atoms.
map<int, ShakeCluster> clusters;
vector<bool> invalidForShake(numAtoms, false);
for (int i = 0; i < (int) atom1.size(); i++) {
if (isShakeAtom[atom1[i]])
continue; // This is being taken care of with SETTLE.
// Determine which is the central atom.
bool firstIsCentral;
if (constraintCount[atom1[i]] > 1)
firstIsCentral = true;
else if (constraintCount[atom2[i]] > 1)
firstIsCentral = false;
else if (atom1[i] < atom2[i])
firstIsCentral = true;
else
firstIsCentral = false;
int centralID, peripheralID;
if (firstIsCentral) {
centralID = atom1[i];
peripheralID = atom2[i];
}
else {
centralID = atom2[i];
peripheralID = atom1[i];
}
// Add it to the cluster.
if (clusters.find(centralID) == clusters.end()) {
clusters[centralID] = ShakeCluster(centralID, 1.0/system.getParticleMass(centralID));
}
ShakeCluster& cluster = clusters[centralID];
cluster.addAtom(peripheralID, distance[i], 1.0/system.getParticleMass(peripheralID));
if (constraintCount[peripheralID] != 1 || invalidForShake[atom1[i]] || invalidForShake[atom2[i]]) {
cluster.markInvalid(clusters, invalidForShake);
map<int, ShakeCluster>::iterator otherCluster = clusters.find(peripheralID);
if (otherCluster != clusters.end() && otherCluster->second.valid)
otherCluster->second.markInvalid(clusters, invalidForShake);
}
}
int validShakeClusters = 0;
for (map<int, ShakeCluster>::iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
ShakeCluster& cluster = iter->second;
if (cluster.valid) {
cluster.valid = !invalidForShake[cluster.centralID] && cluster.size == constraintCount[cluster.centralID];
for (int i = 0; i < cluster.size; i++)
if (invalidForShake[cluster.peripheralID[i]])
cluster.valid = false;
if (cluster.valid)
++validShakeClusters;
}
}
// Record the SHAKE clusters.
if (validShakeClusters > 0) {
vector<int4> atoms;
vector<float4> params;
int index = 0;
for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
const ShakeCluster& cluster = iter->second;
if (!cluster.valid)
continue;
atoms.push_back(make_int4(cluster.centralID, cluster.peripheralID[0], (cluster.size > 1 ? cluster.peripheralID[1] : -1), (cluster.size > 2 ? cluster.peripheralID[2] : -1)));
params.push_back(make_float4((float) cluster.centralInvMass, (float) (0.5/(cluster.centralInvMass+cluster.peripheralInvMass)), (float) (cluster.distance*cluster.distance), (float) cluster.peripheralInvMass));
isShakeAtom[cluster.centralID] = true;
isShakeAtom[cluster.peripheralID[0]] = true;
if (cluster.size > 1)
isShakeAtom[cluster.peripheralID[1]] = true;
if (cluster.size > 2)
isShakeAtom[cluster.peripheralID[2]] = true;
++index;
}
shakeAtoms.initialize<int4>(context, atoms.size(), "shakeAtoms");
shakeParams.initialize<float4>(context, params.size(), "shakeParams");
shakeAtoms.upload(atoms);
shakeParams.upload(params);
}
// Find connected constraints for CCMA.
vector<int> ccmaConstraints;
for (unsigned i = 0; i < atom1.size(); i++)
if (!isShakeAtom[atom1[i]])
ccmaConstraints.push_back(i);
// Record the connections between constraints.
int numCCMA = (int) ccmaConstraints.size();
if (numCCMA > 0) {
// Record information needed by ReferenceCCMAAlgorithm.
vector<pair<int, int> > refIndices(numCCMA);
vector<double> refDistance(numCCMA);
for (int i = 0; i < numCCMA; i++) {
int index = ccmaConstraints[i];
refIndices[i] = make_pair(atom1[index], atom2[index]);
refDistance[i] = distance[index];
}
vector<double> refMasses(numAtoms);
for (int i = 0; i < numAtoms; ++i)
refMasses[i] = system.getParticleMass(i);
// Look up angles for CCMA.
vector<ReferenceCCMAAlgorithm::AngleInfo> angles;
for (int i = 0; i < system.getNumForces(); i++) {
const HarmonicAngleForce* force = dynamic_cast<const HarmonicAngleForce*>(&system.getForce(i));
if (force != NULL) {
for (int j = 0; j < force->getNumAngles(); j++) {
int atom1, atom2, atom3;
double angle, k;
force->getAngleParameters(j, atom1, atom2, atom3, angle, k);
angles.push_back(ReferenceCCMAAlgorithm::AngleInfo(atom1, atom2, atom3, angle));
}
}
}
// Create a ReferenceCCMAAlgorithm. It will build and invert the constraint matrix for us.
ReferenceCCMAAlgorithm ccma(numAtoms, numCCMA, refIndices, refDistance, refMasses, angles, 0.1);
vector<vector<pair<int, double> > > matrix = ccma.getMatrix();
int maxRowElements = 0;
for (unsigned i = 0; i < matrix.size(); i++)
maxRowElements = max(maxRowElements, (int) matrix[i].size());
maxRowElements++;
// Build the list of constraints for each atom.
vector<vector<int> > atomConstraints(context.getNumAtoms());
for (int i = 0; i < numCCMA; i++) {
atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
}
int maxAtomConstraints = 0;
for (unsigned i = 0; i < atomConstraints.size(); i++)
maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
// Sort the constraints.
vector<int> constraintOrder(numCCMA);
for (int i = 0; i < numCCMA; ++i)
constraintOrder[i] = i;
sort(constraintOrder.begin(), constraintOrder.end(), ConstraintOrderer(atom1, atom2, ccmaConstraints));
vector<int> inverseOrder(numCCMA);
for (int i = 0; i < numCCMA; ++i)
inverseOrder[constraintOrder[i]] = i;
for (int i = 0; i < (int)matrix.size(); ++i)
for (int j = 0; j < (int)matrix[i].size(); ++j)
matrix[i][j].first = inverseOrder[matrix[i][j].first];
// Record the CCMA data structures.
ccmaAtoms.initialize<int2>(context, numCCMA, "CcmaAtoms");
ccmaAtomConstraints.initialize<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints.initialize<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaConstraintMatrixColumn.initialize<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged.initialize<int>(context, 2, "ccmaConverged");
CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const System& system) : IntegrationUtilities(context, system),
ccmaConvergedMemory(NULL) {
CHECK_RESULT2(cuEventCreate(&ccmaEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for CCMA");
CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
vector<int2> atomsVec(ccmaAtoms.getSize());
vector<int> atomConstraintsVec(ccmaAtomConstraints.getSize());
vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints.getSize());
vector<int> constraintMatrixColumnVec(ccmaConstraintMatrixColumn.getSize());
int elementSize = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
ccmaDistance.initialize(context, numCCMA, 4*elementSize, "CcmaDistance");
ccmaDelta1.initialize(context, numCCMA, elementSize, "CcmaDelta1");
ccmaDelta2.initialize(context, numCCMA, elementSize, "CcmaDelta2");
ccmaReducedMass.initialize(context, numCCMA, elementSize, "CcmaReducedMass");
ccmaConstraintMatrixValue.initialize(context, numCCMA*maxRowElements, elementSize, "ConstraintMatrixValue");
vector<double4> distanceVec(ccmaDistance.getSize());
vector<double> reducedMassVec(ccmaReducedMass.getSize());
vector<double> constraintMatrixValueVec(ccmaConstraintMatrixValue.getSize());
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
int c = ccmaConstraints[index];
atomsVec[i].x = atom1[c];
atomsVec[i].y = atom2[c];
distanceVec[i].w = distance[c];
reducedMassVec[i] = (0.5/(1.0/system.getParticleMass(atom1[c])+1.0/system.getParticleMass(atom2[c])));
for (unsigned int j = 0; j < matrix[index].size(); j++) {
constraintMatrixColumnVec[i+j*numCCMA] = matrix[index][j].first;
constraintMatrixValueVec[i+j*numCCMA] = matrix[index][j].second;
}
constraintMatrixColumnVec[i+matrix[index].size()*numCCMA] = numCCMA;
}
ccmaDistance.upload(distanceVec, true);
ccmaReducedMass.upload(reducedMassVec, true);
ccmaConstraintMatrixValue.upload(constraintMatrixValueVec, true);
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
numAtomConstraintsVec[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
atomConstraintsVec[i+j*numAtoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
}
ccmaAtoms.upload(atomsVec);
ccmaAtomConstraints.upload(atomConstraintsVec);
ccmaNumAtomConstraints.upload(numAtomConstraintsVec);
ccmaConstraintMatrixColumn.upload(constraintMatrixColumnVec);
}
// Build the list of virtual sites.
vector<int4> vsite2AvgAtomVec;
vector<double2> vsite2AvgWeightVec;
vector<int4> vsite3AvgAtomVec;
vector<double4> vsite3AvgWeightVec;
vector<int4> vsiteOutOfPlaneAtomVec;
vector<double4> vsiteOutOfPlaneWeightVec;
vector<int> vsiteLocalCoordsIndexVec;
vector<int> vsiteLocalCoordsAtomVec;
vector<int> vsiteLocalCoordsStartVec;
vector<double> vsiteLocalCoordsWeightVec;
vector<double4> vsiteLocalCoordsPosVec;
for (int i = 0; i < numAtoms; i++) {
if (system.isVirtualSite(i)) {
if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// A two particle average.
const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
vsite2AvgAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), 0));
vsite2AvgWeightVec.push_back(make_double2(site.getWeight(0), site.getWeight(1)));
}
else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// A three particle average.
const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
vsite3AvgAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
vsite3AvgWeightVec.push_back(make_double4(site.getWeight(0), site.getWeight(1), site.getWeight(2), 0.0));
}
else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
// An out of plane site.
const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
vsiteOutOfPlaneAtomVec.push_back(make_int4(i, site.getParticle(0), site.getParticle(1), site.getParticle(2)));
vsiteOutOfPlaneWeightVec.push_back(make_double4(site.getWeight12(), site.getWeight13(), site.getWeightCross(), 0.0));
}
else if (dynamic_cast<const LocalCoordinatesSite*>(&system.getVirtualSite(i)) != NULL) {
// A local coordinates site.
const LocalCoordinatesSite& site = dynamic_cast<const LocalCoordinatesSite&>(system.getVirtualSite(i));
int numParticles = site.getNumParticles();
vector<double> origin, x, y;
site.getOriginWeights(origin);
site.getXWeights(x);
site.getYWeights(y);
vsiteLocalCoordsIndexVec.push_back(i);
vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
for (int j = 0; j < numParticles; j++) {
vsiteLocalCoordsAtomVec.push_back(site.getParticle(j));
vsiteLocalCoordsWeightVec.push_back(origin[j]);
vsiteLocalCoordsWeightVec.push_back(x[j]);
vsiteLocalCoordsWeightVec.push_back(y[j]);
}
Vec3 pos = site.getLocalPosition();
vsiteLocalCoordsPosVec.push_back(make_double4(pos[0], pos[1], pos[2], 0.0));
}
}
}
vsiteLocalCoordsStartVec.push_back(vsiteLocalCoordsAtomVec.size());
int num2Avg = vsite2AvgAtomVec.size();
int num3Avg = vsite3AvgAtomVec.size();
int numOutOfPlane = vsiteOutOfPlaneAtomVec.size();
int numLocalCoords = vsiteLocalCoordsPosVec.size();
vsite2AvgAtoms.initialize<int4>(context, max(1, num2Avg), "vsite2AvgAtoms");
vsite3AvgAtoms.initialize<int4>(context, max(1, num3Avg), "vsite3AvgAtoms");
vsiteOutOfPlaneAtoms.initialize<int4>(context, max(1, numOutOfPlane), "vsiteOutOfPlaneAtoms");
vsiteLocalCoordsIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsIndexVec.size()), "vsiteLocalCoordsIndex");
vsiteLocalCoordsAtoms.initialize<int>(context, max(1, (int) vsiteLocalCoordsAtomVec.size()), "vsiteLocalCoordsAtoms");
vsiteLocalCoordsStartIndex.initialize<int>(context, max(1, (int) vsiteLocalCoordsStartVec.size()), "vsiteLocalCoordsStartIndex");
if (num2Avg > 0)
vsite2AvgAtoms.upload(vsite2AvgAtomVec);
if (num3Avg > 0)
vsite3AvgAtoms.upload(vsite3AvgAtomVec);
if (numOutOfPlane > 0)
vsiteOutOfPlaneAtoms.upload(vsiteOutOfPlaneAtomVec);
if (numLocalCoords > 0) {
vsiteLocalCoordsIndex.upload(vsiteLocalCoordsIndexVec);
vsiteLocalCoordsAtoms.upload(vsiteLocalCoordsAtomVec);
vsiteLocalCoordsStartIndex.upload(vsiteLocalCoordsStartVec);
}
int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
vsite2AvgWeights.initialize(context, max(1, num2Avg), 2*elementSize, "vsite2AvgWeights");
vsite3AvgWeights.initialize(context, max(1, num3Avg), 4*elementSize, "vsite3AvgWeights");
vsiteOutOfPlaneWeights.initialize(context, max(1, numOutOfPlane), 4*elementSize, "vsiteOutOfPlaneWeights");
vsiteLocalCoordsWeights.initialize(context, max(1, (int) vsiteLocalCoordsWeightVec.size()), elementSize, "vsiteLocalCoordsWeights");
vsiteLocalCoordsPos.initialize(context, max(1, (int) vsiteLocalCoordsPosVec.size()), 4*elementSize, "vsiteLocalCoordsPos");
if (num2Avg > 0)
vsite2AvgWeights.upload(vsite2AvgWeightVec, true);
if (num3Avg > 0)
vsite3AvgWeights.upload(vsite3AvgWeightVec, true);
if (numOutOfPlane > 0)
vsiteOutOfPlaneWeights.upload(vsiteOutOfPlaneWeightVec, true);
if (numLocalCoords > 0) {
vsiteLocalCoordsWeights.upload(vsiteLocalCoordsWeightVec, true);
vsiteLocalCoordsPos.upload(vsiteLocalCoordsPosVec, true);
}
// Create the kernels used by this class.
map<string, string> defines;
defines["NUM_CCMA_CONSTRAINTS"] = context.intToString(numCCMA);
defines["NUM_ATOMS"] = context.intToString(numAtoms);
defines["NUM_2_AVERAGE"] = context.intToString(num2Avg);
defines["NUM_3_AVERAGE"] = context.intToString(num3Avg);
defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
defines["NUM_LOCAL_COORDS"] = context.intToString(numLocalCoords);
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
CUmodule module = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::integrationUtilities, defines);
settlePosKernel = context.getKernel(module, "applySettleToPositions");
settleVelKernel = context.getKernel(module, "applySettleToVelocities");
shakePosKernel = context.getKernel(module, "applyShakeToPositions");
shakeVelKernel = context.getKernel(module, "applyShakeToVelocities");
ccmaDirectionsKernel = context.getKernel(module, "computeCCMAConstraintDirections");
ccmaPosForceKernel = context.getKernel(module, "computeCCMAPositionConstraintForce");
ccmaVelForceKernel = context.getKernel(module, "computeCCMAVelocityConstraintForce");
ccmaMultiplyKernel = context.getKernel(module, "multiplyByCCMAConstraintMatrix");
ccmaUpdateKernel = context.getKernel(module, "updateCCMAAtomPositions");
CHECK_RESULT2(cuEventCreate(&ccmaEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for CCMA");
vsitePositionKernel = context.getKernel(module, "computeVirtualSites");
vsiteForceKernel = context.getKernel(module, "distributeVirtualSiteForces");
numVsites = num2Avg+num3Avg+numOutOfPlane+numLocalCoords;
randomKernel = context.getKernel(module, "generateRandomNumbers");
timeShiftKernel = context.getKernel(module, "timeShiftVelocities");
}
CudaIntegrationUtilities::~CudaIntegrationUtilities() {
context.setAsCurrent();
if (ccmaConvergedMemory != NULL)
if (ccmaConvergedMemory != NULL) {
cuMemFreeHost(ccmaConvergedMemory);
}
void CudaIntegrationUtilities::setNextStepSize(double size) {
if (size != lastStepSize.x || size != lastStepSize.y) {
lastStepSize = make_double2(size, size);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
stepSize.upload(&lastStepSize);
else {
float2 lastStepSizeFloat = make_float2((float) size, (float) size);
stepSize.upload(&lastStepSizeFloat);
}
cuEventDestroy(ccmaEvent);
}
}
double CudaIntegrationUtilities::getLastStepSize() {
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
stepSize.download(&lastStepSize);
else {
float2 lastStepSizeFloat;
stepSize.download(&lastStepSizeFloat);
lastStepSize = make_double2(lastStepSizeFloat.x, lastStepSizeFloat.y);
}
return lastStepSize.y;
CudaArray& CudaIntegrationUtilities::getPosDelta() {
return dynamic_cast<CudaContext&>(context).unwrap(posDelta);
}
void CudaIntegrationUtilities::applyConstraints(double tol) {
applyConstraints(false, tol);
CudaArray& CudaIntegrationUtilities::getRandom() {
return dynamic_cast<CudaContext&>(context).unwrap(random);
}
void CudaIntegrationUtilities::applyVelocityConstraints(double tol) {
applyConstraints(true, tol);
CudaArray& CudaIntegrationUtilities::getStepSize() {
return dynamic_cast<CudaContext&>(context).unwrap(stepSize);
}
void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double tol) {
CUfunction settleKernel, shakeKernel, ccmaForceKernel;
void CudaIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, double tol) {
ComputeKernel settleKernel, shakeKernel, ccmaForceKernel;
if (constrainVelocities) {
settleKernel = settleVelKernel;
shakeKernel = shakeVelKernel;
......@@ -588,45 +77,39 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
shakeKernel = shakePosKernel;
ccmaForceKernel = ccmaPosForceKernel;
}
float floatTol = (float) tol;
void* tolPointer = (context.getUseDoublePrecision() || context.getUseMixedPrecision() ? (void*) &tol : (void*) &floatTol);
CUdeviceptr posCorrection = (context.getUseMixedPrecision() ? context.getPosqCorrection().getDevicePointer() : 0);
if (settleAtoms.isInitialized()) {
int numClusters = settleAtoms.getSize();
void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
&posDelta.getDevicePointer(), &context.getVelm().getDevicePointer(),
&settleAtoms.getDevicePointer(), &settleParams.getDevicePointer()};
context.executeKernel(settleKernel, args, settleAtoms.getSize());
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
settleKernel->setArg(1, tol);
else
settleKernel->setArg(1, (float) tol);
settleKernel->execute(settleAtoms.getSize());
}
if (shakeAtoms.isInitialized()) {
int numClusters = shakeAtoms.getSize();
void* args[] = {&numClusters, tolPointer, &context.getPosq().getDevicePointer(), &posCorrection,
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
&shakeAtoms.getDevicePointer(), &shakeParams.getDevicePointer()};
context.executeKernel(shakeKernel, args, shakeAtoms.getSize());
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
shakeKernel->setArg(1, tol);
else
shakeKernel->setArg(1, (float) tol);
shakeKernel->execute(shakeAtoms.getSize());
}
if (ccmaAtoms.isInitialized()) {
void* directionsArgs[] = {&ccmaAtoms.getDevicePointer(), &ccmaDistance.getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged.getDevicePointer()};
context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms.getSize());
int i;
void* forceArgs[] = {&ccmaAtoms.getDevicePointer(), &ccmaDistance.getDevicePointer(),
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
&ccmaReducedMass.getDevicePointer(), &ccmaDelta1.getDevicePointer(), &ccmaConverged.getDevicePointer(),
&ccmaConvergedDeviceMemory, tolPointer, &i};
void* multiplyArgs[] = {&ccmaDelta1.getDevicePointer(), &ccmaDelta2.getDevicePointer(),
&ccmaConstraintMatrixColumn.getDevicePointer(), &ccmaConstraintMatrixValue.getDevicePointer(), &ccmaConverged.getDevicePointer(), &i};
void* updateArgs[] = {&ccmaNumAtomConstraints.getDevicePointer(), &ccmaAtomConstraints.getDevicePointer(), &ccmaDistance.getDevicePointer(),
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta.getDevicePointer(),
&context.getVelm().getDevicePointer(), &ccmaDelta1.getDevicePointer(), &ccmaDelta2.getDevicePointer(),
&ccmaConverged.getDevicePointer(), &i};
ccmaForceKernel->setArg(6, ccmaConvergedDeviceMemory);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
ccmaForceKernel->setArg(7, tol);
else
ccmaForceKernel->setArg(7, (float) tol);
ccmaDirectionsKernel->execute(ccmaAtoms.getSize());
const int checkInterval = 4;
ccmaConvergedMemory[0] = 0;
for (i = 0; i < 150; i++) {
context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms.getSize());
ccmaUpdateKernel->setArg(3, constrainVelocities ? context.getVelm() : posDelta);
for (int i = 0; i < 150; i++) {
ccmaForceKernel->setArg(8, i);
ccmaForceKernel->execute(ccmaAtoms.getSize());
if ((i+1)%checkInterval == 0)
CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms.getSize());
context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
ccmaMultiplyKernel->setArg(5, i);
ccmaMultiplyKernel->execute(ccmaAtoms.getSize());
ccmaUpdateKernel->setArg(8, i);
ccmaUpdateKernel->execute(context.getNumAtoms());
if ((i+1)%checkInterval == 0) {
CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
if (ccmaConvergedMemory[0])
......@@ -636,142 +119,9 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
}
}
// Run vsitePositionKernel when the system contains at least one virtual site.
void CudaIntegrationUtilities::computeVirtualSites() {
    if (numVsites <= 0)
        return;

    // The correction buffer is only passed in mixed precision; otherwise the
    // kernel receives a null device pointer for that argument.
    CUdeviceptr posCorrection = 0;
    if (context.getUseMixedPrecision())
        posCorrection = context.getPosqCorrection().getDevicePointer();

    // Each entry is the address of the value handed to the kernel; the order
    // must stay exactly as the kernel expects it.
    void* args[] = {
        &context.getPosq().getDevicePointer(),
        &posCorrection,
        &vsite2AvgAtoms.getDevicePointer(),
        &vsite2AvgWeights.getDevicePointer(),
        &vsite3AvgAtoms.getDevicePointer(),
        &vsite3AvgWeights.getDevicePointer(),
        &vsiteOutOfPlaneAtoms.getDevicePointer(),
        &vsiteOutOfPlaneWeights.getDevicePointer(),
        &vsiteLocalCoordsIndex.getDevicePointer(),
        &vsiteLocalCoordsAtoms.getDevicePointer(),
        &vsiteLocalCoordsWeights.getDevicePointer(),
        &vsiteLocalCoordsPos.getDevicePointer(),
        &vsiteLocalCoordsStartIndex.getDevicePointer()
    };
    context.executeKernel(vsitePositionKernel, args, numVsites);
}
// Run vsiteForceKernel when the system contains at least one virtual site.
void CudaIntegrationUtilities::distributeForcesFromVirtualSites() {
    if (numVsites <= 0)
        return;

    // The correction buffer is only passed in mixed precision; otherwise the
    // kernel receives a null device pointer for that argument.
    CUdeviceptr posCorrection = 0;
    if (context.getUseMixedPrecision())
        posCorrection = context.getPosqCorrection().getDevicePointer();

    // Each entry is the address of the value handed to the kernel; the order
    // must stay exactly as the kernel expects it.
    void* args[] = {
        &context.getPosq().getDevicePointer(),
        &posCorrection,
        &context.getForce().getDevicePointer(),
        &vsite2AvgAtoms.getDevicePointer(),
        &vsite2AvgWeights.getDevicePointer(),
        &vsite3AvgAtoms.getDevicePointer(),
        &vsite3AvgWeights.getDevicePointer(),
        &vsiteOutOfPlaneAtoms.getDevicePointer(),
        &vsiteOutOfPlaneWeights.getDevicePointer(),
        &vsiteLocalCoordsIndex.getDevicePointer(),
        &vsiteLocalCoordsAtoms.getDevicePointer(),
        &vsiteLocalCoordsWeights.getDevicePointer(),
        &vsiteLocalCoordsPos.getDevicePointer(),
        &vsiteLocalCoordsStartIndex.getDevicePointer()
    };
    context.executeKernel(vsiteForceKernel, args, numVsites);
}
void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumberSeed) {
if (random.isInitialized()) {
if (randomNumberSeed != lastSeed)
throw OpenMMException("CudaIntegrationUtilities::initRandomNumberGenerator(): Requested two different values for the random number seed");
return;
}
// Create the random number arrays.
lastSeed = randomNumberSeed;
random.initialize<float4>(context, 4*context.getPaddedNumAtoms(), "random");
randomSeed.initialize<int4>(context, context.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
randomPos = random.getSize();
// Use a quick and dirty RNG to pick seeds for the real random number generator.
vector<int4> seed(randomSeed.getSize());
unsigned int r = randomNumberSeed;
if (r == 0) r = (unsigned int) osrngseed();
for (int i = 0; i < randomSeed.getSize(); i++) {
seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
}
randomSeed.upload(seed);
}
// Reserve numValues entries of the random array and return the index of the
// first reserved entry.
//
// If the unread part of the buffer is too small, the buffer is grown when
// necessary, refilled by randomKernel, and the reservation starts at index 0.
int CudaIntegrationUtilities::prepareRandomNumbers(int numValues) {
    const int firstIndex = randomPos;
    if (firstIndex+numValues > random.getSize()) {
        // Not enough unread values left: regenerate the whole buffer.
        if (numValues > random.getSize())
            random.resize(numValues);
        int size = random.getSize();
        void* args[] = {
            &size,
            &random.getDevicePointer(),
            &randomSeed.getDevicePointer()
        };
        context.executeKernel(randomKernel, args, random.getSize());
        randomPos = numValues;
        return 0;
    }

    // Enough values remain: hand out the next slice.
    randomPos = firstIndex+numValues;
    return firstIndex;
}
// Write the random number state to a checkpoint stream.
//
// Nothing is written when the random array has not been initialized.
// Otherwise the output is, in order: randomPos, the contents of random, and
// the contents of randomSeed, all as raw bytes.
void CudaIntegrationUtilities::createCheckpoint(ostream& stream) {
    if (random.isInitialized()) {
        stream.write(reinterpret_cast<const char*>(&randomPos), sizeof(int));

        vector<float4> values;
        random.download(values);
        stream.write(reinterpret_cast<const char*>(values.data()), sizeof(float4)*random.getSize());

        vector<int4> seeds;
        randomSeed.download(seeds);
        stream.write(reinterpret_cast<const char*>(seeds.data()), sizeof(int4)*randomSeed.getSize());
    }
}
// Restore the random number state from a checkpoint stream.
//
// Nothing is read when the random array has not been initialized. Otherwise
// the input is consumed in the order createCheckpoint() writes it:
// randomPos, the contents of random, then the contents of randomSeed.
void CudaIntegrationUtilities::loadCheckpoint(istream& stream) {
    if (random.isInitialized()) {
        stream.read(reinterpret_cast<char*>(&randomPos), sizeof(int));

        vector<float4> values(random.getSize());
        stream.read(reinterpret_cast<char*>(values.data()), sizeof(float4)*random.getSize());
        random.upload(values);

        vector<int4> seeds(randomSeed.getSize());
        stream.read(reinterpret_cast<char*>(seeds.data()), sizeof(int4)*randomSeed.getSize());
        randomSeed.upload(seeds);
    }
}
double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) {
int numParticles = context.getNumAtoms();
if (timeShift != 0) {
float timeShiftFloat = (float) timeShift;
void* timeShiftPtr = (context.getUseDoublePrecision() ? (void*) &timeShift : (void*) &timeShiftFloat);
// Copy the velocities into the posDelta array while we temporarily modify them.
context.getVelm().copyTo(posDelta);
// Apply the time shift.
void* args[] = {&context.getVelm().getDevicePointer(), &context.getForce().getDevicePointer(), timeShiftPtr};
context.executeKernel(timeShiftKernel, args, numParticles);
applyConstraints(true, 1e-4);
}
// Compute the kinetic energy.
double energy = 0.0;
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
vector<double4> velm;
context.getVelm().download(velm);
for (int i = 0; i < numParticles; i++) {
double4 v = velm[i];
if (v.w != 0)
energy += (v.x*v.x+v.y*v.y+v.z*v.z)/v.w;
}
}
else {
vector<float4> velm;
context.getVelm().download(velm);
for (int i = 0; i < numParticles; i++) {
float4 v = velm[i];
if (v.w != 0)
energy += (v.x*v.x+v.y*v.y+v.z*v.z)/v.w;
}
vsiteForceKernel->setArg(2, context.getLongForceBuffer());
vsiteForceKernel->execute(numVsites);
}
// Restore the velocities.
if (timeShift != 0)
posDelta.copyTo(context.getVelm());
return 0.5*energy;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaKernel.h"
#include "openmm/common/ComputeArray.h"
#include <cstring>
#include <vector>
using namespace OpenMM;
using namespace std;
// Construct a kernel wrapper that records the owning context, the CUfunction
// to launch and the kernel's name. No arguments are registered yet.
CudaKernel::CudaKernel(CudaContext& context, CUfunction kernel, const string& name)
        : context(context),
          kernel(kernel),
          name(name) {
}
// Return a copy of the name given to the constructor.
string CudaKernel::getName() const {
    return this->name;
}
// Launch the kernel with the currently registered arguments.
//
// threads and blockSize are forwarded unchanged to
// CudaContext::executeKernel().
void CudaKernel::execute(int threads, int blockSize) {
    const int count = arrayArgs.size();
    argPointers.resize(count);
    for (int slot = 0; slot < count; slot++) {
        // A slot without an array holds a primitive value (or the zeroed
        // placeholder created by addEmptyArg()); the kernel then gets the
        // address of that stored value instead of a device pointer.
        if (arrayArgs[slot] == NULL)
            argPointers[slot] = &primitiveArgs[slot];
        else
            argPointers[slot] = &arrayArgs[slot]->getDevicePointer();
    }
    context.executeKernel(kernel, argPointers.data(), threads, blockSize);
}
// Append a new argument slot and bind the given array to it.
void CudaKernel::addArrayArg(ArrayInterface& value) {
    addEmptyArg();
    // The slot just appended is the last one.
    const int slot = (int) arrayArgs.size()-1;
    setArrayArg(slot, value);
}
// Append a new argument slot and store a primitive value of `size` bytes,
// copied from `value`, in it.
void CudaKernel::addPrimitiveArg(const void* value, int size) {
    addEmptyArg();
    // The slot just appended is the last one.
    const int slot = (int) arrayArgs.size()-1;
    setPrimitiveArg(slot, value, size);
}
void CudaKernel::addEmptyArg() {
primitiveArgs.push_back(make_double4(0, 0, 0, 0));
arrayArgs.push_back(NULL);
}
void CudaKernel::setArrayArg(int index, ArrayInterface& value) {
arrayArgs[index] = &context.unwrap(value);
}
// Store a primitive value in an existing argument slot.
//
// index  position of a slot previously created by addArrayArg(),
//        addPrimitiveArg() or addEmptyArg()
// value  pointer to the bytes to copy
// size   number of bytes to copy; at most sizeof(double4), the capacity
//        of one slot
//
// Throws OpenMMException if index does not refer to an existing slot or if
// size does not fit in a slot. Any array previously bound to the slot is
// unbound.
void CudaKernel::setPrimitiveArg(int index, const void* value, int size) {
    // Without this check a bad index would make memcpy write outside the vector.
    if (index < 0 || index >= (int) arrayArgs.size())
        throw OpenMMException("Index out of range for kernel argument");
    // A negative size was already rejected through unsigned wrap-around in
    // the original comparison; the check is now spelled out.
    if (size < 0 || (size_t) size > sizeof(double4))
        throw OpenMMException("Unsupported value type for kernel argument");
    memcpy(&primitiveArgs[index], value, size);
    arrayArgs[index] = NULL;
}
......@@ -28,6 +28,7 @@
#include "CudaKernels.h"
#include "CudaParallelKernels.h"
#include "CudaPlatform.h"
#include "openmm/common/CommonKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
......@@ -77,64 +78,68 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
if (name == VirtualSitesKernel::Name())
return new CudaVirtualSitesKernel(name, platform, cu);
if (name == CalcHarmonicBondForceKernel::Name())
return new CudaCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomBondForceKernel::Name())
return new CudaCalcCustomBondForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcCustomBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcHarmonicAngleForceKernel::Name())
return new CudaCalcHarmonicAngleForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcHarmonicAngleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomAngleForceKernel::Name())
return new CudaCalcCustomAngleForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcCustomAngleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcPeriodicTorsionForceKernel::Name())
return new CudaCalcPeriodicTorsionForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcPeriodicTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcRBTorsionForceKernel::Name())
return new CudaCalcRBTorsionForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcRBTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCMAPTorsionForceKernel::Name())
return new CudaCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomTorsionForceKernel::Name())
return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcNonbondedForceKernel::Name())
return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomNonbondedForceKernel::Name())
return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
if (name == CalcGBSAOBCForceKernel::Name())
return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
return new CommonCalcGBSAOBCForceKernel(name, platform, cu);
if (name == CalcCustomGBForceKernel::Name())
return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomExternalForceKernel::Name())
return new CudaCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomHbondForceKernel::Name())
return new CudaCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCentroidBondForceKernel::Name())
return new CudaCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCompoundBondForceKernel::Name())
return new CudaCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCVForceKernel::Name())
return new CudaCalcCustomCVForceKernel(name, platform, cu);
if (name == CalcRMSDForceKernel::Name())
return new CudaCalcRMSDForceKernel(name, platform, cu);
return new CommonCalcRMSDForceKernel(name, platform, cu);
if (name == CalcCustomManyParticleForceKernel::Name())
return new CudaCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
return new CommonCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcGayBerneForceKernel::Name())
return new CudaCalcGayBerneForceKernel(name, platform, cu);
return new CommonCalcGayBerneForceKernel(name, platform, cu);
if (name == IntegrateVerletStepKernel::Name())
return new CudaIntegrateVerletStepKernel(name, platform, cu);
return new CommonIntegrateVerletStepKernel(name, platform, cu);
if (name == IntegrateLangevinStepKernel::Name())
return new CudaIntegrateLangevinStepKernel(name, platform, cu);
if (name == IntegrateBAOABStepKernel::Name())
return new CudaIntegrateBAOABStepKernel(name, platform, cu);
return new CommonIntegrateLangevinStepKernel(name, platform, cu);
if (name == IntegrateLangevinMiddleStepKernel::Name())
return new CommonIntegrateLangevinMiddleStepKernel(name, platform, cu);
if (name == IntegrateBrownianStepKernel::Name())
return new CudaIntegrateBrownianStepKernel(name, platform, cu);
return new CommonIntegrateBrownianStepKernel(name, platform, cu);
if (name == IntegrateVariableVerletStepKernel::Name())
return new CudaIntegrateVariableVerletStepKernel(name, platform, cu);
return new CommonIntegrateVariableVerletStepKernel(name, platform, cu);
if (name == IntegrateVariableLangevinStepKernel::Name())
return new CudaIntegrateVariableLangevinStepKernel(name, platform, cu);
return new CommonIntegrateVariableLangevinStepKernel(name, platform, cu);
if (name == IntegrateCustomStepKernel::Name())
return new CudaIntegrateCustomStepKernel(name, platform, cu);
return new CommonIntegrateCustomStepKernel(name, platform, cu);
if (name == ApplyAndersenThermostatKernel::Name())
return new CudaApplyAndersenThermostatKernel(name, platform, cu);
return new CommonApplyAndersenThermostatKernel(name, platform, cu);
if (name == NoseHooverChainKernel::Name())
return new CudaNoseHooverChainKernel(name, platform, cu);
if (name == IntegrateVelocityVerletStepKernel::Name())
return new CudaIntegrateVelocityVerletStepKernel(name, platform, cu);
if (name == ApplyMonteCarloBarostatKernel::Name())
return new CudaApplyMonteCarloBarostatKernel(name, platform, cu);
if (name == RemoveCMMotionKernel::Name())
return new CudaRemoveCMMotionKernel(name, platform, cu);
return new CommonRemoveCMMotionKernel(name, platform, cu);
throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
}
......@@ -27,7 +27,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "windowsExportCuda.h"
#include "openmm/common/windowsExportCommon.h"
#include <string>
namespace OpenMM {
......@@ -38,9 +38,9 @@ namespace OpenMM {
* kernels subfolder.
*/
class OPENMM_EXPORT_CUDA CudaKernelSources {
class OPENMM_EXPORT_COMMON CudaKernelSources {
public:
@CUDA_FILE_DECLARATIONS@
@KERNEL_FILE_DECLARATIONS@
};
} // namespace OpenMM
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -27,6 +27,7 @@
#include "openmm/OpenMMException.h"
#include "CudaNonbondedUtilities.h"
#include "CudaArray.h"
#include "CudaContext.h"
#include "CudaKernelSources.h"
#include "CudaExpressionUtilities.h"
#include "CudaSort.h"
......@@ -84,6 +85,10 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
cuEventDestroy(downloadCountEvent);
}
// Register an interaction without pair list support. This overload forwards
// to the eight-argument addInteraction() with supportsPairList set to false.
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
    const bool supportsPairList = false;
    addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, supportsPairList);
}
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) {
if (groupCutoff.size() > 0) {
if (usesCutoff != useCutoff)
......@@ -110,10 +115,20 @@ void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic,
}
}
// Append a per-atom parameter described by a ComputeParameterInfo. The
// description is converted to a ParameterInfo that carries the device
// pointer of the unwrapped array.
void CudaNonbondedUtilities::addParameter(ComputeParameterInfo parameter) {
    ParameterInfo converted(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
            parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer());
    parameters.push_back(converted);
}
// Append a copy of the given ParameterInfo to the list of parameters.
void CudaNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
    parameters.emplace_back(parameter);
}
// Append a kernel argument described by a ComputeParameterInfo. The
// description is converted to a ParameterInfo that carries the device
// pointer of the unwrapped array.
void CudaNonbondedUtilities::addArgument(ComputeParameterInfo parameter) {
    ParameterInfo converted(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
            parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer());
    arguments.push_back(converted);
}
// Append a copy of the given ParameterInfo to the list of arguments.
void CudaNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
    arguments.emplace_back(parameter);
}
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2018 Stanford University and the Authors. *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -200,7 +200,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
for (int i = 0; i < (int) data.contexts.size(); i++) {
data.contextEnergy[i] = 0.0;
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, interactionCounts[i]));
}
}
......@@ -208,7 +208,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, contextForces, valid, interactionCounts[i]));
}
data.syncContexts();
......@@ -255,7 +255,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
class CudaParallelCalcHarmonicBondForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcHarmonicBondForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcHarmonicBondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -264,7 +264,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcHarmonicBondForceKernel& kernel;
CommonCalcHarmonicBondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -272,7 +272,7 @@ private:
CudaParallelCalcHarmonicBondForceKernel::CudaParallelCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcHarmonicBondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcHarmonicBondForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcHarmonicBondForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
......@@ -283,7 +283,7 @@ void CudaParallelCalcHarmonicBondForceKernel::initialize(const System& system, c
double CudaParallelCalcHarmonicBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......@@ -296,7 +296,7 @@ void CudaParallelCalcHarmonicBondForceKernel::copyParametersToContext(ContextImp
class CudaParallelCalcCustomBondForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcCustomBondForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcCustomBondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -305,7 +305,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcCustomBondForceKernel& kernel;
CommonCalcCustomBondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -313,7 +313,7 @@ private:
CudaParallelCalcCustomBondForceKernel::CudaParallelCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcCustomBondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcCustomBondForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcCustomBondForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
......@@ -324,7 +324,7 @@ void CudaParallelCalcCustomBondForceKernel::initialize(const System& system, con
double CudaParallelCalcCustomBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......@@ -337,7 +337,7 @@ void CudaParallelCalcCustomBondForceKernel::copyParametersToContext(ContextImpl&
class CudaParallelCalcHarmonicAngleForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcHarmonicAngleForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcHarmonicAngleForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -346,7 +346,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcHarmonicAngleForceKernel& kernel;
CommonCalcHarmonicAngleForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -354,7 +354,7 @@ private:
CudaParallelCalcHarmonicAngleForceKernel::CudaParallelCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcHarmonicAngleForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcHarmonicAngleForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcHarmonicAngleForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
......@@ -365,7 +365,7 @@ void CudaParallelCalcHarmonicAngleForceKernel::initialize(const System& system,
double CudaParallelCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......@@ -378,7 +378,7 @@ void CudaParallelCalcHarmonicAngleForceKernel::copyParametersToContext(ContextIm
class CudaParallelCalcCustomAngleForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcCustomAngleForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcCustomAngleForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -387,7 +387,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcCustomAngleForceKernel& kernel;
CommonCalcCustomAngleForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -395,7 +395,7 @@ private:
CudaParallelCalcCustomAngleForceKernel::CudaParallelCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcCustomAngleForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcCustomAngleForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcCustomAngleForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
......@@ -406,7 +406,7 @@ void CudaParallelCalcCustomAngleForceKernel::initialize(const System& system, co
double CudaParallelCalcCustomAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......@@ -419,7 +419,7 @@ void CudaParallelCalcCustomAngleForceKernel::copyParametersToContext(ContextImpl
class CudaParallelCalcPeriodicTorsionForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcPeriodicTorsionForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcPeriodicTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -428,7 +428,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcPeriodicTorsionForceKernel& kernel;
CommonCalcPeriodicTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -436,7 +436,7 @@ private:
CudaParallelCalcPeriodicTorsionForceKernel::CudaParallelCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcPeriodicTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcPeriodicTorsionForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcPeriodicTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
......@@ -447,7 +447,7 @@ void CudaParallelCalcPeriodicTorsionForceKernel::initialize(const System& system
double CudaParallelCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......@@ -460,7 +460,7 @@ void CudaParallelCalcPeriodicTorsionForceKernel::copyParametersToContext(Context
class CudaParallelCalcRBTorsionForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcRBTorsionForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcRBTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -469,7 +469,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcRBTorsionForceKernel& kernel;
CommonCalcRBTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -477,7 +477,7 @@ private:
/**
 * Construct the multi-context wrapper for the RBTorsionForce kernel.  The loop runs once
 * per entry of data.contexts and appends a Kernel wrapping a newly allocated per-context
 * kernel object to `kernels`.
 */
CudaParallelCalcRBTorsionForceKernel::CudaParallelCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcRBTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
// NOTE(review): the two push_back lines below look like the "before" (CudaCalc...) and
// "after" (CommonCalc...) sides of a diff hunk rendered without +/- markers; as written
// only the first one is the loop body.  Confirm which one belongs in the real file.
kernels.push_back(Kernel(new CudaCalcRBTorsionForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcRBTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
......@@ -488,7 +488,7 @@ void CudaParallelCalcRBTorsionForceKernel::initialize(const System& system, cons
double CudaParallelCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......@@ -501,7 +501,7 @@ void CudaParallelCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl&
class CudaParallelCalcCMAPTorsionForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcCMAPTorsionForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcCMAPTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -510,7 +510,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcCMAPTorsionForceKernel& kernel;
CommonCalcCMAPTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -518,7 +518,7 @@ private:
/**
 * Construct the multi-context wrapper for the CMAPTorsionForce kernel.  The loop runs once
 * per entry of data.contexts and appends a Kernel wrapping a newly allocated per-context
 * kernel object to `kernels`.
 */
CudaParallelCalcCMAPTorsionForceKernel::CudaParallelCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcCMAPTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
// NOTE(review): the two push_back lines below look like the "before" (CudaCalc...) and
// "after" (CommonCalc...) sides of a diff hunk rendered without +/- markers; as written
// only the first one is the loop body.  Confirm which one belongs in the real file.
kernels.push_back(Kernel(new CudaCalcCMAPTorsionForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcCMAPTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
......@@ -529,7 +529,7 @@ void CudaParallelCalcCMAPTorsionForceKernel::initialize(const System& system, co
double CudaParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......@@ -542,7 +542,7 @@ void CudaParallelCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl
class CudaParallelCalcCustomTorsionForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcCustomTorsionForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcCustomTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -551,7 +551,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcCustomTorsionForceKernel& kernel;
CommonCalcCustomTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -559,7 +559,7 @@ private:
/**
 * Construct the multi-context wrapper for the CustomTorsionForce kernel.  The loop runs
 * once per entry of data.contexts and appends a Kernel wrapping a newly allocated
 * per-context kernel object to `kernels`.
 */
CudaParallelCalcCustomTorsionForceKernel::CudaParallelCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcCustomTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
// NOTE(review): the two push_back lines below look like the "before" (CudaCalc...) and
// "after" (CommonCalc...) sides of a diff hunk rendered without +/- markers; as written
// only the first one is the loop body.  Confirm which one belongs in the real file.
kernels.push_back(Kernel(new CudaCalcCustomTorsionForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcCustomTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
......@@ -570,7 +570,7 @@ void CudaParallelCalcCustomTorsionForceKernel::initialize(const System& system,
double CudaParallelCalcCustomTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......@@ -611,7 +611,7 @@ void CudaParallelCalcNonbondedForceKernel::initialize(const System& system, cons
double CudaParallelCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, includeDirect, includeReciprocal, data.contextEnergy[i]));
}
return 0.0;
......@@ -632,7 +632,7 @@ void CudaParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int
class CudaParallelCalcCustomNonbondedForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcCustomNonbondedForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcCustomNonbondedForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -641,7 +641,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcCustomNonbondedForceKernel& kernel;
CommonCalcCustomNonbondedForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -649,7 +649,7 @@ private:
/**
 * Construct the multi-context wrapper for the CustomNonbondedForce kernel.  The loop runs
 * once per entry of data.contexts and appends a Kernel wrapping a newly allocated
 * per-context kernel object to `kernels`.
 */
CudaParallelCalcCustomNonbondedForceKernel::CudaParallelCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcCustomNonbondedForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
// NOTE(review): the two push_back lines below look like the "before" (CudaCalc...) and
// "after" (CommonCalc...) sides of a diff hunk rendered without +/- markers; as written
// only the first one is the loop body.  Confirm which one belongs in the real file.
kernels.push_back(Kernel(new CudaCalcCustomNonbondedForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcCustomNonbondedForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
......@@ -660,7 +660,7 @@ void CudaParallelCalcCustomNonbondedForceKernel::initialize(const System& system
double CudaParallelCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......@@ -673,7 +673,7 @@ void CudaParallelCalcCustomNonbondedForceKernel::copyParametersToContext(Context
class CudaParallelCalcCustomExternalForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcCustomExternalForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcCustomExternalForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -682,7 +682,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcCustomExternalForceKernel& kernel;
CommonCalcCustomExternalForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -690,7 +690,7 @@ private:
/**
 * Construct the multi-context wrapper for the CustomExternalForce kernel.  The loop runs
 * once per entry of data.contexts and appends a Kernel wrapping a newly allocated
 * per-context kernel object to `kernels`.
 */
CudaParallelCalcCustomExternalForceKernel::CudaParallelCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcCustomExternalForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
// NOTE(review): the two push_back lines below look like the "before" (CudaCalc...) and
// "after" (CommonCalc...) sides of a diff hunk rendered without +/- markers; as written
// only the first one is the loop body.  Confirm which one belongs in the real file.
kernels.push_back(Kernel(new CudaCalcCustomExternalForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcCustomExternalForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
......@@ -701,7 +701,7 @@ void CudaParallelCalcCustomExternalForceKernel::initialize(const System& system,
double CudaParallelCalcCustomExternalForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......@@ -714,7 +714,7 @@ void CudaParallelCalcCustomExternalForceKernel::copyParametersToContext(ContextI
class CudaParallelCalcCustomHbondForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcCustomHbondForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcCustomHbondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -723,7 +723,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcCustomHbondForceKernel& kernel;
CommonCalcCustomHbondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -731,7 +731,7 @@ private:
/**
 * Construct the multi-context wrapper for the CustomHbondForce kernel.  The loop runs once
 * per entry of data.contexts and appends a Kernel wrapping a newly allocated per-context
 * kernel object to `kernels`.
 */
CudaParallelCalcCustomHbondForceKernel::CudaParallelCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcCustomHbondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
// NOTE(review): the two push_back lines below look like the "before" (CudaCalc...) and
// "after" (CommonCalc...) sides of a diff hunk rendered without +/- markers; as written
// only the first one is the loop body.  Confirm which one belongs in the real file.
kernels.push_back(Kernel(new CudaCalcCustomHbondForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcCustomHbondForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
......@@ -742,7 +742,7 @@ void CudaParallelCalcCustomHbondForceKernel::initialize(const System& system, co
double CudaParallelCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......@@ -755,7 +755,7 @@ void CudaParallelCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl
class CudaParallelCalcCustomCompoundBondForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcCustomCompoundBondForceKernel& kernel, bool includeForce,
Task(ContextImpl& context, CommonCalcCustomCompoundBondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
......@@ -764,7 +764,7 @@ public:
}
private:
ContextImpl& context;
CudaCalcCustomCompoundBondForceKernel& kernel;
CommonCalcCustomCompoundBondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
......@@ -772,7 +772,7 @@ private:
/**
 * Construct the multi-context wrapper for the CustomCompoundBondForce kernel.  The loop
 * runs once per entry of data.contexts and appends a Kernel wrapping a newly allocated
 * per-context kernel object to `kernels`.
 */
CudaParallelCalcCustomCompoundBondForceKernel::CudaParallelCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
CalcCustomCompoundBondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
// NOTE(review): the two push_back lines below look like the "before" (CudaCalc...) and
// "after" (CommonCalc...) sides of a diff hunk rendered without +/- markers; as written
// only the first one is the loop body.  Confirm which one belongs in the real file.
kernels.push_back(Kernel(new CudaCalcCustomCompoundBondForceKernel(name, platform, *data.contexts[i], system)));
kernels.push_back(Kernel(new CommonCalcCustomCompoundBondForceKernel(name, platform, *data.contexts[i], system)));
}
void CudaParallelCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) {
......@@ -783,7 +783,7 @@ void CudaParallelCalcCustomCompoundBondForceKernel::initialize(const System& sys
double CudaParallelCalcCustomCompoundBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -25,174 +25,12 @@
* -------------------------------------------------------------------------- */
#include "CudaParameterSet.h"
#include "openmm/OpenMMException.h"
#include <cmath>
#include <sstream>
using namespace OpenMM;
using namespace std;
/**
 * Evaluate a CUDA driver API call and throw an OpenMMException if it did not return
 * CUDA_SUCCESS.
 *
 * The macro relies on two names being in scope at the point of use:
 *   - errorMessage: a string that prefixes the exception text
 *   - context:      an object providing getErrorString(CUresult)
 *
 * The argument is evaluated exactly once and stored in a local, so a failing call is not
 * re-issued while the error message is being built.  The do { } while (0) wrapper makes the
 * expansion a single statement, so it is safe as the body of an unbraced if/else or loop;
 * callers must supply the trailing semicolon.
 */
#define CHECK_RESULT(result) \
    do { \
        CUresult checkedResult_ = (result); \
        if (checkedResult_ != CUDA_SUCCESS) { \
            std::stringstream m; \
            m<<errorMessage<<": "<<context.getErrorString(checkedResult_)<<" ("<<checkedResult_<<")"; \
            throw OpenMMException(m.str());\
        } \
    } while (0)
/**
 * Create a parameter set and allocate the device buffers that hold it.
 *
 * Each buffer stores one element per object, where an element has 1, 2 or 4 components of
 * type float or double.  Buffers are named "param1", "param2", ... in creation order.
 *
 * @param context             the context used for error reporting and validity checks
 * @param numParameters       the number of parameter values stored for each object
 * @param numObjects          the number of objects; each buffer holds this many elements
 * @param name                a name for the set, used in error messages
 * @param bufferPerParameter  if true, every parameter gets its own single-component buffer;
 *                            if false, parameters are packed into 4-component buffers, then
 *                            one 2-component buffer, then single-component buffers (three
 *                            left-over parameters share one 4-component buffer)
 * @param useDoublePrecision  if true elements are double, otherwise float
 * @throws OpenMMException if a device allocation fails
 */
CudaParameterSet::CudaParameterSet(CudaContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
        context(context), numParameters(numParameters), numObjects(numObjects), name(name) {
    int params = numParameters;
    int bufferCount = 0;
    elementSize = (useDoublePrecision ? sizeof(double) : sizeof(float));
    string elementType = (useDoublePrecision ? "double" : "float");
    CUdeviceptr pointer;
    string errorMessage = "Error creating parameter set "+name;
    try {
        if (!bufferPerParameter) {
            while (params > 2) {
                CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize*4));
                std::stringstream bufferName;
                bufferName << "param" << (++bufferCount);
                buffers.push_back(CudaNonbondedUtilities::ParameterInfo(bufferName.str(), elementType, 4, elementSize*4, pointer));
                params -= 4;
            }
            if (params > 1) {
                CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize*2));
                std::stringstream bufferName;
                bufferName << "param" << (++bufferCount);
                buffers.push_back(CudaNonbondedUtilities::ParameterInfo(bufferName.str(), elementType, 2, elementSize*2, pointer));
                params -= 2;
            }
        }
        while (params > 0) {
            CHECK_RESULT(cuMemAlloc(&pointer, numObjects*elementSize));
            std::stringstream bufferName;
            bufferName << "param" << (++bufferCount);
            buffers.push_back(CudaNonbondedUtilities::ParameterInfo(bufferName.str(), elementType, 1, elementSize, pointer));
            params--;
        }
    }
    catch (...) {
        // The destructor does not run when a constructor throws, so release the buffers
        // that were already allocated before propagating the original error.  Failures
        // from cuMemFree are deliberately ignored here so they cannot mask that error.
        if (context.getContextIsValid()) {
            for (int i = 0; i < (int) buffers.size(); i++)
                cuMemFree(buffers[i].getMemory());
        }
        buffers.clear();
        throw;
    }
}
/**
 * Release the device memory of every buffer in this set.  Nothing is freed when the
 * context reports that it is no longer valid.
 *
 * NOTE(review): CHECK_RESULT throws on failure, so this destructor can throw -- confirm
 * that is intended for the language standard this file is built with.
 */
CudaParameterSet::~CudaParameterSet() {
    if (!context.getContextIsValid())
        return;
    string errorMessage = "Error freeing device memory";
    for (auto& buffer : buffers)
        CHECK_RESULT(cuMemFree(buffer.getMemory()));
}
template <class T>
void CudaParameterSet::getParameterValues(vector<vector<T> >& values) {
if (sizeof(T) != elementSize)
throw OpenMMException("Called getParameterValues() with vector of wrong type");
values.resize(numObjects);
for (int i = 0; i < numObjects; i++)
values[i].resize(numParameters);
int base = 0;
string errorMessage = "Error downloading parameter set "+name;
for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getSize() == 4*elementSize) {
vector<T> data(4*numObjects);
CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
for (int j = 0; j < numObjects; j++) {
values[j][base] = data[4*j];
if (base+1 < numParameters)
values[j][base+1] = data[4*j+1];
if (base+2 < numParameters)
values[j][base+2] = data[4*j+2];
if (base+3 < numParameters)
values[j][base+3] = data[4*j+3];
}
base += 4;
}
else if (buffers[i].getSize() == 2*elementSize) {
vector<T> data(2*numObjects);
CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
for (int j = 0; j < numObjects; j++) {
values[j][base] = data[2*j];
if (base+1 < numParameters)
values[j][base+1] = data[2*j+1];
}
base += 2;
}
else if (buffers[i].getSize() == elementSize) {
vector<T> data(numObjects);
CHECK_RESULT(cuMemcpyDtoH(&data[0], buffers[i].getMemory(), numObjects*buffers[i].getSize()));
for (int j = 0; j < numObjects; j++)
values[j][base] = data[j];
base++;
}
else
throw OpenMMException("Internal error: Unknown buffer type in CudaParameterSet");
}
ComputeParameterSet(context, numParameters, numObjects, name, bufferPerParameter, useDoublePrecision) {
for (auto& info : getParameterInfos())
buffers.push_back(CudaNonbondedUtilities::ParameterInfo(info.getName(), info.getComponentType(), info.getNumComponents(), info.getSize(), context.unwrap(info.getArray()).getDevicePointer()));
}
template <class T>
void CudaParameterSet::setParameterValues(const vector<vector<T> >& values) {
if (sizeof(T) != elementSize)
throw OpenMMException("Called setParameterValues() with vector of wrong type");
int base = 0;
string errorMessage = "Error uploading parameter set "+name;
for (int i = 0; i < (int) buffers.size(); i++) {
if (buffers[i].getSize() == 4*elementSize) {
vector<T> data(4*numObjects);
for (int j = 0; j < numObjects; j++) {
data[4*j] = values[j][base];
if (base+1 < numParameters)
data[4*j+1] = values[j][base+1];
if (base+2 < numParameters)
data[4*j+2] = values[j][base+2];
if (base+3 < numParameters)
data[4*j+3] = values[j][base+3];
}
CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
base += 4;
}
else if (buffers[i].getSize() == 2*elementSize) {
vector<T> data(2*numObjects);
for (int j = 0; j < numObjects; j++) {
data[2*j] = values[j][base];
if (base+1 < numParameters)
data[2*j+1] = values[j][base+1];
}
CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
base += 2;
}
else if (buffers[i].getSize() == elementSize) {
vector<T> data(numObjects);
for (int j = 0; j < numObjects; j++)
data[j] = values[j][base];
CHECK_RESULT(cuMemcpyHtoD(buffers[i].getMemory(), &data[0], numObjects*buffers[i].getSize()));
base++;
}
else
throw OpenMMException("Internal error: Unknown buffer type in CudaParameterSet");
}
}
/**
 * Build the suffix that identifies one parameter inside the buffers of this set.
 *
 * The result is the 1-based number of the buffer holding the parameter, followed by
 * extraSuffix, followed by ".x", ".y", ".z" or ".w" when that buffer has more than one
 * component per element.
 *
 * @param index        the index of the parameter
 * @param extraSuffix  text inserted between the buffer number and the component selector
 * @return the suffix string
 * @throws OpenMMException if index is negative or lies beyond the last buffer
 */
string CudaParameterSet::getParameterSuffix(int index, const std::string& extraSuffix) const {
    const string suffixes[] = {".x", ".y", ".z", ".w"};
    int buffer = -1;
    // A negative index would otherwise be accepted by the first buffer and then used to
    // read before the start of `suffixes`; treat it as an illegal argument instead.
    if (index >= 0) {
        for (int i = 0; buffer == -1 && i < (int) buffers.size(); i++) {
            if (index*elementSize < buffers[i].getSize())
                buffer = i;
            else
                index -= buffers[i].getSize()/elementSize;
        }
    }
    if (buffer == -1)
        throw OpenMMException("Internal error: Illegal argument to CudaParameterSet::getParameterSuffix() ("+name+")");
    stringstream suffix;
    suffix << (buffer+1) << extraSuffix;
    // Single-component buffers are addressed directly, without a component selector.
    if (buffers[buffer].getSize() != elementSize)
        suffix << suffixes[index];
    return suffix.str();
}
/**
 * Explicit template instantiations for the float and double versions of
 * getParameterValues() and setParameterValues().  The templates are defined in this
 * source file, so these are the instantiations it emits; each one carries the
 * OPENMM_EXPORT_CUDA export macro.
 */
namespace OpenMM {
template OPENMM_EXPORT_CUDA void CudaParameterSet::getParameterValues<float>(vector<vector<float> >& values);
template OPENMM_EXPORT_CUDA void CudaParameterSet::setParameterValues<float>(const vector<vector<float> >& values);
template OPENMM_EXPORT_CUDA void CudaParameterSet::getParameterValues<double>(vector<vector<double> >& values);
template OPENMM_EXPORT_CUDA void CudaParameterSet::setParameterValues<double>(const vector<vector<double> >& values);
}
\ No newline at end of file
......@@ -51,12 +51,12 @@ using namespace std;
}
#ifdef OPENMM_CUDA_BUILDING_STATIC_LIBRARY
#ifdef OPENMM_COMMON_BUILDING_STATIC_LIBRARY
extern "C" void registerCudaPlatform() {
Platform::registerPlatform(new CudaPlatform());
}
#else
extern "C" OPENMM_EXPORT_CUDA void registerPlatforms() {
extern "C" OPENMM_EXPORT_COMMON void registerPlatforms() {
Platform::registerPlatform(new CudaPlatform());
}
#endif
......@@ -96,13 +96,15 @@ CudaPlatform::CudaPlatform() {
registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
registerKernelFactory(IntegrateVelocityVerletStepKernel::Name(), factory);
registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
registerKernelFactory(IntegrateBAOABStepKernel::Name(), factory);
registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory);
registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
registerKernelFactory(NoseHooverChainKernel::Name(), factory);
registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
platformProperties.push_back(CudaDeviceIndex());
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012 Stanford University and the Authors. *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -24,23 +24,16 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaForceInfo.h"
#include "CudaProgram.h"
#include "CudaKernel.h"
using namespace OpenMM;
using namespace std;
bool CudaForceInfo::areParticlesIdentical(int particle1, int particle2) {
return true;
/**
 * Create a CudaProgram that records the context and the CUmodule it was given.  No other
 * work is done here.
 */
CudaProgram::CudaProgram(CudaContext& context, CUmodule module) : context(context), module(module) {
}
/**
 * Get the number of particle groups.  This implementation always reports zero groups.
 */
int CudaForceInfo::getNumParticleGroups() {
return 0;
}
/**
 * Get the particles in a group.  This implementation does nothing and leaves `particles`
 * unmodified.
 */
void CudaForceInfo::getParticlesInGroup(int index, vector<int>& particles) {
return;
}
/**
 * Get whether two groups are identical.  This implementation always returns true,
 * regardless of the arguments.
 */
bool CudaForceInfo::areGroupsIdentical(int group1, int group2) {
return true;
}
/**
 * Create a kernel object for a function contained in this program's module.
 *
 * @param name  the name of the function to look up in the module
 * @return a ComputeKernel built from a shared pointer to a new CudaKernel
 */
ComputeKernel CudaProgram::createKernel(const string& name) {
    // Look the function up through the context, then wrap it for the common framework.
    CUfunction function = context.getKernel(module, name.c_str());
    return shared_ptr<ComputeKernelImpl>(new CudaKernel(context, function, name));
}
\ No newline at end of file
/**
 * This file contains CUDA definitions for the macros and functions needed for the
 * common compute framework.
 */
// Function and memory qualifiers.  KERNEL marks a kernel entry point and gives it C
// linkage; LOCAL maps to CUDA shared memory.  LOCAL_ARG and GLOBAL expand to nothing here.
#define KERNEL extern "C" __global__
#define DEVICE __device__
#define LOCAL __shared__
#define LOCAL_ARG
#define GLOBAL
#define RESTRICT __restrict__
// Thread indexing.  All of these use only the x dimension of the launch configuration.
#define LOCAL_ID threadIdx.x
#define LOCAL_SIZE blockDim.x
#define GLOBAL_ID (blockIdx.x*blockDim.x+threadIdx.x)
#define GLOBAL_SIZE (blockDim.x*gridDim.x)
#define GROUP_ID blockIdx.x
#define NUM_GROUPS gridDim.x
// Synchronization.  Note that both expansions already end in a semicolon, so they form a
// complete statement on their own; MEM_FENCE uses the block-level fence.
#define SYNC_THREADS __syncthreads();
#define MEM_FENCE __threadfence_block();
// Atomic addition, forwarded unchanged to CUDA's atomicAdd.
#define ATOMIC_ADD(dest, value) atomicAdd(dest, value)
// 64 bit integer type names.
typedef long long mm_long;
typedef unsigned long long mm_ulong;
// Capability flags, both defined as 1 here.
// NOTE(review): presumably tested with #if/#ifdef by shared kernel source -- confirm.
#define SUPPORTS_64_BIT_ATOMICS 1
#define SUPPORTS_DOUBLE_PRECISION 1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment