Commit 2e451b9d authored by Peter Eastman
Browse files

Deleted the old CUDA platform

parent 352e2fc7
#---------------------------------------------------
# OpenMM CUDA Platform
#
# Creates OpenMM library, base name=OpenMMCuda.
# Default libraries are shared & optimized. Variants
# are created for static (_static) and debug (_d).
#
# Windows:
# OpenMMCuda[_d].dll
# OpenMMCuda[_d].lib
# OpenMMCuda_static[_d].lib
# Unix:
# libOpenMMCuda[_d].so
# libOpenMMCuda_static[_d].a
#----------------------------------------------------
# Cache switch that decides whether the CUDA test cases are part of the build.
set(OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases")
if(OPENMM_BUILD_CUDA_TESTS)
    # The legacy subdirs() command is kept on purpose: the variables defined
    # further down in this file are set after this point.
    # NOTE(review): presumably tests/ relies on them and on subdirs() being
    # processed late -- confirm before switching to add_subdirectory().
    subdirs(tests)
endif()
# Every source subdirectory of this platform is handled from this CMakeLists
# file; CMake is not sent into them through separate per-directory scripts.
set(OPENMM_SOURCE_SUBDIRS .)

# Base name of the library and the target names derived from it.
set(OPENMMCUDA_LIBRARY_NAME OpenMMCuda)
set(SHARED_TARGET "${OPENMMCUDA_LIBRARY_NAME}")
set(STATIC_TARGET "${OPENMMCUDA_LIBRARY_NAME}_static")

# Debug libraries get "_d" appended to their names. With a Visual Studio
# generator the postfix variable takes care of it.
if(${CMAKE_GENERATOR} MATCHES "Visual Studio")
    set(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
endif()

# On Unix or Cygwin the suffix is appended to the target names by hand.
if(UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    set(SHARED_TARGET "${SHARED_TARGET}_d")
    set(STATIC_TARGET "${STATIC_TARGET}_d")
endif()
# Directories searched for the header files that make up the API: the
# include/ and include/internal/ directory of every source subdirectory.
set(API_INCLUDE_DIRS) # start empty
foreach(subdir ${OPENMM_SOURCE_SUBDIRS})
    list(APPEND API_INCLUDE_DIRS
        ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
        ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
endforeach()

# Two header lists are built from those directories:
#   API_ABS_INCLUDE_FILES - the full path names returned by the glob
#   API_REL_INCLUDE_FILES - "<include dir>/<file name>" for every header
# NOTE(review): the include directories above are absolute, so the entries of
# API_REL_INCLUDE_FILES are absolute as well -- confirm that is intended.
set(API_REL_INCLUDE_FILES) # start these out empty
set(API_ABS_INCLUDE_FILES)
foreach(dir ${API_INCLUDE_DIRS})
    file(GLOB fullpaths ${dir}/*.h) # returns full pathnames
    list(APPEND API_ABS_INCLUDE_FILES ${fullpaths})
    foreach(pathname ${fullpaths})
        get_filename_component(filename ${pathname} NAME)
        list(APPEND API_REL_INCLUDE_FILES ${dir}/${filename})
    endforeach()
endforeach()
# Gather the implementation files (*.cpp and *.c, searched recursively below
# src/) and the private headers (*.h directly inside src/) of every source
# subdirectory, and put each subdirectory's include/ on the header search path.
set(SOURCE_FILES) # empty
set(SOURCE_INCLUDE_FILES)
foreach(subdir ${OPENMM_SOURCE_SUBDIRS})
    file(GLOB_RECURSE src_files
        ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.c)
    file(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
    list(APPEND SOURCE_FILES ${src_files})
    list(APPEND SOURCE_INCLUDE_FILES ${incl_files})
    include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
endforeach()
include_directories(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)

# Disabled in the original script:
# SET(FINDCUDA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/cuda-cmake)

# The shared library itself is defined by the script in sharedTarget/.
subdirs(sharedTarget)
#ifndef OPENMM_CUDAKERNELFACTORY_H_
#define OPENMM_CUDAKERNELFACTORY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/KernelFactory.h"
#include "windowsExportCuda.h"
namespace OpenMM {
/**
* This KernelFactory creates all kernels for CudaPlatform.
*/
class CudaKernelFactory : public KernelFactory {
public:
/**
* Create the kernel implementation that corresponds to a kernel name.
*
* @param name      the name of the kernel to create
* @param platform  the Platform the new kernel is created for
* @param context   the context the kernel will belong to; its platform data
*                  is passed on to the kernels that need it
* @return a newly allocated KernelImpl
* @throws OpenMMException if the name does not match any supported kernel
*/
OPENMMCUDA_EXPORT KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
};
} // namespace OpenMM
#endif /*OPENMM_CUDAKERNELFACTORY_H_*/
#ifndef OPENMM_CUDAPLATFORM_H_
#define OPENMM_CUDAPLATFORM_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/Platform.h"
#include "windowsExportCuda.h"
struct _gpuContext;
namespace OpenMM {
/**
 * A Platform whose kernels are CUDA implementations of the OpenMM kernels,
 * so that simulations run on NVidia GPUs.
 */
class OPENMMCUDA_EXPORT CudaPlatform : public Platform {
public:
    class PlatformData;

    CudaPlatform();

    /**
     * Get the name of this platform, which is "Cuda".
     */
    const std::string& getName() const {
        static const std::string platformName("Cuda");
        return platformName;
    }

    /**
     * Get the speed estimate reported for this platform (a fixed value of 50).
     */
    double getSpeed() const {
        return 50.0;
    }

    bool supportsDoublePrecision() const;
    const std::string& getPropertyValue(const Context& context, const std::string& property) const;
    void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
    void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
    void contextDestroyed(ContextImpl& context) const;

    /**
     * This is the name of the parameter for selecting which CUDA device to use.
     */
    static const std::string& CudaDevice() {
        static const std::string propertyName("CudaDevice");
        return propertyName;
    }

    /**
     * This is the name of the parameter for selecting whether CUDA should sync
     * or spin loop while waiting for results.
     */
    static const std::string& CudaUseBlockingSync() {
        static const std::string propertyName("CudaUseBlockingSync");
        return propertyName;
    }
};
/**
 * Per-context state shared by the CUDA kernels. The kernels receive a
 * reference to this object when CudaKernelFactory creates them.
 */
class CudaPlatform::PlatformData {
public:
    OPENMMCUDA_EXPORT PlatformData(_gpuContext* gpu);

    // GPU context the kernels operate on.
    _gpuContext* gpu;

    bool removeCM;

    // Flags describing which kinds of force are present; for example hasBonds
    // is set by CudaCalcHarmonicBondForceKernel::initialize().
    bool hasBonds;
    bool hasAngles;
    bool hasPeriodicTorsions;
    bool hasRB;
    bool hasNonbonded;
    bool hasCustomNonbonded;

    // Nonbonded method selectors, compared against NO_CUTOFF and CUTOFF by
    // CudaCalcForcesAndEnergyKernel.
    int nonbondedMethod;
    int customNonbondedMethod;

    int cmMotionFrequency;

    int stepCount;
    // Incremented on every CudaCalcForcesAndEnergyKernel::beginComputation().
    int computeForceCount;

    // Simulation time as read and written by CudaUpdateStateDataKernel.
    double time;
    // Added to the reduced energy in finishComputation().
    double ewaldSelfEnergy;
    // When nonzero, divided by the periodic box volume and added to the energy.
    double dispersionCoefficient;

    std::map<std::string, std::string> propertyValues;
};
} // namespace OpenMM
#endif /*OPENMM_CUDAPLATFORM_H_*/
#ifndef OPENMM_WINDOWSEXPORTCUDA_H_
#define OPENMM_WINDOWSEXPORTCUDA_H_
/*
 * Shared libraries are messy in Visual Studio. We have to distinguish three
 * cases:
 *   (1) this header is being used to build the OpenMM shared library
 *       (dllexport)
 *   (2) this header is being used by a *client* of the OpenMM shared
 *       library (dllimport)
 *   (3) we are building the OpenMM static library, or the client is
 *       being compiled with the expectation of linking with the
 *       OpenMM static library (nothing special needed)
 * In the CMake script for building this library, we define one of the symbols
 *     OPENMMCUDA_BUILDING_{SHARED|STATIC}_LIBRARY
 * Client code normally has no special symbol defined, in which case we'll
 * assume it wants to use the shared library. However, if the client defines
 * the symbol OPENMM_USE_STATIC_LIBRARIES (or the library-specific spelling
 * OPENMMCUDA_USE_STATIC_LIBRARIES) we'll suppress the dllimport so
 * that the client code can be linked with static libraries. Note that
 * the client symbol is not library dependent, while the library symbols
 * affect only the OpenMM library, meaning that other libraries can
 * be clients of this one. However, we are assuming all-static or all-shared.
 */
#ifdef _MSC_VER
    // We don't want to hear about how sprintf is "unsafe".
    #pragma warning(disable:4996)
    // Keep MS VC++ quiet about lack of dll export of private members.
    #pragma warning(disable:4251)
    #if defined(OPENMMCUDA_BUILDING_SHARED_LIBRARY)
        #define OPENMMCUDA_EXPORT __declspec(dllexport)
    // Both client spellings are honored: the comment above has always named
    // OPENMM_USE_STATIC_LIBRARIES, while older code checked only the
    // OPENMMCUDA_-prefixed symbol.
    #elif defined(OPENMMCUDA_BUILDING_STATIC_LIBRARY) || defined(OPENMMCUDA_USE_STATIC_LIBRARIES) || defined(OPENMM_USE_STATIC_LIBRARIES)
        #define OPENMMCUDA_EXPORT
    #else
        #define OPENMMCUDA_EXPORT __declspec(dllimport) // i.e., a client of a shared library
    #endif
#else
    #define OPENMMCUDA_EXPORT // Linux, Mac
#endif
#endif // OPENMM_WINDOWSEXPORTCUDA_H_
#
# Include CUDA related files.
#
# Disabled in the original script:
# INCLUDE(${FINDCUDA_DIR}/FindCuda.cmake)
include_directories(${CUDA_INCLUDE})
link_directories(${CUDA_TARGET_LINK})

# Add the CUDA sources (*.cu directly in src/ and one directory level below
# it) to SOURCE_FILES and expose the platform's include/ and src/ directories
# to the CUDA compiler.
foreach(subdir ${OPENMM_SOURCE_SUBDIRS})
    file(GLOB src_files
        ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src/*.cu
        ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src/*/*.cu)
    list(APPEND SOURCE_FILES ${src_files})
    cuda_include_directories(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/include)
    cuda_include_directories(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/${subdir}/src)
endforeach()

# Headers from other parts of the source tree that the CUDA code needs.
cuda_include_directories(BEFORE ${CMAKE_SOURCE_DIR}/jama/include)
cuda_include_directories(BEFORE ${CMAKE_SOURCE_DIR}/openmmapi/include)
# Name of the main OpenMM library to link against; Unix debug builds use the
# "_d" variant of the library name.
if(UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    set(MAIN_OPENMM_LIB "${OPENMM_LIBRARY_NAME}_d")
else()
    set(MAIN_OPENMM_LIB "${OPENMM_LIBRARY_NAME}")
endif()
if(APPLE AND CMAKE_OSX_ARCHITECTURES AND CMAKE_OSX_ARCHITECTURES MATCHES ".*i386.*" AND CMAKE_OSX_ARCHITECTURES MATCHES ".*x86_64.*")
    # NVCC doesn't know how to build universal binaries, so two separate
    # versions are built and joined afterwards.
    set(BASE_FLAGS ${CUDA_NVCC_FLAGS})

    # 32 bit variant.
    set(CMAKE_OSX_ARCHITECTURES i386)
    set(CUDA_NVCC_FLAGS ${BASE_FLAGS} -m32)
    cuda_add_library("${SHARED_TARGET}32" SHARED
        ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
    target_link_libraries(${SHARED_TARGET}32 ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
    set_target_properties(${SHARED_TARGET}32
        PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")

    # 64 bit variant; it is built after the 32 bit one.
    set(CMAKE_OSX_ARCHITECTURES x86_64)
    set(CUDA_NVCC_FLAGS ${BASE_FLAGS} -m64)
    cuda_add_library(${SHARED_TARGET} SHARED
        ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
    target_link_libraries(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
    set_target_properties(${SHARED_TARGET}
        PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")
    add_dependencies(${SHARED_TARGET} "${SHARED_TARGET}32")

    # Join them into a single universal binary.
    # NOTE(review): the command assumes both .dylib files are written to
    # CMAKE_BINARY_DIR -- confirm against the library output path settings.
    add_custom_command(
        TARGET ${SHARED_TARGET}
        POST_BUILD
        COMMAND /usr/bin/lipo lib${SHARED_TARGET}.dylib lib${SHARED_TARGET}32.dylib -create -output lib${SHARED_TARGET}.dylib
        WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
        COMMENT "Creating universal binary")
else()
    # Every other configuration builds a single shared library.
    cuda_add_library(${SHARED_TARGET} SHARED
        ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
    target_link_libraries(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUFFT_TARGET_LINK})
    set_target_properties(${SHARED_TARGET}
        PROPERTIES COMPILE_FLAGS "-DOPENMMCUDA_BUILDING_SHARED_LIBRARY")
endif()

# The platform library is installed into the plugin directory.
install_targets(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaForceInfo.h"
using namespace OpenMM;
using namespace std;
// Default implementations of the CudaForceInfo interface. Subclasses override
// the ones that are relevant to the force they describe.

// Default: every pair of particles is reported as identical.
bool CudaForceInfo::areParticlesIdentical(int particle1, int particle2) {
return true;
}
// Default: the force defines no particle groups.
int CudaForceInfo::getNumParticleGroups() {
return 0;
}
// Default: nothing to report; the output vector is left untouched.
void CudaForceInfo::getParticlesInGroup(int index, vector<int>& particles) {
return;
}
// Default: every pair of groups is reported as identical.
bool CudaForceInfo::areGroupsIdentical(int group1, int group2) {
return true;
}
#ifndef OPENMM_CUDAFORCEINFO_H_
#define OPENMM_CUDAFORCEINFO_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/internal/windowsExport.h"
#include <vector>
namespace OpenMM {
/**
 * Interface through which the Cuda implementation of a Force class describes
 * the behavior and requirements of that force. The default implementations
 * can be overridden by subclasses.
 */
class CudaForceInfo {
public:
    CudaForceInfo() {}
    virtual ~CudaForceInfo() {}

    /**
     * Get whether or not two particles have identical force field parameters.
     *
     * @param particle1  the index of the first particle
     * @param particle2  the index of the second particle
     */
    virtual OPENMM_EXPORT bool areParticlesIdentical(int particle1, int particle2);

    /**
     * Get the number of particle groups defined by this force.
     */
    virtual OPENMM_EXPORT int getNumParticleGroups();

    /**
     * Get the list of particles in a particular group.
     *
     * @param index      the index of the group
     * @param particles  receives the indices of the particles in the group
     */
    virtual OPENMM_EXPORT void getParticlesInGroup(int index, std::vector<int>& particles);

    /**
     * Get whether two particle groups are identical.
     *
     * @param group1  the index of the first group
     * @param group2  the index of the second group
     */
    virtual OPENMM_EXPORT bool areGroupsIdentical(int group1, int group2);
};
} // namespace OpenMM
#endif /*OPENMM_CUDAFORCEINFO_H_*/
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaKernelFactory.h"
#include "CudaKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
using namespace OpenMM;
/**
 * Create the CUDA implementation of the kernel with the given name.
 *
 * The platform data stored in the context is handed to every kernel that
 * needs it; some force kernels additionally receive the System.
 *
 * @param name      the name of the kernel to create
 * @param platform  the Platform the kernel is created for
 * @param context   the context whose platform data the kernel will use
 * @return a newly allocated kernel implementation
 * @throws OpenMMException if the name matches none of the kernels below
 */
OPENMMCUDA_EXPORT KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
    CudaPlatform::PlatformData* platformData = static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
    CudaPlatform::PlatformData& cu = *platformData;

    // Bookkeeping kernels.
    if (name == CalcForcesAndEnergyKernel::Name()) {
        return new CudaCalcForcesAndEnergyKernel(name, platform, cu);
    }
    if (name == UpdateStateDataKernel::Name()) {
        return new CudaUpdateStateDataKernel(name, platform, cu);
    }
    if (name == ApplyConstraintsKernel::Name()) {
        return new CudaApplyConstraintsKernel(name, platform, cu);
    }
    if (name == VirtualSitesKernel::Name()) {
        return new CudaVirtualSitesKernel(name, platform);
    }

    // Force kernels.
    if (name == CalcHarmonicBondForceKernel::Name()) {
        return new CudaCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
    }
    if (name == CalcCustomBondForceKernel::Name()) {
        return new CudaCalcCustomBondForceKernel(name, platform, cu, context.getSystem());
    }
    if (name == CalcHarmonicAngleForceKernel::Name()) {
        return new CudaCalcHarmonicAngleForceKernel(name, platform, cu, context.getSystem());
    }
    if (name == CalcCustomAngleForceKernel::Name()) {
        return new CudaCalcCustomAngleForceKernel(name, platform, cu, context.getSystem());
    }
    if (name == CalcPeriodicTorsionForceKernel::Name()) {
        return new CudaCalcPeriodicTorsionForceKernel(name, platform, cu, context.getSystem());
    }
    if (name == CalcRBTorsionForceKernel::Name()) {
        return new CudaCalcRBTorsionForceKernel(name, platform, cu, context.getSystem());
    }
    if (name == CalcCMAPTorsionForceKernel::Name()) {
        return new CudaCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
    }
    if (name == CalcCustomTorsionForceKernel::Name()) {
        return new CudaCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
    }
    if (name == CalcNonbondedForceKernel::Name()) {
        return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
    }
    if (name == CalcCustomNonbondedForceKernel::Name()) {
        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
    }
    if (name == CalcGBSAOBCForceKernel::Name()) {
        return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
    }
    if (name == CalcGBVIForceKernel::Name()) {
        return new CudaCalcGBVIForceKernel(name, platform, cu);
    }
    if (name == CalcCustomExternalForceKernel::Name()) {
        return new CudaCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
    }

    // Integrator kernels.
    if (name == IntegrateVerletStepKernel::Name()) {
        return new CudaIntegrateVerletStepKernel(name, platform, cu);
    }
    if (name == IntegrateLangevinStepKernel::Name()) {
        return new CudaIntegrateLangevinStepKernel(name, platform, cu);
    }
    if (name == IntegrateBrownianStepKernel::Name()) {
        return new CudaIntegrateBrownianStepKernel(name, platform, cu);
    }
    if (name == IntegrateVariableVerletStepKernel::Name()) {
        return new CudaIntegrateVariableVerletStepKernel(name, platform, cu);
    }
    if (name == IntegrateVariableLangevinStepKernel::Name()) {
        return new CudaIntegrateVariableLangevinStepKernel(name, platform, cu);
    }

    // Thermostat, barostat and utility kernels.
    if (name == ApplyAndersenThermostatKernel::Name()) {
        return new CudaApplyAndersenThermostatKernel(name, platform, cu);
    }
    if (name == ApplyMonteCarloBarostatKernel::Name()) {
        return new CudaApplyMonteCarloBarostatKernel(name, platform, cu);
    }
    if (name == CalcKineticEnergyKernel::Name()) {
        return new CudaCalcKineticEnergyKernel(name, platform, cu);
    }
    if (name == RemoveCMMotionKernel::Name()) {
        return new CudaRemoveCMMotionKernel(name, platform, cu);
    }

    // No kernel with this name exists on this platform.
    const std::string message = std::string("Tried to create kernel with illegal kernel name '")+name+"'";
    throw OpenMMException(message.c_str());
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaKernels.h"
#include "CudaForceInfo.h"
#include "openmm/LangevinIntegrator.h"
#include "openmm/Context.h"
#include "openmm/OpenMMException.h"
#include "openmm/internal/AndersenThermostatImpl.h"
#include "openmm/internal/CMAPTorsionForceImpl.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/NonbondedForceImpl.h"
#include "kernels/gputypes.h"
#include "kernels/cudaKernels.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include <cmath>
extern "C" int OPENMMCUDA_EXPORT gpuSetConstants( gpuContext gpu );
using namespace OpenMM;
using namespace std;
/**
 * Nothing has to be prepared for this kernel.
 */
void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
}

/**
 * Prepare the GPU for a new force/energy evaluation.
 *
 * Every 100th evaluation the atoms are reordered (unless the nonbonded method
 * is NO_CUTOFF), the periodic box is checked against the cutoff, and the
 * force and energy buffers are cleared as required.
 *
 * @throws OpenMMException if a periodic box dimension is smaller than
 *         1.999999 times the nonbonded cutoff
 */
void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
    _gpuContext* gpu = data.gpu;

    const bool reorderDue = (data.computeForceCount%100 == 0);
    if (data.nonbondedMethod != NO_CUTOFF && reorderDue)
        gpuReorderAtoms(gpu);

    // The box size only matters for nonbonded methods other than NO_CUTOFF
    // and CUTOFF.
    const bool nonbondedNeedsBox = data.hasNonbonded && data.nonbondedMethod != NO_CUTOFF && data.nonbondedMethod != CUTOFF;
    const bool customNeedsBox = data.hasCustomNonbonded && data.customNonbondedMethod != NO_CUTOFF && data.customNonbondedMethod != CUTOFF;
    if (nonbondedNeedsBox || customNeedsBox) {
        // Slightly less than twice the cutoff.
        double minAllowedSize = 1.999999*gpu->sim.nonbondedCutoff;
        if (gpu->sim.periodicBoxSizeX < minAllowedSize || gpu->sim.periodicBoxSizeY < minAllowedSize || gpu->sim.periodicBoxSizeZ < minAllowedSize)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
    }

    data.computeForceCount++;

    // With GBSA or GBVI the Born sums are cleared together with the forces,
    // regardless of includeForces.
    if (gpu->bIncludeGBSA || gpu->bIncludeGBVI) {
        kClearBornSumAndForces(gpu);
    }
    else if (includeForces) {
        kClearForces(gpu);
    }
    if (includeEnergy) {
        kClearEnergy(gpu);
    }
}
/**
 * Launch the force kernels and collect the results.
 *
 * @return the reduced energy plus the Ewald self energy and, when the
 *         dispersion coefficient is nonzero, the coefficient divided by the
 *         periodic box volume; 0.0 when includeEnergy is false
 */
double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
    _gpuContext* gpu = data.gpu;

    // Nonbonded interactions: the implicit solvent path replaces the plain
    // CDLJ kernel when GBSA or GBVI is enabled.
    const bool usesImplicitSolvent = gpu->bIncludeGBSA || gpu->bIncludeGBVI;
    if (usesImplicitSolvent) {
        gpu->bRecalculateBornRadii = true;
        kCalculateCDLJObcGbsaForces1(gpu);
        kReduceObcGbsaBornForces(gpu);
        if (gpu->bIncludeGBSA)
            kCalculateObcGbsaForces2(gpu);
        else
            kCalculateGBVIForces2(gpu);
    }
    else if (data.hasNonbonded) {
        kCalculateCDLJForces(gpu);
    }

    if (data.hasCustomNonbonded) {
        kCalculateCustomNonbondedForces(gpu, data.hasNonbonded);
    }
    kCalculateLocalForces(gpu);
    if (includeForces) {
        kReduceForces(gpu);
    }

    if (!includeEnergy) {
        return 0.0;
    }
    double energy = kReduceEnergy(gpu)+data.ewaldSelfEnergy;
    if (data.dispersionCoefficient != 0.0) {
        energy += data.dispersionCoefficient/(gpu->sim.periodicBoxSizeX*gpu->sim.periodicBoxSizeY*gpu->sim.periodicBoxSizeZ);
    }
    return energy;
}
// Nothing has to be prepared for this kernel.
void CudaUpdateStateDataKernel::initialize(const System& system) {
}
// Return the simulation time stored in the platform data.
double CudaUpdateStateDataKernel::getTime(const ContextImpl& context) const {
return data.time;
}
// Store a new simulation time in the platform data.
void CudaUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
data.time = time;
}
/**
 * Download the particle positions from the GPU.
 *
 * Entry i of the GPU array is written to positions[psAtomIndex[i]], and the
 * stored cell offset times the periodic box size is subtracted from each
 * coordinate.
 *
 * @param positions  resized to the number of particles and filled in
 */
void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, std::vector<Vec3>& positions) {
    _gpuContext* gpu = data.gpu;
    gpu->psPosq4->Download();
    const int* originalIndex = gpu->psAtomIndex->_pSysData;
    const int numParticles = context.getSystem().getNumParticles();
    positions.resize(numParticles);
    for (int slot = 0; slot < numParticles; ++slot) {
        const float4 posq = (*gpu->psPosq4)[slot];
        const int3 cell = gpu->posCellOffsets[slot];
        positions[originalIndex[slot]] = Vec3(posq.x-cell.x*gpu->sim.periodicBoxSizeX, posq.y-cell.y*gpu->sim.periodicBoxSizeY, posq.z-cell.z*gpu->sim.periodicBoxSizeZ);
    }
}
/**
 * Upload new particle positions to the GPU.
 *
 * GPU entry i receives positions[psAtomIndex[i]] converted to single
 * precision; the fourth component of each entry is left unchanged. All
 * stored cell offsets are reset to zero afterwards.
 *
 * @param positions  the new positions, indexed by particle
 */
void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const std::vector<Vec3>& positions) {
    _gpuContext* gpu = data.gpu;
    const int* originalIndex = gpu->psAtomIndex->_pSysData;
    const int numParticles = context.getSystem().getNumParticles();
    for (int slot = 0; slot < numParticles; ++slot) {
        const Vec3& source = positions[originalIndex[slot]];
        float4& target = (*gpu->psPosq4)[slot];
        target.x = (float) source[0];
        target.y = (float) source[1];
        target.z = (float) source[2];
    }
    gpu->psPosq4->Upload();

    const int numOffsets = (int) gpu->posCellOffsets.size();
    for (int slot = 0; slot < numOffsets; slot++) {
        gpu->posCellOffsets[slot] = make_int3(0, 0, 0);
    }
}
/**
 * Download the particle velocities from the GPU.
 *
 * Entry i of the GPU array is written to velocities[psAtomIndex[i]].
 *
 * @param velocities  resized to the number of particles and filled in
 */
void CudaUpdateStateDataKernel::getVelocities(ContextImpl& context, std::vector<Vec3>& velocities) {
    _gpuContext* gpu = data.gpu;
    gpu->psVelm4->Download();
    const int* originalIndex = gpu->psAtomIndex->_pSysData;
    const int numParticles = context.getSystem().getNumParticles();
    velocities.resize(numParticles);
    for (int slot = 0; slot < numParticles; ++slot) {
        const float4 velm = (*gpu->psVelm4)[slot];
        velocities[originalIndex[slot]] = Vec3(velm.x, velm.y, velm.z);
    }
}
/**
 * Upload new particle velocities to the GPU.
 *
 * GPU entry i receives velocities[psAtomIndex[i]] converted to single
 * precision; the fourth component of each entry is left unchanged.
 *
 * @param velocities  the new velocities, indexed by particle
 */
void CudaUpdateStateDataKernel::setVelocities(ContextImpl& context, const std::vector<Vec3>& velocities) {
    _gpuContext* gpu = data.gpu;
    const int* originalIndex = gpu->psAtomIndex->_pSysData;
    const int numParticles = context.getSystem().getNumParticles();
    for (int slot = 0; slot < numParticles; ++slot) {
        const Vec3& source = velocities[originalIndex[slot]];
        float4& target = (*gpu->psVelm4)[slot];
        target.x = (float) source[0];
        target.y = (float) source[1];
        target.z = (float) source[2];
    }
    gpu->psVelm4->Upload();
}
/**
 * Download the forces from the GPU.
 *
 * Entry i of the GPU array is written to forces[psAtomIndex[i]].
 *
 * @param forces  resized to the number of particles and filled in
 */
void CudaUpdateStateDataKernel::getForces(ContextImpl& context, std::vector<Vec3>& forces) {
    _gpuContext* gpu = data.gpu;
    const int* originalIndex = gpu->psAtomIndex->_pSysData;
    gpu->psForce4->Download();
    const int numParticles = context.getSystem().getNumParticles();
    forces.resize(numParticles);
    for (int slot = 0; slot < numParticles; ++slot) {
        const float4 f = (*gpu->psForce4)[slot];
        forces[originalIndex[slot]] = Vec3(f.x, f.y, f.z);
    }
}
/**
 * Report the periodic box as three axis-aligned vectors built from the box
 * sizes stored in the GPU context.
 */
void CudaUpdateStateDataKernel::getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const {
    _gpuContext* gpu = data.gpu;
    a = Vec3(gpu->sim.periodicBoxSizeX, 0, 0);
    b = Vec3(0, gpu->sim.periodicBoxSizeY, 0);
    c = Vec3(0, 0, gpu->sim.periodicBoxSizeZ);
}

/**
 * Set the periodic box size. Only the diagonal components a[0], b[1] and
 * c[2] are used; the constants on the GPU are refreshed afterwards.
 */
void CudaUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const {
    _gpuContext* gpu = data.gpu;
    gpuSetPeriodicBoxSize(gpu, a[0], b[1], c[2]);
    gpuSetConstants(gpu);
}
// Checkpointing is not available on this platform: always throws
// OpenMMException.
void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
throw OpenMMException("CudaPlatform does not support checkpointing");
}
// Checkpointing is not available on this platform: always throws
// OpenMMException.
void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
throw OpenMMException("CudaPlatform does not support checkpointing");
}
// Nothing has to be prepared for this kernel.
void CudaApplyConstraintsKernel::initialize(const System& system) {
}
// Apply the constraints on the GPU. The tolerance argument is not used here.
void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
kApplyConstraints(data.gpu);
}
// Nothing has to be prepared for this kernel.
void CudaVirtualSitesKernel::initialize(const System& system) {
}
// Intentionally empty: this kernel performs no work when asked to compute
// virtual site positions.
void CudaVirtualSitesKernel::computePositions(ContextImpl& context) {
}
/**
 * Describes a HarmonicBondForce to the CUDA platform: every bond is one
 * particle group, and two groups are identical when their length and force
 * constant match.
 */
class CudaCalcHarmonicBondForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const HarmonicBondForce& force) : force(force) {
    }

    /** One group per bond. */
    int getNumParticleGroups() {
        return force.getNumBonds();
    }

    /** Report the two particles joined by bond number index. */
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int atomA, atomB;
        double length, k;
        force.getBondParameters(index, atomA, atomB, length, k);
        particles.clear();
        particles.push_back(atomA);
        particles.push_back(atomB);
    }

    /** Two bonds are identical when both their parameters are equal. */
    bool areGroupsIdentical(int group1, int group2) {
        int atomA, atomB;
        double length1, k1;
        double length2, k2;
        force.getBondParameters(group1, atomA, atomB, length1, k1);
        force.getBondParameters(group2, atomA, atomB, length2, k2);
        if (length1 != length2)
            return false;
        return k1 == k2;
    }

private:
    // The force being described; only a reference is kept.
    const HarmonicBondForce& force;
};
CudaCalcHarmonicBondForceKernel::~CudaCalcHarmonicBondForceKernel() {
}
void CudaCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
data.hasBonds = true;
numBonds = force.getNumBonds();
vector<int> particle1(numBonds);
vector<int> particle2(numBonds);
vector<float> length(numBonds);
vector<float> k(numBonds);
for (int i = 0; i < numBonds; i++) {
double lengthValue, kValue;
force.getBondParameters(i, particle1[i], particle2[i], lengthValue, kValue);
length[i] = (float) lengthValue;
k[i] = (float) kValue;
}
gpuSetBondParameters(data.gpu, particle1, particle2, length, k);
data.gpu->forces.push_back(new ForceInfo(force));
}
// Performs no work and reports an energy of zero.
// NOTE(review): the bond forces are presumably evaluated by
// kCalculateLocalForces() in CudaCalcForcesAndEnergyKernel::finishComputation()
// -- confirm.
double CudaCalcHarmonicBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
return 0.0;
}
// Not supported on this platform: always throws OpenMMException.
void CudaCalcHarmonicBondForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force) {
throw OpenMMException("CudaPlatform does not support copyParametersToContext");
}
/**
 * Describes a CustomBondForce to the CUDA platform: every bond is one
 * particle group, and two groups are identical when all of their per-bond
 * parameters match.
 */
class CudaCalcCustomBondForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const CustomBondForce& force) : force(force) {
    }

    /** One group per bond. */
    int getNumParticleGroups() {
        return force.getNumBonds();
    }

    /** Report the two particles joined by bond number index. */
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int atomA, atomB;
        vector<double> bondParams;
        force.getBondParameters(index, atomA, atomB, bondParams);
        particles.clear();
        particles.push_back(atomA);
        particles.push_back(atomB);
    }

    /** Compare the per-bond parameters of two bonds element by element. */
    bool areGroupsIdentical(int group1, int group2) {
        int atomA, atomB;
        vector<double> params1;
        vector<double> params2;
        force.getBondParameters(group1, atomA, atomB, params1);
        force.getBondParameters(group2, atomA, atomB, params2);
        const int numParams = (int) params1.size();
        int p = 0;
        while (p < numParams) {
            if (params1[p] != params2[p])
                return false;
            p++;
        }
        return true;
    }

private:
    // The force being described; only a reference is kept.
    const CustomBondForce& force;
};
/** Destructor; its body is empty, so no cleanup is performed here. */
CudaCalcCustomBondForceKernel::~CudaCalcCustomBondForceKernel()
{
}
void CudaCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
numBonds = force.getNumBonds();
vector<int> particle1(numBonds);
vector<int> particle2(numBonds);
vector<vector<double> > params(numBonds);
for (int i = 0; i < numBonds; i++)
force.getBondParameters(i, particle1[i], particle2[i], params[i]);
vector<string> paramNames;
for (int i = 0; i < force.getNumPerBondParameters(); i++)
paramNames.push_back(force.getPerBondParameterName(i));
globalParamNames.resize(force.getNumGlobalParameters());
globalParamValues.resize(force.getNumGlobalParameters());
for (int i = 0; i < force.getNumGlobalParameters(); i++) {
globalParamNames[i] = force.getGlobalParameterName(i);
globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
}
gpuSetCustomBondParameters(data.gpu, particle1, particle2, params, force.getEnergyFunction(), paramNames, globalParamNames);
if (globalParamValues.size() > 0)
SetCustomBondGlobalParams(globalParamValues);
data.gpu->forces.push_back(new ForceInfo(force));
}
/**
 * Refreshes the global parameters from the context, then invokes
 * kCalculateCustomBondForces().  Always returns 0.0; includeForces and
 * includeEnergy are not consulted.
 */
double CudaCalcCustomBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
{
    updateGlobalParams(context);
    kCalculateCustomBondForces(data.gpu);
    const double reportedEnergy = 0.0;
    return reportedEnergy;
}
/**
 * Re-reads every global parameter from the context (narrowed to float) and
 * stores it in globalParamValues.  SetCustomBondGlobalParams() is called
 * only when at least one value differs from the one previously stored.
 */
void CudaCalcCustomBondForceKernel::updateGlobalParams(ContextImpl& context) {
    bool dirty = false;
    const int count = (int) globalParamNames.size();
    for (int idx = 0; idx < count; ++idx) {
        const float current = static_cast<float>(context.getParameter(globalParamNames[idx]));
        dirty = dirty || (current != globalParamValues[idx]);
        globalParamValues[idx] = current;
    }
    if (!dirty)
        return;
    SetCustomBondGlobalParams(globalParamValues);
}
/** Unsupported operation: unconditionally throws OpenMMException. */
void CudaCalcCustomBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomBondForce& force)
{
    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
}
/**
 * Describes a HarmonicAngleForce in terms of particle groups: each angle is
 * one group of three particles, and two groups compare as identical when
 * both their angle and force constant are equal.
 */
class CudaCalcHarmonicAngleForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const HarmonicAngleForce& force) : force(force) {
    }
    // One group per angle.
    int getNumParticleGroups() {
        return force.getNumAngles();
    }
    // Replaces the contents of `particles` with the three particles of angle `index`.
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int members[3];
        double unusedAngle, unusedK;
        force.getAngleParameters(index, members[0], members[1], members[2], unusedAngle, unusedK);
        particles.assign(members, members + 3);
    }
    // Particle indices are fetched only because the getter requires them.
    bool areGroupsIdentical(int group1, int group2) {
        int a, b, c;
        double thetaFirst, kFirst;
        double thetaSecond, kSecond;
        force.getAngleParameters(group1, a, b, c, thetaFirst, kFirst);
        force.getAngleParameters(group2, a, b, c, thetaSecond, kSecond);
        if (thetaFirst != thetaSecond)
            return false;
        return kFirst == kSecond;
    }
private:
    const HarmonicAngleForce& force;
};
/** Destructor; its body is empty, so no cleanup is performed here. */
CudaCalcHarmonicAngleForceKernel::~CudaCalcHarmonicAngleForceKernel()
{
}
void CudaCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
data.hasAngles = true;
numAngles = force.getNumAngles();
const float RadiansToDegrees = (float) (180.0/3.14159265);
vector<int> particle1(numAngles);
vector<int> particle2(numAngles);
vector<int> particle3(numAngles);
vector<float> angle(numAngles);
vector<float> k(numAngles);
for (int i = 0; i < numAngles; i++) {
double angleValue, kValue;
force.getAngleParameters(i, particle1[i], particle2[i], particle3[i], angleValue, kValue);
angle[i] = (float) (angleValue*RadiansToDegrees);
k[i] = (float) kValue;
}
gpuSetBondAngleParameters(data.gpu, particle1, particle2, particle3, angle, k);
data.gpu->forces.push_back(new ForceInfo(force));
}
/**
 * Performs no work in this method and always reports an energy of zero;
 * none of the arguments are used.
 */
double CudaCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
{
    const double reportedEnergy = 0.0;
    return reportedEnergy;
}
/** Unsupported operation: unconditionally throws OpenMMException. */
void CudaCalcHarmonicAngleForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force)
{
    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
}
/**
 * Describes a CustomAngleForce in terms of particle groups: each angle is
 * one group of three particles, and two groups compare as identical when
 * all of their per-angle parameter values are equal.
 */
class CudaCalcCustomAngleForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const CustomAngleForce& force) : force(force) {
    }
    // One group per angle.
    int getNumParticleGroups() {
        return force.getNumAngles();
    }
    // Replaces the contents of `particles` with the three particles of angle `index`.
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int members[3];
        vector<double> ignoredParams;
        force.getAngleParameters(index, members[0], members[1], members[2], ignoredParams);
        particles.assign(members, members + 3);
    }
    // Compares the per-angle parameters of two angles element by element.
    bool areGroupsIdentical(int group1, int group2) {
        int a, b, c;  // required by the getter; the values are not used
        vector<double> first, second;
        force.getAngleParameters(group1, a, b, c, first);
        force.getAngleParameters(group2, a, b, c, second);
        bool same = true;
        const int count = (int) first.size();
        for (int pos = 0; same && pos < count; ++pos)
            same = (first[pos] == second[pos]);
        return same;
    }
private:
    const CustomAngleForce& force;
};
/** Destructor; its body is empty, so no cleanup is performed here. */
CudaCalcCustomAngleForceKernel::~CudaCalcCustomAngleForceKernel()
{
}
void CudaCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
numAngles = force.getNumAngles();
vector<int> particle1(numAngles);
vector<int> particle2(numAngles);
vector<int> particle3(numAngles);
vector<vector<double> > params(numAngles);
for (int i = 0; i < numAngles; i++)
force.getAngleParameters(i, particle1[i], particle2[i], particle3[i], params[i]);
vector<string> paramNames;
for (int i = 0; i < force.getNumPerAngleParameters(); i++)
paramNames.push_back(force.getPerAngleParameterName(i));
globalParamNames.resize(force.getNumGlobalParameters());
globalParamValues.resize(force.getNumGlobalParameters());
for (int i = 0; i < force.getNumGlobalParameters(); i++) {
globalParamNames[i] = force.getGlobalParameterName(i);
globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
}
gpuSetCustomAngleParameters(data.gpu, particle1, particle2, particle3, params, force.getEnergyFunction(), paramNames, globalParamNames);
if (globalParamValues.size() > 0)
SetCustomAngleGlobalParams(globalParamValues);
data.gpu->forces.push_back(new ForceInfo(force));
}
/**
 * Refreshes the global parameters from the context, then invokes
 * kCalculateCustomAngleForces().  Always returns 0.0; includeForces and
 * includeEnergy are not consulted.
 */
double CudaCalcCustomAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
{
    updateGlobalParams(context);
    kCalculateCustomAngleForces(data.gpu);
    const double reportedEnergy = 0.0;
    return reportedEnergy;
}
/**
 * Re-reads every global parameter from the context (narrowed to float) and
 * stores it in globalParamValues.  SetCustomAngleGlobalParams() is called
 * only when at least one value differs from the one previously stored.
 */
void CudaCalcCustomAngleForceKernel::updateGlobalParams(ContextImpl& context) {
    int numChanged = 0;
    const int count = (int) globalParamNames.size();
    for (int idx = 0; idx < count; ++idx) {
        const float current = static_cast<float>(context.getParameter(globalParamNames[idx]));
        if (current != globalParamValues[idx])
            ++numChanged;
        globalParamValues[idx] = current;
    }
    if (numChanged > 0)
        SetCustomAngleGlobalParams(globalParamValues);
}
/** Unsupported operation: unconditionally throws OpenMMException. */
void CudaCalcCustomAngleForceKernel::copyParametersToContext(ContextImpl& context, const CustomAngleForce& force)
{
    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
}
/**
 * Describes a PeriodicTorsionForce in terms of particle groups: each torsion
 * is one group of four particles, and two groups compare as identical when
 * periodicity, phase and force constant are all equal.
 */
class CudaCalcPeriodicTorsionForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const PeriodicTorsionForce& force) : force(force) {
    }
    // One group per torsion.
    int getNumParticleGroups() {
        return force.getNumTorsions();
    }
    // Replaces the contents of `particles` with the four particles of torsion `index`.
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int members[4];
        int unusedPeriodicity;
        double unusedPhase, unusedK;
        force.getTorsionParameters(index, members[0], members[1], members[2], members[3], unusedPeriodicity, unusedPhase, unusedK);
        particles.assign(members, members + 4);
    }
    // Particle indices are fetched only because the getter requires them.
    bool areGroupsIdentical(int group1, int group2) {
        int a, b, c, d;
        int nFirst, nSecond;
        double phaseFirst, phaseSecond, kFirst, kSecond;
        force.getTorsionParameters(group1, a, b, c, d, nFirst, phaseFirst, kFirst);
        force.getTorsionParameters(group2, a, b, c, d, nSecond, phaseSecond, kSecond);
        if (nFirst != nSecond)
            return false;
        if (phaseFirst != phaseSecond)
            return false;
        return kFirst == kSecond;
    }
private:
    const PeriodicTorsionForce& force;
};
/** Destructor; its body is empty, so no cleanup is performed here. */
CudaCalcPeriodicTorsionForceKernel::~CudaCalcPeriodicTorsionForceKernel()
{
}
void CudaCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
data.hasPeriodicTorsions = true;
numTorsions = force.getNumTorsions();
const float RadiansToDegrees = (float)(180.0/3.14159265);
vector<int> particle1(numTorsions);
vector<int> particle2(numTorsions);
vector<int> particle3(numTorsions);
vector<int> particle4(numTorsions);
vector<float> k(numTorsions);
vector<float> phase(numTorsions);
vector<int> periodicity(numTorsions);
for (int i = 0; i < numTorsions; i++) {
double kValue, phaseValue;
force.getTorsionParameters(i, particle1[i], particle2[i], particle3[i], particle4[i], periodicity[i], phaseValue, kValue);
k[i] = (float) kValue;
phase[i] = (float) (phaseValue*RadiansToDegrees);
}
gpuSetDihedralParameters(data.gpu, particle1, particle2, particle3, particle4, k, phase, periodicity);
data.gpu->forces.push_back(new ForceInfo(force));
}
/**
 * Performs no work in this method and always reports an energy of zero;
 * none of the arguments are used.
 */
double CudaCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
{
    const double reportedEnergy = 0.0;
    return reportedEnergy;
}
/** Unsupported operation: unconditionally throws OpenMMException. */
void CudaCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force)
{
    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
}
/**
 * Describes an RBTorsionForce in terms of particle groups: each torsion is
 * one group of four particles, and two groups compare as identical when all
 * six coefficients c0..c5 are equal.
 */
class CudaCalcRBTorsionForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const RBTorsionForce& force) : force(force) {
    }
    // One group per torsion.
    int getNumParticleGroups() {
        return force.getNumTorsions();
    }
    // Replaces the contents of `particles` with the four particles of torsion `index`.
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int members[4];
        double unused[6];
        force.getTorsionParameters(index, members[0], members[1], members[2], members[3],
                                   unused[0], unused[1], unused[2], unused[3], unused[4], unused[5]);
        particles.assign(members, members + 4);
    }
    // Particle indices are fetched only because the getter requires them.
    bool areGroupsIdentical(int group1, int group2) {
        int a, b, c, d;
        double first[6], second[6];
        force.getTorsionParameters(group1, a, b, c, d, first[0], first[1], first[2], first[3], first[4], first[5]);
        force.getTorsionParameters(group2, a, b, c, d, second[0], second[1], second[2], second[3], second[4], second[5]);
        for (int n = 0; n < 6; ++n)
            if (!(first[n] == second[n]))
                return false;
        return true;
    }
private:
    const RBTorsionForce& force;
};
/** Destructor; its body is empty, so no cleanup is performed here. */
CudaCalcRBTorsionForceKernel::~CudaCalcRBTorsionForceKernel()
{
}
void CudaCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
data.hasRB = true;
numTorsions = force.getNumTorsions();
vector<int> particle1(numTorsions);
vector<int> particle2(numTorsions);
vector<int> particle3(numTorsions);
vector<int> particle4(numTorsions);
vector<float> c0(numTorsions);
vector<float> c1(numTorsions);
vector<float> c2(numTorsions);
vector<float> c3(numTorsions);
vector<float> c4(numTorsions);
vector<float> c5(numTorsions);
for (int i = 0; i < numTorsions; i++) {
double c[6];
force.getTorsionParameters(i, particle1[i], particle2[i], particle3[i], particle4[i], c[0], c[1], c[2], c[3], c[4], c[5]);
c0[i] = (float) c[0];
c1[i] = (float) c[1];
c2[i] = (float) c[2];
c3[i] = (float) c[3];
c4[i] = (float) c[4];
c5[i] = (float) c[5];
}
gpuSetRbDihedralParameters(data.gpu, particle1, particle2, particle3, particle4, c0, c1, c2, c3, c4, c5);
data.gpu->forces.push_back(new ForceInfo(force));
}
/**
 * Performs no work in this method and always reports an energy of zero;
 * none of the arguments are used.
 */
double CudaCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
{
    const double reportedEnergy = 0.0;
    return reportedEnergy;
}
/** Unsupported operation: unconditionally throws OpenMMException. */
void CudaCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context, const RBTorsionForce& force)
{
    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
}
/**
 * Describes a CMAPTorsionForce in terms of particle groups: each CMAP
 * torsion is one group of eight particles (a1..a4 followed by b1..b4), and
 * two groups compare as identical when they refer to the same map index.
 */
class CudaCalcCMAPTorsionForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const CMAPTorsionForce& force) : force(force) {
    }
    // One group per CMAP torsion.
    int getNumParticleGroups() {
        return force.getNumTorsions();
    }
    // Replaces the contents of `particles` with a1..a4, b1..b4 of torsion `index`.
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int unusedMap;
        int members[8];
        force.getTorsionParameters(index, unusedMap,
                                   members[0], members[1], members[2], members[3],
                                   members[4], members[5], members[6], members[7]);
        particles.assign(members, members + 8);
    }
    // Only the map index is compared; particle indices are fetched but ignored.
    bool areGroupsIdentical(int group1, int group2) {
        int mapFirst, mapSecond;
        int ignored[8];
        force.getTorsionParameters(group1, mapFirst, ignored[0], ignored[1], ignored[2], ignored[3],
                                   ignored[4], ignored[5], ignored[6], ignored[7]);
        force.getTorsionParameters(group2, mapSecond, ignored[0], ignored[1], ignored[2], ignored[3],
                                   ignored[4], ignored[5], ignored[6], ignored[7]);
        const bool sameMap = (mapFirst == mapSecond);
        return sameMap;
    }
private:
    const CMAPTorsionForce& force;
};
/**
 * Releases the four CUDAStream objects held by this kernel.
 * Applying delete to a null pointer does nothing, so no explicit null
 * checks are needed.
 */
CudaCalcCMAPTorsionForceKernel::~CudaCalcCMAPTorsionForceKernel()
{
    delete coefficients;
    delete mapPositions;
    delete torsionMaps;
    delete torsionIndices;
}
/**
 * Builds the streams used by kCalculateCMAPTorsionForces() from a
 * CMAPTorsionForce:
 *   - mapPositions:   per map, (offset into coefficients, map size)
 *   - coefficients:   per map grid point, 16 values packed as four float4
 *   - torsionMaps:    per torsion, the map index
 *   - torsionIndices: per torsion, four int4: particles a1..a4, particles
 *                     b1..b4, and the force-buffer slot assigned to each
 * It then uploads the streams, raises data.gpu->sim.outputBuffers if any
 * particle was assigned more slots than that, and appends a ForceInfo to
 * data.gpu->forces.  Returns without doing any of this when the force has
 * no torsions.
 *
 * @param system  supplies the particle count used to size the slot counters
 * @param force   the force being set up
 */
void CudaCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
    numTorsions = force.getNumTorsions();
    if (numTorsions == 0)
        return;
    int numMaps = force.getNumMaps();
    vector<float4> coeffVec;
    vector<double> energy;
    vector<vector<double> > c;
    int currentPosition = 0;
    mapPositions = new CUDAStream<int2>(numMaps, 1, "cmapTorsionMapPositions");
    for (int i = 0; i < numMaps; i++) {
        int size;
        force.getMapParameters(i, size, energy);
        CMAPTorsionForceImpl::calcMapDerivatives(size, energy, c);
        (*mapPositions)[i] = make_int2(currentPosition, size);
        // Each of the size*size grid points contributes four float4 entries.
        currentPosition += 4*size*size;
        for (int j = 0; j < size*size; j++) {
            coeffVec.push_back(make_float4(c[j][0], c[j][1], c[j][2], c[j][3]));
            coeffVec.push_back(make_float4(c[j][4], c[j][5], c[j][6], c[j][7]));
            coeffVec.push_back(make_float4(c[j][8], c[j][9], c[j][10], c[j][11]));
            coeffVec.push_back(make_float4(c[j][12], c[j][13], c[j][14], c[j][15]));
        }
    }
    coefficients = new CUDAStream<float4>((int) coeffVec.size(), 1, "cmapTorsionCoefficients");
    for (int i = 0; i < (int) coeffVec.size(); i++)
        (*coefficients)[i] = coeffVec[i];
    torsionMaps = new CUDAStream<int>(numTorsions, 1, "cmapTorsionMaps");
    torsionIndices = new CUDAStream<int4>(4*numTorsions, 1, "cmapTorsionIndices");
    // forceBufferCounter[p] counts how many slots particle p has been given so far.
    vector<int> forceBufferCounter(system.getNumParticles(), 0);
    for (int i = 0; i < numTorsions; i++) {
        int map, a1, a2, a3, a4, b1, b2, b3, b4;
        force.getTorsionParameters(i, map, a1, a2, a3, a4, b1, b2, b3, b4);
        (*torsionMaps)[i] = map;
        (*torsionIndices)[i*4] = make_int4(a1, a2, a3, a4);
        (*torsionIndices)[i*4+1] = make_int4(b1, b2, b3, b4);
        // Take the slots in separate statements so the increments happen in a
        // defined order rather than inside a single argument list.
        const int slotA1 = forceBufferCounter[a1]++;
        const int slotA2 = forceBufferCounter[a2]++;
        const int slotA3 = forceBufferCounter[a3]++;
        const int slotA4 = forceBufferCounter[a4]++;
        (*torsionIndices)[i*4+2] = make_int4(slotA1, slotA2, slotA3, slotA4);
        const int slotB1 = forceBufferCounter[b1]++;
        const int slotB2 = forceBufferCounter[b2]++;
        const int slotB3 = forceBufferCounter[b3]++;
        const int slotB4 = forceBufferCounter[b4]++;
        (*torsionIndices)[i*4+3] = make_int4(slotB1, slotB2, slotB3, slotB4);
    }
    coefficients->Upload();
    mapPositions->Upload();
    torsionMaps->Upload();
    torsionIndices->Upload();
    // Make sure there are at least as many output buffers as the busiest particle needs.
    int maxBuffers = 1;
    for (int i = 0; i < (int) forceBufferCounter.size(); i++)
        maxBuffers = max(maxBuffers, forceBufferCounter[i]);
    if (maxBuffers > data.gpu->sim.outputBuffers)
        data.gpu->sim.outputBuffers = maxBuffers;
    data.gpu->forces.push_back(new ForceInfo(force));
}
/**
 * Invokes kCalculateCMAPTorsionForces() with the streams built by
 * initialize(), but only when the force has at least one torsion.
 * Always returns 0.0; includeForces and includeEnergy are not consulted.
 */
double CudaCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
{
    if (numTorsions != 0) {
        kCalculateCMAPTorsionForces(data.gpu, *coefficients, *mapPositions, *torsionIndices, *torsionMaps);
    }
    const double reportedEnergy = 0.0;
    return reportedEnergy;
}
/**
 * Describes a CustomTorsionForce in terms of particle groups: each torsion
 * is one group of four particles, and two groups compare as identical when
 * all of their per-torsion parameter values are equal.
 */
class CudaCalcCustomTorsionForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const CustomTorsionForce& force) : force(force) {
    }
    // One group per torsion.
    int getNumParticleGroups() {
        return force.getNumTorsions();
    }
    // Replaces the contents of `particles` with the four particles of torsion `index`.
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int members[4];
        vector<double> ignoredParams;
        force.getTorsionParameters(index, members[0], members[1], members[2], members[3], ignoredParams);
        particles.assign(members, members + 4);
    }
    // Compares the per-torsion parameters of two torsions element by element.
    bool areGroupsIdentical(int group1, int group2) {
        int a, b, c, d;  // required by the getter; the values are not used
        vector<double> first, second;
        force.getTorsionParameters(group1, a, b, c, d, first);
        force.getTorsionParameters(group2, a, b, c, d, second);
        const int count = (int) first.size();
        int pos = 0;
        while (pos < count && first[pos] == second[pos])
            ++pos;
        return pos == count;
    }
private:
    const CustomTorsionForce& force;
};
/** Destructor; its body is empty, so no cleanup is performed here. */
CudaCalcCustomTorsionForceKernel::~CudaCalcCustomTorsionForceKernel()
{
}
void CudaCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
numTorsions = force.getNumTorsions();
vector<int> particle1(numTorsions);
vector<int> particle2(numTorsions);
vector<int> particle3(numTorsions);
vector<int> particle4(numTorsions);
vector<vector<double> > params(numTorsions);
for (int i = 0; i < numTorsions; i++)
force.getTorsionParameters(i, particle1[i], particle2[i], particle3[i], particle4[i], params[i]);
vector<string> paramNames;
for (int i = 0; i < force.getNumPerTorsionParameters(); i++)
paramNames.push_back(force.getPerTorsionParameterName(i));
globalParamNames.resize(force.getNumGlobalParameters());
globalParamValues.resize(force.getNumGlobalParameters());
for (int i = 0; i < force.getNumGlobalParameters(); i++) {
globalParamNames[i] = force.getGlobalParameterName(i);
globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
}
gpuSetCustomTorsionParameters(data.gpu, particle1, particle2, particle3, particle4, params, force.getEnergyFunction(), paramNames, globalParamNames);
if (globalParamValues.size() > 0)
SetCustomTorsionGlobalParams(globalParamValues);
data.gpu->forces.push_back(new ForceInfo(force));
}
/**
 * Refreshes the global parameters from the context, then invokes
 * kCalculateCustomTorsionForces().  Always returns 0.0; includeForces and
 * includeEnergy are not consulted.
 */
double CudaCalcCustomTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
{
    updateGlobalParams(context);
    kCalculateCustomTorsionForces(data.gpu);
    const double reportedEnergy = 0.0;
    return reportedEnergy;
}
/**
 * Re-reads every global parameter from the context (narrowed to float) and
 * stores it in globalParamValues.  SetCustomTorsionGlobalParams() is called
 * only when at least one value differs from the one previously stored.
 */
void CudaCalcCustomTorsionForceKernel::updateGlobalParams(ContextImpl& context) {
    bool dirty = false;
    const int count = (int) globalParamNames.size();
    for (int idx = 0; idx < count; ++idx) {
        const float current = static_cast<float>(context.getParameter(globalParamNames[idx]));
        dirty = dirty || (current != globalParamValues[idx]);
        globalParamValues[idx] = current;
    }
    if (!dirty)
        return;
    SetCustomTorsionGlobalParams(globalParamValues);
}
/** Unsupported operation: unconditionally throws OpenMMException. */
void CudaCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force)
{
    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
}
/**
 * Describes a NonbondedForce for comparison purposes.  Particles compare as
 * identical when charge, sigma and epsilon all match.  Each exception is a
 * group of two particles; two groups compare as identical when chargeProd,
 * sigma and epsilon all match.
 */
class CudaCalcNonbondedForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const NonbondedForce& force) : force(force) {
    }
    bool areParticlesIdentical(int particle1, int particle2) {
        double qFirst, sigFirst, epsFirst;
        double qSecond, sigSecond, epsSecond;
        force.getParticleParameters(particle1, qFirst, sigFirst, epsFirst);
        force.getParticleParameters(particle2, qSecond, sigSecond, epsSecond);
        if (qFirst != qSecond)
            return false;
        if (sigFirst != sigSecond)
            return false;
        return epsFirst == epsSecond;
    }
    // One group per exception.
    int getNumParticleGroups() {
        return force.getNumExceptions();
    }
    // Replaces the contents of `particles` with the two particles of exception `index`.
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int members[2];
        double unusedChargeProd, unusedSigma, unusedEpsilon;
        force.getExceptionParameters(index, members[0], members[1], unusedChargeProd, unusedSigma, unusedEpsilon);
        particles.assign(members, members + 2);
    }
    // Particle indices are fetched only because the getter requires them.
    bool areGroupsIdentical(int group1, int group2) {
        int a, b;
        double qqFirst, sigFirst, epsFirst;
        double qqSecond, sigSecond, epsSecond;
        force.getExceptionParameters(group1, a, b, qqFirst, sigFirst, epsFirst);
        force.getExceptionParameters(group2, a, b, qqSecond, sigSecond, epsSecond);
        if (qqFirst != qqSecond)
            return false;
        if (sigFirst != sigSecond)
            return false;
        return epsFirst == epsSecond;
    }
private:
    const NonbondedForce& force;
};
/** Destructor; its body is empty, so no cleanup is performed here. */
CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel()
{
}
/**
 * Sets up the GPU-side data for a NonbondedForce.
 *
 * In order, this method:
 *   1. records every exception as an excluded pair and remembers which
 *      exceptions have a nonzero chargeProd or epsilon;
 *   2. converts per-particle parameters to float arrays, selects the
 *      CudaNonbondedMethod, and passes everything to gpuSetCoulombParameters();
 *   3. computes data.ewaldSelfEnergy and data.dispersionCoefficient;
 *   4. passes the remembered exceptions to gpuSetLJ14Parameters();
 *   5. appends a ForceInfo for this force to data.gpu->forces.
 *
 * Throws OpenMMException when the method is Ewald or PME and
 * force.getReciprocalSpaceForceGroup() is greater than zero.
 */
void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
data.hasNonbonded = true;
numParticles = force.getNumParticles();
_gpuContext* gpu = data.gpu;
// Identify which exceptions are 1-4 interactions.
// Every exception becomes an excluded pair; only those with a nonzero
// chargeProd or epsilon are also kept (by index) in `exceptions`.
vector<pair<int, int> > exclusions;
vector<int> exceptions;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
exclusions.push_back(pair<int, int>(particle1, particle2));
if (chargeProd != 0.0 || epsilon != 0.0)
exceptions.push_back(i);
}
// Initialize nonbonded interactions.
{
vector<int> particle(numParticles);
vector<float> c6(numParticles);
vector<float> c12(numParticles);
vector<float> q(numParticles);
// `symbol` is left empty and passed through to gpuSetCoulombParameters() as is.
vector<char> symbol;
vector<vector<int> > exclusionList(numParticles);
for (int i = 0; i < numParticles; i++) {
double charge, radius, depth;
force.getParticleParameters(i, charge, radius, depth);
particle[i] = i;
q[i] = (float) charge;
// c6 = 4*depth*radius^6 and c12 = 4*depth*radius^12, narrowed to float.
c6[i] = (float) (4*depth*pow(radius, 6.0));
c12[i] = (float) (4*depth*pow(radius, 12.0));
// Each particle is listed in its own exclusion list.
exclusionList[i].push_back(i);
}
// Exclusions are recorded in both directions.
for (int i = 0; i < (int)exclusions.size(); i++) {
exclusionList[exclusions[i].first].push_back(exclusions[i].second);
exclusionList[exclusions[i].second].push_back(exclusions[i].first);
}
// Select the method. Anything other than NoCutoff first sets the cutoff and
// selects CUTOFF; the checks that follow may replace that with PERIODIC,
// EWALD or PARTICLE_MESH_EWALD.
CudaNonbondedMethod method = NO_CUTOFF;
if (force.getNonbondedMethod() != NonbondedForce::NoCutoff) {
gpuSetNonbondedCutoff(gpu, (float) force.getCutoffDistance(), (float) force.getReactionFieldDielectric());
method = CUTOFF;
}
if (force.getNonbondedMethod() == NonbondedForce::CutoffPeriodic) {
method = PERIODIC;
}
if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
// A reciprocal space force group greater than zero is rejected.
if (force.getReciprocalSpaceForceGroup() > 0)
throw OpenMMException("CudaPlatform does not support force groups");
if (force.getNonbondedMethod() == NonbondedForce::Ewald) {
double alpha;
int kmaxx, kmaxy, kmaxz;
NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
gpuSetEwaldParameters(gpu, (float) alpha, kmaxx, kmaxy, kmaxz);
method = EWALD;
}
else {
double alpha;
int gridSizeX, gridSizeY, gridSizeZ;
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ);
gpuSetPMEParameters(gpu, (float) alpha, gridSizeX, gridSizeY, gridSizeZ);
method = PARTICLE_MESH_EWALD;
}
}
data.nonbondedMethod = method;
gpuSetCoulombParameters(gpu, (float) ONE_4PI_EPS0, particle, c6, c12, q, symbol, exclusionList, method);
// Compute the Ewald self energy.
// NOTE(review): this reads gpu->sim.epsfac and gpu->sim.alphaEwald, which are
// presumably filled in by the gpuSet* calls above - confirm before reordering.
data.ewaldSelfEnergy = 0.0;
if (force.getNonbondedMethod() == NonbondedForce::Ewald || force.getNonbondedMethod() == NonbondedForce::PME) {
double selfEnergyScale = gpu->sim.epsfac*gpu->sim.alphaEwald/std::sqrt(PI);
for (int i = 0; i < numParticles; i++)
data.ewaldSelfEnergy -= selfEnergyScale*q[i]*q[i];
}
// Compute the long range dispersion correction.
if (force.getUseDispersionCorrection())
data.dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
else
data.dispersionCoefficient = 0.0;
}
// Initialize 1-4 nonbonded interactions.
{
int numExceptions = exceptions.size();
vector<int> particle1(numExceptions);
vector<int> particle2(numExceptions);
vector<float> c6(numExceptions);
vector<float> c12(numExceptions);
vector<float> q1(numExceptions);
vector<float> q2(numExceptions);
for (int i = 0; i < numExceptions; i++) {
double charge, sig, eps;
force.getExceptionParameters(exceptions[i], particle1[i], particle2[i], charge, sig, eps);
c6[i] = (float) (4*eps*pow(sig, 6.0));
c12[i] = (float) (4*eps*pow(sig, 12.0));
// The exception's charge product goes into q1; q2 is fixed at 1.
q1[i] = (float) charge;
q2[i] = 1.0f;
}
gpuSetLJ14Parameters(gpu, (float) ONE_4PI_EPS0, 1.0f, particle1, particle2, c6, c12, q1, q2);
}
data.gpu->forces.push_back(new ForceInfo(force));
}
/**
 * Performs no work in this method and always reports an energy of zero;
 * none of the arguments are used.
 */
double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal)
{
    const double reportedEnergy = 0.0;
    return reportedEnergy;
}
/** Unsupported operation: unconditionally throws OpenMMException. */
void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force)
{
    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
}
/**
 * Describes a CustomNonbondedForce for comparison purposes.  Particles
 * compare as identical when all of their per-particle parameters match.
 * Each exclusion is a group of two particles, and all groups are reported
 * as identical to one another.
 */
class CudaCalcCustomNonbondedForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const CustomNonbondedForce& force) : force(force) {
    }
    // Compares the per-particle parameters of two particles element by element.
    bool areParticlesIdentical(int particle1, int particle2) {
        vector<double> first, second;
        force.getParticleParameters(particle1, first);
        force.getParticleParameters(particle2, second);
        const int count = (int) first.size();
        int pos = 0;
        while (pos < count && first[pos] == second[pos])
            ++pos;
        return pos == count;
    }
    // One group per exclusion.
    int getNumParticleGroups() {
        return force.getNumExclusions();
    }
    // Replaces the contents of `particles` with the two particles of exclusion `index`.
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int members[2];
        force.getExclusionParticles(index, members[0], members[1]);
        particles.assign(members, members + 2);
    }
    // Exclusions carry no parameters here, so any two groups are reported as identical.
    bool areGroupsIdentical(int group1, int group2) {
        const bool alwaysSame = true;
        return alwaysSame;
    }
private:
    const CustomNonbondedForce& force;
};
/** Destructor; its body is empty, so no cleanup is performed here. */
CudaCalcCustomNonbondedForceKernel::~CudaCalcCustomNonbondedForceKernel()
{
}
void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
data.hasCustomNonbonded = true;
numParticles = force.getNumParticles();
_gpuContext* gpu = data.gpu;
// Initialize nonbonded interactions.
vector<int> particle(numParticles);
vector<vector<double> > parameters(numParticles);
vector<vector<int> > exclusionList(numParticles);
for (int i = 0; i < numParticles; i++) {
force.getParticleParameters(i, parameters[i]);
particle[i] = i;
exclusionList[i].push_back(i);
}
for (int i = 0; i < force.getNumExclusions(); i++) {
int particle1, particle2;
force.getExclusionParticles(i, particle1, particle2);
exclusionList[particle1].push_back(particle2);
exclusionList[particle2].push_back(particle1);
}
CudaNonbondedMethod method = NO_CUTOFF;
if (force.getNonbondedMethod() != CustomNonbondedForce::NoCutoff)
method = CUTOFF;
if (force.getNonbondedMethod() == CustomNonbondedForce::CutoffPeriodic) {
method = PERIODIC;
}
data.customNonbondedMethod = method;
// Record the tabulated functions.
for (int i = 0; i < force.getNumFunctions(); i++) {
string name;
vector<double> values;
double min, max;
force.getFunctionParameters(i, name, values, min, max);
gpuSetTabulatedFunction(gpu, i, name, values, min, max);
}
// Record information for the expressions.
vector<string> paramNames;
for (int i = 0; i < force.getNumPerParticleParameters(); i++)
paramNames.push_back(force.getPerParticleParameterName(i));
globalParamNames.resize(force.getNumGlobalParameters());
globalParamValues.resize(force.getNumGlobalParameters());
for (int i = 0; i < force.getNumGlobalParameters(); i++) {
globalParamNames[i] = force.getGlobalParameterName(i);
globalParamValues[i] = (float) force.getGlobalParameterDefaultValue(i);
}
gpuSetCustomNonbondedParameters(gpu, parameters, exclusionList, method, (float) force.getCutoffDistance(), force.getEnergyFunction(), paramNames, globalParamNames);
if (globalParamValues.size() > 0)
SetCustomNonbondedGlobalParams(globalParamValues);
data.gpu->forces.push_back(new ForceInfo(force));
}
/**
 * Refreshes the global parameters from the context and does nothing else.
 * Always returns 0.0; includeForces and includeEnergy are not consulted.
 */
double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
{
    updateGlobalParams(context);
    const double reportedEnergy = 0.0;
    return reportedEnergy;
}
/**
 * Re-reads every global parameter from the context (narrowed to float) and
 * stores it in globalParamValues.  SetCustomNonbondedGlobalParams() is
 * called only when at least one value differs from the one previously stored.
 */
void CudaCalcCustomNonbondedForceKernel::updateGlobalParams(ContextImpl& context) {
    int numChanged = 0;
    const int count = (int) globalParamNames.size();
    for (int idx = 0; idx < count; ++idx) {
        const float current = static_cast<float>(context.getParameter(globalParamNames[idx]));
        if (current != globalParamValues[idx])
            ++numChanged;
        globalParamValues[idx] = current;
    }
    if (numChanged > 0)
        SetCustomNonbondedGlobalParams(globalParamValues);
}
/** Unsupported operation: unconditionally throws OpenMMException. */
void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force)
{
    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
}
/**
 * Describes a GBSAOBCForce for comparison purposes: two particles compare
 * as identical when charge, radius and scale factor all match.
 */
class CudaCalcGBSAOBCForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const GBSAOBCForce& force) : force(force) {
    }
    bool areParticlesIdentical(int particle1, int particle2) {
        double qFirst, radiusFirst, scaleFirst;
        double qSecond, radiusSecond, scaleSecond;
        force.getParticleParameters(particle1, qFirst, radiusFirst, scaleFirst);
        force.getParticleParameters(particle2, qSecond, radiusSecond, scaleSecond);
        if (qFirst != qSecond)
            return false;
        if (radiusFirst != radiusSecond)
            return false;
        return scaleFirst == scaleSecond;
    }
private:
    const GBSAOBCForce& force;
};
/** Destructor; its body is empty, so no cleanup is performed here. */
CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel()
{
}
/**
 * Reads charge, radius and scaling factor for every particle in the system
 * from the GBSAOBCForce, narrows them to float, and passes them together
 * with the solute and solvent dielectrics to gpuSetObcParameters().  Also
 * appends a ForceInfo for this force to data.gpu->forces.
 *
 * @param system  supplies the number of particles to read
 * @param force   the force whose particle parameters are read
 */
void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
    int particleCount = system.getNumParticles();
    _gpuContext* gpu = data.gpu;

    vector<float> radius(particleCount), scale(particleCount), charge(particleCount);
    for (int p = 0; p < particleCount; ++p) {
        double q, r, s;
        force.getParticleParameters(p, q, r, s);
        radius[p] = static_cast<float>(r);
        scale[p] = static_cast<float>(s);
        charge[p] = static_cast<float>(q);
    }

    gpuSetObcParameters(gpu, (float) force.getSoluteDielectric(), (float) force.getSolventDielectric(), radius, scale, charge);
    data.gpu->forces.push_back(new ForceInfo(force));
}
/**
 * Performs no work in this method and always reports an energy of zero;
 * none of the arguments are used.
 */
double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
{
    const double reportedEnergy = 0.0;
    return reportedEnergy;
}
/** Unsupported operation: unconditionally throws OpenMMException. */
void CudaCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force)
{
    throw OpenMMException("CudaPlatform does not support copyParametersToContext");
}
/**
 * Describes a GBVIForce for comparison purposes: two particles compare as
 * identical when charge, radius and gamma all match.
 */
class CudaCalcGBVIForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const GBVIForce& force) : force(force) {
    }
    bool areParticlesIdentical(int particle1, int particle2) {
        double qFirst, radiusFirst, gammaFirst;
        double qSecond, radiusSecond, gammaSecond;
        force.getParticleParameters(particle1, qFirst, radiusFirst, gammaFirst);
        force.getParticleParameters(particle2, qSecond, radiusSecond, gammaSecond);
        if (qFirst != qSecond)
            return false;
        if (radiusFirst != radiusSecond)
            return false;
        return gammaFirst == gammaSecond;
    }
private:
    const GBVIForce& force;
};
/** Destructor; its body is empty, so no cleanup is performed here. */
CudaCalcGBVIForceKernel::~CudaCalcGBVIForceKernel()
{
}
void CudaCalcGBVIForceKernel::initialize(const System& system, const GBVIForce& force, const std::vector<double> & inputScaledRadii) {
int numParticles = system.getNumParticles();
_gpuContext* gpu = data.gpu;
vector<int> particle(numParticles);
vector<float> radius(numParticles);
vector<float> scaledRadii(numParticles);
vector<float> gammas(numParticles);
for (int i = 0; i < numParticles; i++) {
double charge, particleRadius, gamma;
force.getParticleParameters(i, charge, particleRadius, gamma );
particle[i] = i;
radius[i] = (float) particleRadius;
gammas[i] = (float) gamma;
scaledRadii[i] = (float) inputScaledRadii[i];
}
int gbviBornRadiusScalingMethod;
if( force.getBornRadiusScalingMethod() == GBVIForce::QuinticSpline ){
gbviBornRadiusScalingMethod = 1;
} else {
gbviBornRadiusScalingMethod = 2;
}
gpuSetGBVIParameters(gpu, (float) force.getSoluteDielectric(), (float) force.getSolventDielectric(), particle,
radius, gammas, scaledRadii, gbviBornRadiusScalingMethod,
static_cast<float>(force.getQuinticLowerLimitFactor()),
static_cast<float>(force.getQuinticUpperBornRadiusLimit()) );
data.gpu->forces.push_back(new ForceInfo(force));
}
/**
 * Performs no work in this method and always reports an energy of zero;
 * none of the arguments are used.
 */
double CudaCalcGBVIForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy)
{
    const double reportedEnergy = 0.0;
    return reportedEnergy;
}
/**
 * Describes a CustomExternalForce for comparison purposes.
 *
 * The constructor builds `indices`, which maps a particle index to the
 * force entry that refers to it, or -1 when no entry does.  Two particles
 * compare as identical when neither has an entry, or when both have entries
 * whose parameters all match.
 */
class CudaCalcCustomExternalForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const CustomExternalForce& force, int numParticles) : force(force), indices(numParticles, -1) {
        vector<double> scratch;
        for (int entry = 0; entry < force.getNumParticles(); ++entry) {
            int target;
            force.getParticleParameters(entry, target, scratch);
            // A later entry for the same particle overwrites an earlier one.
            indices[target] = entry;
        }
    }
    bool areParticlesIdentical(int particle1, int particle2) {
        const int entry1 = indices[particle1];
        const int entry2 = indices[particle2];
        const bool missing1 = (entry1 == -1);
        const bool missing2 = (entry2 == -1);
        // With at least one entry missing, the particles match only if both are missing.
        if (missing1 || missing2)
            return missing1 && missing2;
        int unusedParticle;
        vector<double> first, second;
        force.getParticleParameters(entry1, unusedParticle, first);
        force.getParticleParameters(entry2, unusedParticle, second);
        const int count = (int) first.size();
        int pos = 0;
        while (pos < count && first[pos] == second[pos])
            ++pos;
        return pos == count;
    }
private:
    const CustomExternalForce& force;
    vector<int> indices;
};
// Destructor. The body is intentionally empty: no cleanup is performed here.
CudaCalcCustomExternalForceKernel::~CudaCalcCustomExternalForceKernel() {
}
// Gathers everything the GPU needs for a CustomExternalForce: the per-particle
// terms, the per-particle parameter names, and the global parameter names with
// their default values. The collected data is handed to
// gpuSetCustomExternalParameters(), the global values are pushed if there are
// any, and a ForceInfo describing the force is registered with the GPU context.
void CudaCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
    // One entry per term in the force: the particle it acts on and its parameters.
    numParticles = force.getNumParticles();
    vector<int> particle(numParticles);
    vector<vector<double> > params(numParticles);
    for (int term = 0; term < numParticles; ++term)
        force.getParticleParameters(term, particle[term], params[term]);

    // Names of the per-particle parameters, in declaration order.
    vector<string> paramNames;
    const int perParticleCount = force.getNumPerParticleParameters();
    for (int p = 0; p < perParticleCount; ++p)
        paramNames.push_back(force.getPerParticleParameterName(p));

    // Global parameters: remember names and seed the cache with the defaults.
    const int globalCount = force.getNumGlobalParameters();
    globalParamNames.resize(globalCount);
    globalParamValues.resize(globalCount);
    for (int g = 0; g < globalCount; ++g) {
        globalParamNames[g] = force.getGlobalParameterName(g);
        globalParamValues[g] = (float) force.getGlobalParameterDefaultValue(g);
    }

    gpuSetCustomExternalParameters(data.gpu, particle, params, force.getEnergyFunction(), paramNames, globalParamNames);
    if (!globalParamValues.empty())
        SetCustomExternalGlobalParams(globalParamValues);
    data.gpu->forces.push_back(new ForceInfo(force, system.getNumParticles()));
}
// Refreshes any changed global parameters, then launches the custom external
// force calculation. The value returned to the caller is always 0.0.
double CudaCalcCustomExternalForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    updateGlobalParams(context);
    _gpuContext* gpu = data.gpu;
    kCalculateCustomExternalForces(gpu);
    return 0.0;
}
// Re-reads every global parameter from the context (narrowed to float), stores
// it in the local cache, and pushes the whole cache via
// SetCustomExternalGlobalParams() only if at least one value differs from what
// was cached before.
void CudaCalcCustomExternalForceKernel::updateGlobalParams(ContextImpl& context) {
    const int globalCount = (int) globalParamNames.size();
    bool dirty = false;
    for (int g = 0; g < globalCount; ++g) {
        const float current = (float) context.getParameter(globalParamNames[g]);
        if (current != globalParamValues[g]) {
            dirty = true;
        }
        // Stored unconditionally, exactly as the comparison result requires no branch here.
        globalParamValues[g] = current;
    }
    if (!dirty)
        return;
    SetCustomExternalGlobalParams(globalParamValues);
}
// Updating parameters of an existing context is not implemented for this
// kernel: the call always throws OpenMMException and never touches `context`
// or `force`.
void CudaCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& context, const CustomExternalForce& force) {
throw OpenMMException("CudaPlatform does not support copyParametersToContext");
}
// Shared setup that every integrator kernel runs from its initialize().
//
// Steps, in order:
//   1. For each interaction type that no Force has claimed (per the flags in
//      `data`), register empty parameter lists with the GPU context.
//   2. Upload particle masses (narrowed to float).
//   3. Upload the distance constraints together with the inverse masses of the
//      two constrained particles and the integrator's constraint tolerance.
//   4. Build the work/exclusion lists and output buffers, push constants, clear
//      the force buffers, and wait for the device.
//
// Throws OpenMMException if GBSA or GBVI is enabled on the GPU context while no
// NonbondedForce has been registered.
void OPENMMCUDA_EXPORT OpenMM::cudaOpenMMInitializeIntegration(const System& system, CudaPlatform::PlatformData& data, const Integrator& integrator) {
    _gpuContext* gpu = data.gpu;

    // --- 1. Interaction types that no Force has already handled -------------
    if (!data.hasBonds) {
        gpuSetBondParameters(gpu, vector<int>(), vector<int>(), vector<float>(), vector<float>());
    }
    if (!data.hasAngles) {
        gpuSetBondAngleParameters(gpu, vector<int>(), vector<int>(), vector<int>(), vector<float>(), vector<float>());
    }
    if (!data.hasPeriodicTorsions) {
        gpuSetDihedralParameters(gpu, vector<int>(), vector<int>(), vector<int>(), vector<int>(), vector<float>(), vector<float>(), vector<int>());
    }
    if (!data.hasRB) {
        gpuSetRbDihedralParameters(gpu, vector<int>(), vector<int>(), vector<int>(), vector<int>(), vector<float>(), vector<float>(),
                vector<float>(), vector<float>(), vector<float>(), vector<float>());
    }
    if (!data.hasNonbonded) {
        gpuSetCoulombParameters(gpu, (float) ONE_4PI_EPS0, vector<int>(), vector<float>(), vector<float>(), vector<float>(), vector<char>(), vector<vector<int> >(), NO_CUTOFF);
        gpuSetLJ14Parameters(gpu, (float) ONE_4PI_EPS0, 1.0f, vector<int>(), vector<int>(), vector<float>(), vector<float>(), vector<float>(), vector<float>());
        const bool implicitSolventRequested = (gpu->bIncludeGBSA || gpu->bIncludeGBVI);
        if (implicitSolventRequested)
            throw OpenMMException("CudaPlatform requires GBSAOBCForce and GBVIForce to be used with a NonbondedForce");
    }

    // --- 2. Masses ----------------------------------------------------------
    const int particleCount = system.getNumParticles();
    vector<float> masses(particleCount);
    for (int p = 0; p < particleCount; ++p)
        masses[p] = (float) system.getParticleMass(p);
    gpuSetMass(gpu, masses);

    // --- 3. Constraints -----------------------------------------------------
    const int constraintCount = system.getNumConstraints();
    vector<int> atomA(constraintCount);
    vector<int> atomB(constraintCount);
    vector<float> length(constraintCount);
    vector<float> invMassA(constraintCount);
    vector<float> invMassB(constraintCount);
    for (int c = 0; c < constraintCount; ++c) {
        int a, b;
        double r0;
        system.getConstraintParameters(c, a, b, r0);
        atomA[c] = a;
        atomB[c] = b;
        length[c] = (float) r0;
        invMassA[c] = 1.0f/masses[a];
        invMassB[c] = 1.0f/masses[b];
    }
    gpuSetConstraintParameters(gpu, atomA, atomB, length, invMassA, invMassB, (float)integrator.getConstraintTolerance());

    // --- 4. Finish initialization -------------------------------------------
    gpuBuildThreadBlockWorkList(gpu);
    gpuBuildExclusionList(gpu);
    gpuBuildOutputBuffers(gpu);
    gpuSetConstants(gpu);
    if (!(gpu->bIncludeGBSA || gpu->bIncludeGBVI))
        kClearForces(gpu);
    else
        kClearBornSumAndForces(gpu);
    cudaThreadSynchronize();
}
// Destructor. The body is intentionally empty: no cleanup is performed here.
CudaIntegrateVerletStepKernel::~CudaIntegrateVerletStepKernel() {
}
// Runs the shared integration setup, then resets the cached step size.
void CudaIntegrateVerletStepKernel::initialize(const System& system, const VerletIntegrator& integrator) {
cudaOpenMMInitializeIntegration(system, data, integrator);
// Sentinel: execute() re-uploads integration parameters whenever the
// integrator's step size differs from prevStepSize, so -1.0 forces an upload
// on the first step (assuming step sizes are never -1.0).
prevStepSize = -1.0;
}
// Advances the simulation by one Verlet step.
// Integration parameters are re-uploaded only when the integrator's step size
// differs from the cached one. The update itself is: part 1, the three
// constraint passes (SHAKE, SETTLE, CCMA), an optional request for
// center-of-mass handling, then part 2. Finally the platform's time and step
// counter are advanced.
void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIntegrator& integrator) {
    _gpuContext* gpu = data.gpu;
    const double dt = integrator.getStepSize();
    const bool stepSizeChanged = (dt != prevStepSize);
    if (stepSizeChanged) {
        // Initialize the GPU parameters.
        gpuSetVerletIntegrationParameters(gpu, (float) dt, 0.0f);
        gpuSetConstants(gpu);
        prevStepSize = dt;
    }

    kVerletUpdatePart1(gpu);
    kApplyShake(gpu);
    kApplySettle(gpu);
    kApplyCCMA(gpu);

    // Flag center-of-mass handling on steps that are a multiple of the
    // configured frequency; the modulo is only evaluated when removal is on.
    if (data.removeCM && data.stepCount%data.cmMotionFrequency == 0)
        gpu->bCalculateCM = true;

    kVerletUpdatePart2(gpu);
    data.time += dt;
    data.stepCount++;
}
// Destructor. The body is intentionally empty: no cleanup is performed here.
CudaIntegrateLangevinStepKernel::~CudaIntegrateLangevinStepKernel() {
}
void CudaIntegrateLangevinStepKernel::initialize(const System& system, const LangevinIntegrator& integrator) {
cudaOpenMMInitializeIntegration(system, data, integrator);
_gpuContext* gpu = data.gpu;
gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
gpuInitializeRandoms(gpu);
prevTemp = -1.0;
prevFriction = -1.0;
prevStepSize = -1.0;
}
// Advances the simulation by one Langevin step.
// When temperature, friction or step size differ from the cached values the
// GPU parameters are rebuilt (tau is 1/friction, or 0 when friction is 0) and
// fresh random numbers are generated. The update runs part 1, optionally flags
// center-of-mass handling, runs part 2, applies the three constraint passes and
// recomputes velocities from positions. Time and step count are then advanced.
void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const LangevinIntegrator& integrator) {
    _gpuContext* gpu = data.gpu;
    const double temp = integrator.getTemperature();
    const double gamma = integrator.getFriction();
    const double dt = integrator.getStepSize();

    const bool cacheValid = (temp == prevTemp && gamma == prevFriction && dt == prevStepSize);
    if (!cacheValid) {
        // Initialize the GPU parameters.
        double tau = 0.0;
        if (!(gamma == 0.0))
            tau = 1.0/gamma;
        gpuSetLangevinIntegrationParameters(gpu, (float) tau, (float) dt, (float) temp, 0.0f);
        gpuSetConstants(gpu);
        kGenerateRandoms(gpu);
        prevTemp = temp;
        prevFriction = gamma;
        prevStepSize = dt;
    }

    kLangevinUpdatePart1(gpu);
    if (data.removeCM && data.stepCount%data.cmMotionFrequency == 0)
        gpu->bCalculateCM = true;
    kLangevinUpdatePart2(gpu);
    kApplyShake(gpu);
    kApplySettle(gpu);
    kApplyCCMA(gpu);
    kSetVelocitiesFromPositions(gpu);

    data.time += dt;
    data.stepCount++;
}
// Destructor. The body is intentionally empty: no cleanup is performed here.
CudaIntegrateBrownianStepKernel::~CudaIntegrateBrownianStepKernel() {
}
void CudaIntegrateBrownianStepKernel::initialize(const System& system, const BrownianIntegrator& integrator) {
cudaOpenMMInitializeIntegration(system, data, integrator);
_gpuContext* gpu = data.gpu;
gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
gpuInitializeRandoms(gpu);
prevTemp = -1.0;
prevFriction = -1.0;
prevStepSize = -1.0;
}
// Advances the simulation by one Brownian dynamics step.
// When temperature, friction or step size differ from the cached values the
// GPU parameters are rebuilt (tau is 1/friction, or 0 when friction is 0) and
// fresh random numbers are generated. The update runs part 1, the three
// constraint passes, optionally flags center-of-mass handling, then part 2.
// Time and step count are advanced at the end.
void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const BrownianIntegrator& integrator) {
    _gpuContext* gpu = data.gpu;
    const double temp = integrator.getTemperature();
    const double gamma = integrator.getFriction();
    const double dt = integrator.getStepSize();

    const bool cacheValid = (temp == prevTemp && gamma == prevFriction && dt == prevStepSize);
    if (!cacheValid) {
        // Initialize the GPU parameters.
        double tau = 0.0;
        if (!(gamma == 0.0))
            tau = 1.0/gamma;
        gpuSetBrownianIntegrationParameters(gpu, (float) tau, (float) dt, (float) temp);
        gpuSetConstants(gpu);
        kGenerateRandoms(gpu);
        prevTemp = temp;
        prevFriction = gamma;
        prevStepSize = dt;
    }

    kBrownianUpdatePart1(gpu);
    kApplyShake(gpu);
    kApplySettle(gpu);
    kApplyCCMA(gpu);
    if (data.removeCM && data.stepCount%data.cmMotionFrequency == 0)
        gpu->bCalculateCM = true;
    kBrownianUpdatePart2(gpu);

    data.time += dt;
    data.stepCount++;
}
// Destructor. The body is intentionally empty: no cleanup is performed here.
CudaIntegrateVariableVerletStepKernel::~CudaIntegrateVariableVerletStepKernel() {
}
// Runs the shared integration setup, then resets the cached error tolerance.
void CudaIntegrateVariableVerletStepKernel::initialize(const System& system, const VariableVerletIntegrator& integrator) {
cudaOpenMMInitializeIntegration(system, data, integrator);
// Sentinel: execute() re-uploads integration parameters whenever the
// integrator's error tolerance differs from prevErrorTol, so -1.0 forces an
// upload on the first step (assuming tolerances are never -1.0).
prevErrorTol = -1.0;
}
// Takes one variable-size Verlet step that does not go past maxTime.
// Integration parameters are re-uploaded only when the error tolerance has
// changed. The GPU selects the step size (capped at maxTime minus the current
// time, narrowed to float), the Verlet update runs with constraints and the
// optional center-of-mass flag, and the chosen step size is downloaded.
// Returns the step size that was actually taken.
double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
    _gpuContext* gpu = data.gpu;
    const double tolerance = integrator.getErrorTolerance();
    if (tolerance != prevErrorTol) {
        // Initialize the GPU parameters.
        gpuSetVerletIntegrationParameters(gpu, 0.0f, (float) tolerance);
        gpuSetConstants(gpu);
        prevErrorTol = tolerance;
    }

    const float maxStepSize = (float)(maxTime-data.time);
    kSelectVerletStepSize(gpu, maxStepSize);
    kVerletUpdatePart1(gpu);
    kApplyShake(gpu);
    kApplySettle(gpu);
    kApplyCCMA(gpu);
    if (data.removeCM && data.stepCount%data.cmMotionFrequency == 0)
        gpu->bCalculateCM = true;
    kVerletUpdatePart2(gpu);

    gpu->psStepSize->Download();
    if ((*gpu->psStepSize)[0].y == maxStepSize) {
        // The full remaining interval was used: land exactly on maxTime to
        // avoid round-off error.
        data.time = maxTime;
    }
    else {
        data.time += (*gpu->psStepSize)[0].y;
    }
    data.stepCount++;
    return (*gpu->psStepSize)[0].y;
}
// Destructor. The body is intentionally empty: no cleanup is performed here.
CudaIntegrateVariableLangevinStepKernel::~CudaIntegrateVariableLangevinStepKernel() {
}
void CudaIntegrateVariableLangevinStepKernel::initialize(const System& system, const VariableLangevinIntegrator& integrator) {
cudaOpenMMInitializeIntegration(system, data, integrator);
_gpuContext* gpu = data.gpu;
gpu->seed = (unsigned long) integrator.getRandomNumberSeed();
gpuInitializeRandoms(gpu);
prevTemp = -1.0;
prevFriction = -1.0;
prevErrorTol = -1.0;
}
// Takes one variable-size Langevin step that does not go past maxTime.
// When temperature, friction or error tolerance differ from the cached values
// the GPU parameters are rebuilt (tau is 1/friction, or 0 when friction is 0)
// and fresh random numbers are generated. The GPU selects the step size (capped
// at maxTime minus the current time, narrowed to float), the Langevin update
// runs, constraints are applied, velocities are recomputed from positions, and
// the chosen step size is downloaded. Returns the step size actually taken.
double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
    _gpuContext* gpu = data.gpu;
    const double temp = integrator.getTemperature();
    const double gamma = integrator.getFriction();
    const double tolerance = integrator.getErrorTolerance();

    const bool cacheValid = (temp == prevTemp && gamma == prevFriction && tolerance == prevErrorTol);
    if (!cacheValid) {
        // Initialize the GPU parameters.
        double tau = 0.0;
        if (!(gamma == 0.0))
            tau = 1.0/gamma;
        gpuSetLangevinIntegrationParameters(gpu, (float) tau, 0.0f, (float) temp, (float) tolerance);
        gpuSetConstants(gpu);
        kGenerateRandoms(gpu);
        prevTemp = temp;
        prevFriction = gamma;
        prevErrorTol = tolerance;
    }

    const float maxStepSize = (float)(maxTime-data.time);
    kSelectLangevinStepSize(gpu, maxStepSize);
    kLangevinUpdatePart1(gpu);
    if (data.removeCM && data.stepCount%data.cmMotionFrequency == 0)
        gpu->bCalculateCM = true;
    kLangevinUpdatePart2(gpu);
    kApplyShake(gpu);
    kApplySettle(gpu);
    kApplyCCMA(gpu);
    kSetVelocitiesFromPositions(gpu);

    gpu->psStepSize->Download();
    if ((*gpu->psStepSize)[0].y == maxStepSize) {
        // The full remaining interval was used: land exactly on maxTime to
        // avoid round-off error.
        data.time = maxTime;
    }
    else {
        data.time += (*gpu->psStepSize)[0].y;
    }
    data.stepCount++;
    return (*gpu->psStepSize)[0].y;
}
// Releases the atom-group stream allocated in initialize().
// No null test is needed: applying delete to a null pointer has no effect.
CudaApplyAndersenThermostatKernel::~CudaApplyAndersenThermostatKernel() {
    delete atomGroups;
}
void CudaApplyAndersenThermostatKernel::initialize(const System& system, const AndersenThermostat& thermostat) {
_gpuContext* gpu = data.gpu;
gpu->seed = (unsigned long) thermostat.getRandomNumberSeed();
gpuInitializeRandoms(gpu);
prevTemp = -1.0;
prevFrequency = -1.0;
prevStepSize = -1.0;
// Create the arrays with the group definitions.
vector<vector<int> > groups = AndersenThermostatImpl::calcParticleGroups(system);
atomGroups = new CUDAStream<int>(system.getNumParticles(), 1, "atomGroups");
for (int i = 0; i < (int) groups.size(); i++) {
for (int j = 0; j < (int) groups[i].size(); j++)
(*atomGroups)[groups[i][j]] = i;
}
atomGroups->Upload();
}
// Applies the Andersen thermostat for the current step.
// Temperature and collision frequency are read from the context's parameters
// and the step size from its integrator; if any of the three differs from the
// cached value, the GPU thermostat parameters are rebuilt and fresh random
// numbers are generated before the thermostat kernel is launched.
void CudaApplyAndersenThermostatKernel::execute(ContextImpl& context) {
    _gpuContext* gpu = data.gpu;
    const double temp = context.getParameter(AndersenThermostat::Temperature());
    const double collisionRate = context.getParameter(AndersenThermostat::CollisionFrequency());
    const double dt = context.getIntegrator().getStepSize();

    const bool cacheValid = (temp == prevTemp && collisionRate == prevFrequency && dt == prevStepSize);
    if (!cacheValid) {
        // Initialize the GPU parameters.
        gpuSetAndersenThermostatParameters(gpu, (float) temp, (float) collisionRate);
        gpuSetConstants(gpu);
        kGenerateRandoms(gpu);
        prevTemp = temp;
        prevFrequency = collisionRate;
        prevStepSize = dt;
    }
    kCalculateAndersenThermostat(gpu, *atomGroups);
}
// Releases the two molecule-definition streams created lazily by
// scaleCoordinates(). No null tests are needed: applying delete to a null
// pointer has no effect.
CudaApplyMonteCarloBarostatKernel::~CudaApplyMonteCarloBarostatKernel() {
    delete moleculeAtoms;
    delete moleculeStartIndex;
}
// Intentionally does nothing: the molecule tables this kernel needs are built
// lazily on the first call to scaleCoordinates().
void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const MonteCarloBarostat& thermostat) {
}
// Scales particle coordinates for a trial barostat move.
//
// On the first call the molecule definitions from the context are flattened
// into two GPU streams:
//   - moleculeAtoms:      the particle indices of every molecule, concatenated;
//   - moleculeStartIndex: for molecule i, the offset of its first particle in
//                         moleculeAtoms, plus one trailing entry holding the
//                         total number of particles written.
// Every call then saves the current positions (psPosq4 -> psPosqP4) so that
// restoreCoordinates() can undo the move, launches the scaling kernel, and
// resets all cached cell offsets to zero.
//
// @param context  the context whose molecules and positions are used
// @param scale    the scale factor forwarded to kScaleAtomCoordinates()
void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, double scale) {
    if (!hasInitializedMolecules) {
        hasInitializedMolecules = true;

        // Create the arrays with the molecule definitions.
        // The list is only read here, so bind it by const reference rather than
        // deep-copying a vector of vectors. This is valid whether getMolecules()
        // returns a reference or a temporary (whose lifetime the reference extends).
        const vector<vector<int> >& molecules = context.getMolecules();
        numMolecules = molecules.size();
        moleculeAtoms = new CUDAStream<int>(context.getSystem().getNumParticles(), 1, "moleculeAtoms");
        moleculeStartIndex = new CUDAStream<int>(numMolecules+1, 1, "moleculeStartIndex");
        int index = 0;
        for (int i = 0; i < numMolecules; i++) {
            (*moleculeStartIndex)[i] = index;
            for (int j = 0; j < (int) molecules[i].size(); j++)
                (*moleculeAtoms)[index++] = molecules[i][j];
        }
        // Trailing sentinel: one past the last particle of the last molecule.
        (*moleculeStartIndex)[numMolecules] = index;
        moleculeAtoms->Upload();
        moleculeStartIndex->Upload();
    }
    _gpuContext* gpu = data.gpu;
    // Back up the positions before scaling them.
    gpu->psPosqP4->CopyFrom(*gpu->psPosq4);
    kScaleAtomCoordinates(gpu, scale, *moleculeAtoms, *moleculeStartIndex);
    for (int i = 0; i < (int) gpu->posCellOffsets.size(); i++)
        gpu->posCellOffsets[i] = make_int3(0, 0, 0);
}
// Undoes the last scaleCoordinates() call by copying the backup positions
// (psPosqP4) back over the live positions (psPosq4).
void CudaApplyMonteCarloBarostatKernel::restoreCoordinates(ContextImpl& context) {
    _gpuContext* const gpu = data.gpu;
    gpu->psPosq4->CopyFrom(*(gpu->psPosqP4));
}
// Caches the mass of every particle in the System so that execute() can
// compute the kinetic energy without querying the System again.
void CudaCalcKineticEnergyKernel::initialize(const System& system) {
    const int particleCount = system.getNumParticles();
    masses.resize(particleCount);
    for (int p = 0; p != particleCount; ++p) {
        masses[p] = system.getParticleMass(p);
    }
}
// Computes the total kinetic energy, 0.5 * sum_i m_i * |v_i|^2.
//
// We don't currently have a GPU kernel to do this, so the velocities are
// downloaded from the device and the sum is formed on the CPU using the masses
// cached by initialize().
//
// @param context  unused by this implementation
// @return the kinetic energy of the system
double CudaCalcKineticEnergyKernel::execute(ContextImpl& context) {
    _gpuContext* gpu = data.gpu;
    gpu->psVelm4->Download();
    double energy = 0.0;
    for (int i = 0; i < (int) masses.size(); ++i) {
        float4 v = (*gpu->psVelm4)[i];
        // Widen each component before squaring so that the squared speed is
        // formed in double precision; squaring and adding the raw float
        // components would round in single precision before the result ever
        // reaches the double accumulator.
        const double vx = v.x;
        const double vy = v.y;
        const double vz = v.z;
        energy += masses[i]*(vx*vx+vy*vy+vz*vz);
    }
    return 0.5*energy;
}
// Records in the shared platform data that center-of-mass motion removal is
// enabled and how often (in steps) it should happen. The integrator kernels in
// this file read these two fields on every step.
void CudaRemoveCMMotionKernel::initialize(const System& system, const CMMotionRemover& force) {
data.removeCM = true;
data.cmMotionFrequency = force.getFrequency();
}
// Intentionally does nothing: the integrator kernels in this file check
// data.removeCM / data.cmMotionFrequency themselves and set gpu->bCalculateCM
// on the appropriate steps.
void CudaRemoveCMMotionKernel::execute(ContextImpl& context) {
}
#ifndef OPENMM_CUDAKERNELS_H_
#define OPENMM_CUDAKERNELS_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaPlatform.h"
#include "openmm/kernels.h"
#include "kernels/gputypes.h"
#include "openmm/System.h"
class CudaAndersenThermostat;
class CudaBrownianDynamics;
class CudaStochasticDynamics;
class CudaShakeAlgorithm;
class CudaVerletDynamics;
namespace OpenMM {
// Export internal cudaOpenMMInitializeIntegration() method so it can be used by NML plugin.
// This is the setup routine shared by the integrator kernels' initialize() methods.
//   system     - the System whose masses and constraints are uploaded
//   data       - the CUDA platform data (holds the GPU context that is configured)
//   integrator - the Integrator supplying the constraint tolerance
void OPENMMCUDA_EXPORT cudaOpenMMInitializeIntegration(const System& system, CudaPlatform::PlatformData& data, const Integrator& integrator);
/**
* This kernel is invoked at the beginning and end of force and energy computations. It gives the
* Platform a chance to clear buffers and do other initialization at the beginning, and to do any
* necessary work at the end to determine the final results.
*/
class CudaCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
public:
/**
* Create the kernel.
*
* @param name the name forwarded to the CalcForcesAndEnergyKernel base class
* @param platform the Platform forwarded to the base class
* @param data the CUDA platform data; stored by reference, not copied
*/
CudaCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : CalcForcesAndEnergyKernel(name, platform), data(data) {
}
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
*/
void initialize(const System& system);
/**
* This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
* any ForceImpl.
*
* @param context the context in which to execute this kernel
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @param groups a set of bit flags for which force groups to include
*/
void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
/**
* This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
* every ForceImpl.
*
* @param context the context in which to execute this kernel
* @param includeForce true if forces should be computed
* @param includeEnergy true if potential energy should be computed
* @param groups a set of bit flags for which force groups to include
* @return the potential energy of the system. This value is added to all values returned by ForceImpls'
* calcForcesAndEnergy() methods. That is, each force kernel may <i>either</i> return its contribution to the
* energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
*/
double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
private:
// Platform data supplied at construction (held by reference).
CudaPlatform::PlatformData& data;
};
/**
* This kernel provides methods for setting and retrieving various state data: time, positions,
* velocities, and forces.
*/
class CudaUpdateStateDataKernel : public UpdateStateDataKernel {
public:
/**
* Create the kernel.
*
* @param name the name forwarded to the UpdateStateDataKernel base class
* @param platform the Platform forwarded to the base class
* @param data the CUDA platform data; stored by reference, not copied
*/
CudaUpdateStateDataKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : UpdateStateDataKernel(name, platform), data(data) {
}
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
*/
void initialize(const System& system);
/**
* Get the current time (in picoseconds).
*
* @param context the context in which to execute this kernel
* @return the current time
*/
double getTime(const ContextImpl& context) const;
/**
* Set the current time (in picoseconds).
*
* @param context the context in which to execute this kernel
* @param time the new time
*/
void setTime(ContextImpl& context, double time);
/**
* Get the positions of all particles.
*
* @param context the context in which to execute this kernel
* @param positions on exit, this contains the particle positions
*/
void getPositions(ContextImpl& context, std::vector<Vec3>& positions);
/**
* Set the positions of all particles.
*
* @param context the context in which to execute this kernel
* @param positions a vector containing the particle positions
*/
void setPositions(ContextImpl& context, const std::vector<Vec3>& positions);
/**
* Get the velocities of all particles.
*
* @param context the context in which to execute this kernel
* @param velocities on exit, this contains the particle velocities
*/
void getVelocities(ContextImpl& context, std::vector<Vec3>& velocities);
/**
* Set the velocities of all particles.
*
* @param context the context in which to execute this kernel
* @param velocities a vector containing the particle velocities
*/
void setVelocities(ContextImpl& context, const std::vector<Vec3>& velocities);
/**
* Get the current forces on all particles.
*
* @param context the context in which to execute this kernel
* @param forces on exit, this contains the forces
*/
void getForces(ContextImpl& context, std::vector<Vec3>& forces);
/**
* Get the current periodic box vectors.
*
* @param context the context in which to execute this kernel
* @param a on exit, this contains the vector defining the first edge of the periodic box
* @param b on exit, this contains the vector defining the second edge of the periodic box
* @param c on exit, this contains the vector defining the third edge of the periodic box
*/
void getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const;
/**
* Set the current periodic box vectors.
*
* Note that this setter is declared const.
*
* @param context the context in which to execute this kernel
* @param a the vector defining the first edge of the periodic box
* @param b the vector defining the second edge of the periodic box
* @param c the vector defining the third edge of the periodic box
*/
void setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const;
/**
* Create a checkpoint recording the current state of the Context.
*
* @param context the context in which to execute this kernel
* @param stream an output stream the checkpoint data should be written to
*/
void createCheckpoint(ContextImpl& context, std::ostream& stream);
/**
* Load a checkpoint that was written by createCheckpoint().
*
* @param context the context in which to execute this kernel
* @param stream an input stream the checkpoint data should be read from
*/
void loadCheckpoint(ContextImpl& context, std::istream& stream);
private:
// Platform data supplied at construction (held by reference).
CudaPlatform::PlatformData& data;
};
/**
* This kernel modifies the positions of particles to enforce distance constraints.
*/
class CudaApplyConstraintsKernel : public ApplyConstraintsKernel {
public:
/**
* Create the kernel.
*
* @param name the name forwarded to the ApplyConstraintsKernel base class
* @param platform the Platform forwarded to the base class
* @param data the CUDA platform data; stored by reference, not copied
*/
CudaApplyConstraintsKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : ApplyConstraintsKernel(name, platform), data(data) {
}
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
*/
void initialize(const System& system);
/**
* Update particle positions to enforce constraints.
*
* @param context the context in which to execute this kernel
* @param tol the distance tolerance within which constraints must be satisfied.
*/
void apply(ContextImpl& context, double tol);
private:
// Platform data supplied at construction (held by reference).
CudaPlatform::PlatformData& data;
};
/**
* This kernel recomputes the positions of virtual sites.
*/
class CudaVirtualSitesKernel : public VirtualSitesKernel {
public:
/**
* Create the kernel. Unlike the other CUDA kernels declared in this header,
* this one takes no platform data and declares no data members.
*
* @param name the name forwarded to the VirtualSitesKernel base class
* @param platform the Platform forwarded to the base class
*/
CudaVirtualSitesKernel(std::string name, const Platform& platform) : VirtualSitesKernel(name, platform) {
}
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
*/
void initialize(const System& system);
/**
* Compute the virtual site locations.
*
* @param context the context in which to execute this kernel
*/
void computePositions(ContextImpl& context);
};
/**
* This kernel is invoked by HarmonicBondForce to calculate the forces acting on the system and the energy of the system.
*/
class CudaCalcHarmonicBondForceKernel : public CalcHarmonicBondForceKernel {
public:
/**
* Create the kernel.
*
* @param name the name forwarded to the CalcHarmonicBondForceKernel base class
* @param platform the Platform forwarded to the base class
* @param data the CUDA platform data; stored by reference, not copied
* @param system the System; stored by reference, not copied
*/
CudaCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcHarmonicBondForceKernel(name, platform), data(data), system(system) {
}
~CudaCalcHarmonicBondForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the HarmonicBondForce this kernel will be used for
*/
void initialize(const System& system, const HarmonicBondForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the HarmonicBondForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force);
private:
// Nested helper type; only declared here, defined elsewhere.
class ForceInfo;
// Not set by the constructor; presumably the bond count recorded by initialize() - TODO confirm.
int numBonds;
// Platform data supplied at construction (held by reference).
CudaPlatform::PlatformData& data;
// The System supplied at construction (held by reference).
System& system;
};
/**
* This kernel is invoked by CustomBondForce to calculate the forces acting on the system and the energy of the system.
*/
class CudaCalcCustomBondForceKernel : public CalcCustomBondForceKernel {
public:
/**
* Create the kernel.
*
* @param name the name forwarded to the CalcCustomBondForceKernel base class
* @param platform the Platform forwarded to the base class
* @param data the CUDA platform data; stored by reference, not copied
* @param system the System; stored by reference, not copied
*/
CudaCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomBondForceKernel(name, platform),
data(data), system(system) {
}
~CudaCalcCustomBondForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the CustomBondForce this kernel will be used for
*/
void initialize(const System& system, const CustomBondForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the CustomBondForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const CustomBondForce& force);
private:
// Nested helper type; only declared here, defined elsewhere.
class ForceInfo;
// Private helper; presumably refreshes globalParamValues from the context - TODO confirm against the implementation.
void updateGlobalParams(ContextImpl& context);
// Not set by the constructor; presumably the bond count recorded by initialize() - TODO confirm.
int numBonds;
// Platform data supplied at construction (held by reference).
CudaPlatform::PlatformData& data;
// Names of the force's global parameters and their values stored as float;
// presumably kept index-aligned - TODO confirm.
std::vector<std::string> globalParamNames;
std::vector<float> globalParamValues;
// The System supplied at construction (held by reference).
System& system;
};
/**
* This kernel is invoked by HarmonicAngleForce to calculate the forces acting on the system and the energy of the system.
*/
class CudaCalcHarmonicAngleForceKernel : public CalcHarmonicAngleForceKernel {
public:
/**
* Create the kernel.
*
* @param name the name forwarded to the CalcHarmonicAngleForceKernel base class
* @param platform the Platform forwarded to the base class
* @param data the CUDA platform data; stored by reference, not copied
* @param system the System; stored by reference, not copied
*/
CudaCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcHarmonicAngleForceKernel(name, platform), data(data), system(system) {
}
~CudaCalcHarmonicAngleForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the HarmonicAngleForce this kernel will be used for
*/
void initialize(const System& system, const HarmonicAngleForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the HarmonicAngleForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force);
private:
// Nested helper type; only declared here, defined elsewhere.
class ForceInfo;
// Not set by the constructor; presumably the angle count recorded by initialize() - TODO confirm.
int numAngles;
// Platform data supplied at construction (held by reference).
CudaPlatform::PlatformData& data;
// The System supplied at construction (held by reference).
System& system;
};
/**
* This kernel is invoked by CustomAngleForce to calculate the forces acting on the system and the energy of the system.
*/
class CudaCalcCustomAngleForceKernel : public CalcCustomAngleForceKernel {
public:
/**
* Create the kernel.
*
* @param name the name forwarded to the CalcCustomAngleForceKernel base class
* @param platform the Platform forwarded to the base class
* @param data the CUDA platform data; stored by reference, not copied
* @param system the System; stored by reference, not copied
*/
CudaCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomAngleForceKernel(name, platform),
data(data), system(system) {
}
~CudaCalcCustomAngleForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the CustomAngleForce this kernel will be used for
*/
void initialize(const System& system, const CustomAngleForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the CustomAngleForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const CustomAngleForce& force);
private:
// Nested helper type; only declared here, defined elsewhere.
class ForceInfo;
// Private helper; presumably refreshes globalParamValues from the context - TODO confirm against the implementation.
void updateGlobalParams(ContextImpl& context);
// Not set by the constructor; presumably the angle count recorded by initialize() - TODO confirm.
int numAngles;
// Platform data supplied at construction (held by reference).
CudaPlatform::PlatformData& data;
// Names of the force's global parameters and their values stored as float;
// presumably kept index-aligned - TODO confirm.
std::vector<std::string> globalParamNames;
std::vector<float> globalParamValues;
// The System supplied at construction (held by reference).
System& system;
};
/**
* This kernel is invoked by PeriodicTorsionForce to calculate the forces acting on the system and the energy of the system.
*/
class CudaCalcPeriodicTorsionForceKernel : public CalcPeriodicTorsionForceKernel {
public:
/**
* Create the kernel.
*
* @param name the name forwarded to the CalcPeriodicTorsionForceKernel base class
* @param platform the Platform forwarded to the base class
* @param data the CUDA platform data; stored by reference, not copied
* @param system the System; stored by reference, not copied
*/
CudaCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcPeriodicTorsionForceKernel(name, platform), data(data), system(system) {
}
~CudaCalcPeriodicTorsionForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the PeriodicTorsionForce this kernel will be used for
*/
void initialize(const System& system, const PeriodicTorsionForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the PeriodicTorsionForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force);
private:
// Nested helper type; only declared here, defined elsewhere.
class ForceInfo;
// Not set by the constructor; presumably the torsion count recorded by initialize() - TODO confirm.
int numTorsions;
// Platform data supplied at construction (held by reference).
CudaPlatform::PlatformData& data;
// The System supplied at construction (held by reference).
System& system;
};
/**
* This kernel is invoked by RBTorsionForce to calculate the forces acting on the system and the energy of the system.
*/
class CudaCalcRBTorsionForceKernel : public CalcRBTorsionForceKernel {
public:
/**
* Create the kernel.
*
* @param name the name forwarded to the CalcRBTorsionForceKernel base class
* @param platform the Platform forwarded to the base class
* @param data the CUDA platform data; stored by reference, not copied
* @param system the System; stored by reference, not copied
*/
CudaCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcRBTorsionForceKernel(name, platform), data(data), system(system) {
}
~CudaCalcRBTorsionForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the RBTorsionForce this kernel will be used for
*/
void initialize(const System& system, const RBTorsionForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the RBTorsionForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const RBTorsionForce& force);
private:
// Nested helper type; only declared here, defined elsewhere.
class ForceInfo;
// Not set by the constructor; presumably the torsion count recorded by initialize() - TODO confirm.
int numTorsions;
// Platform data supplied at construction (held by reference).
CudaPlatform::PlatformData& data;
// The System supplied at construction (held by reference).
System& system;
};
/**
* This kernel is invoked by CMAPTorsionForce to calculate the forces acting on the system and the energy of the system.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header. Unlike the neighbouring torsion kernels, this class declares
* no copyParametersToContext().
*/
class CudaCalcCMAPTorsionForceKernel : public CalcCMAPTorsionForceKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to CalcCMAPTorsionForceKernel, the
* platform data and System are stored by reference, and all four CUDAStream pointers start
* out as NULL. numTorsions is not given a value here.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
* @param system the System this kernel keeps a reference to
*/
CudaCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) :
CalcCMAPTorsionForceKernel(name, platform), data(data), system(system), coefficients(NULL), mapPositions(NULL),
torsionIndices(NULL), torsionMaps(NULL) {
}
// Destructor; defined outside this header.
// NOTE(review): presumably releases the CUDAStream buffers below - confirm in the implementation.
~CudaCalcCMAPTorsionForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the CMAPTorsionForce this kernel will be used for
*/
void initialize(const System& system, const CMAPTorsionForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
private:
// Nested helper type; only forward-declared in this header.
class ForceInfo;
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// System passed to the constructor (held by reference).
System& system;
// NOTE(review): presumably the torsion count recorded by initialize() - confirm.
int numTorsions;
// Device-side buffers, NULL until something outside this header allocates them.
// NOTE(review): contents inferred from the names only (map coefficients, per-map positions,
// per-torsion particle indices, per-torsion map index) - confirm against initialize().
CUDAStream<float4>* coefficients;
CUDAStream<int2>* mapPositions;
CUDAStream<int4>* torsionIndices;
CUDAStream<int>* torsionMaps;
};
/**
* This kernel is invoked by CustomTorsionForce to calculate the forces acting on the system and the energy of the system.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header.
*/
class CudaCalcCustomTorsionForceKernel : public CalcCustomTorsionForceKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to CalcCustomTorsionForceKernel; the
* platform data and System are stored by reference. numTorsions is not given a value here and
* the two global-parameter vectors start out empty.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
* @param system the System this kernel keeps a reference to
*/
CudaCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomTorsionForceKernel(name, platform),
data(data), system(system) {
}
// Destructor; defined outside this header.
~CudaCalcCustomTorsionForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the CustomTorsionForce this kernel will be used for
*/
void initialize(const System& system, const CustomTorsionForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the CustomTorsionForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force);
private:
// Nested helper type; only forward-declared in this header.
class ForceInfo;
// Private helper, defined outside this header.
// NOTE(review): presumably refreshes globalParamValues from the context - confirm.
void updateGlobalParams(ContextImpl& context);
// NOTE(review): presumably the torsion count recorded by initialize() - confirm.
int numTorsions;
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// NOTE(review): presumably parallel arrays (name i <-> value i) of the force's global parameters - confirm.
std::vector<std::string> globalParamNames;
std::vector<float> globalParamValues;
// System passed to the constructor (held by reference).
System& system;
};
/**
* This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header.
*/
class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to CalcNonbondedForceKernel; the
* platform data and System are stored by reference. numParticles is not given a value here.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
* @param system the System this kernel keeps a reference to
*/
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcNonbondedForceKernel(name, platform), data(data), system(system) {
}
// Destructor; defined outside this header.
~CudaCalcNonbondedForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the NonbondedForce this kernel will be used for
*/
void initialize(const System& system, const NonbondedForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @param includeDirect true if direct space interactions should be included
* @param includeReciprocal true if reciprocal space interactions should be included
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the NonbondedForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
private:
// Nested helper type; only forward-declared in this header.
class ForceInfo;
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// NOTE(review): presumably the particle count recorded by initialize() - confirm.
int numParticles;
// System passed to the constructor (held by reference).
System& system;
};
/**
* This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header.
*/
class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to CalcCustomNonbondedForceKernel; the
* platform data and System are stored by reference. numParticles is not given a value here and
* the two global-parameter vectors start out empty.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
* @param system the System this kernel keeps a reference to
*/
CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomNonbondedForceKernel(name, platform), data(data), system(system) {
}
// Destructor; defined outside this header.
~CudaCalcCustomNonbondedForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the CustomNonbondedForce this kernel will be used for
*/
void initialize(const System& system, const CustomNonbondedForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the CustomNonbondedForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force);
private:
// Nested helper type; only forward-declared in this header.
class ForceInfo;
// Private helper, defined outside this header.
// NOTE(review): presumably refreshes globalParamValues from the context - confirm.
void updateGlobalParams(ContextImpl& context);
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// NOTE(review): presumably the particle count recorded by initialize() - confirm.
int numParticles;
// NOTE(review): presumably parallel arrays (name i <-> value i) of the force's global parameters - confirm.
std::vector<std::string> globalParamNames;
std::vector<float> globalParamValues;
// System passed to the constructor (held by reference).
System& system;
};
/**
* This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header. Unlike most force kernels in this file, the constructor takes
* no System argument.
*/
class CudaCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to CalcGBSAOBCForceKernel and the
* platform data is stored by reference.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
*/
CudaCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : CalcGBSAOBCForceKernel(name, platform), data(data) {
}
// Destructor; defined outside this header.
~CudaCalcGBSAOBCForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the GBSAOBCForce this kernel will be used for
*/
void initialize(const System& system, const GBSAOBCForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the GBSAOBCForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
private:
// Nested helper type; only forward-declared in this header.
class ForceInfo;
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
};
/**
* This kernel is invoked by GBVIForce to calculate the forces acting on the system.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header. This class declares no copyParametersToContext().
*/
class CudaCalcGBVIForceKernel : public CalcGBVIForceKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to CalcGBVIForceKernel and the
* platform data is stored by reference.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
*/
CudaCalcGBVIForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : CalcGBVIForceKernel(name, platform), data(data) {
}
// Destructor; defined outside this header.
~CudaCalcGBVIForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the GBVIForce this kernel will be used for
* @param scaledRadii the scaled radii (Eq. 5 of Labute paper)
*/
void initialize(const System& system, const GBVIForce& force, const std::vector<double> & scaledRadii);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
private:
// Nested helper type; only forward-declared in this header.
class ForceInfo;
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
};
/**
* This kernel is invoked by CustomExternalForce to calculate the forces acting on the system and the energy of the system.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header.
*/
class CudaCalcCustomExternalForceKernel : public CalcCustomExternalForceKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to CalcCustomExternalForceKernel; the
* platform data and System are stored by reference. numParticles is not given a value here and
* the two global-parameter vectors start out empty.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
* @param system the System this kernel keeps a reference to
*/
CudaCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, System& system) : CalcCustomExternalForceKernel(name, platform),
data(data), system(system) {
}
// Destructor; defined outside this header.
~CudaCalcCustomExternalForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the CustomExternalForce this kernel will be used for
*/
void initialize(const System& system, const CustomExternalForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the CustomExternalForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const CustomExternalForce& force);
private:
// Nested helper type; only forward-declared in this header.
class ForceInfo;
// Private helper, defined outside this header.
// NOTE(review): presumably refreshes globalParamValues from the context - confirm.
void updateGlobalParams(ContextImpl& context);
// NOTE(review): presumably the particle count recorded by initialize() - confirm.
int numParticles;
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// NOTE(review): presumably parallel arrays (name i <-> value i) of the force's global parameters - confirm.
std::vector<std::string> globalParamNames;
std::vector<float> globalParamValues;
// System passed to the constructor (held by reference).
System& system;
};
/**
* This kernel is invoked by VerletIntegrator to take one time step.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header.
*/
class CudaIntegrateVerletStepKernel : public IntegrateVerletStepKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to IntegrateVerletStepKernel and the
* platform data is stored by reference. prevStepSize is not given a value here.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
*/
CudaIntegrateVerletStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateVerletStepKernel(name, platform), data(data) {
}
// Destructor; defined outside this header.
~CudaIntegrateVerletStepKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param integrator the VerletIntegrator this kernel will be used for
*/
void initialize(const System& system, const VerletIntegrator& integrator);
/**
* Execute the kernel.
*
* @param context the context in which to execute this kernel
* @param integrator the VerletIntegrator this kernel is being used for
*/
void execute(ContextImpl& context, const VerletIntegrator& integrator);
private:
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// NOTE(review): presumably the step size seen on the previous execute(), used to detect changes - confirm.
double prevStepSize;
};
/**
* This kernel is invoked by LangevinIntegrator to take one time step.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header.
*/
class CudaIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to IntegrateLangevinStepKernel and the
* platform data is stored by reference. The prev* members are not given values here.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
*/
CudaIntegrateLangevinStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateLangevinStepKernel(name, platform), data(data) {
}
// Destructor; defined outside this header.
~CudaIntegrateLangevinStepKernel();
/**
* Initialize the kernel, setting up the particle masses.
*
* @param system the System this kernel will be applied to
* @param integrator the LangevinIntegrator this kernel will be used for
*/
void initialize(const System& system, const LangevinIntegrator& integrator);
/**
* Execute the kernel.
*
* @param context the context in which to execute this kernel
* @param integrator the LangevinIntegrator this kernel is being used for
*/
void execute(ContextImpl& context, const LangevinIntegrator& integrator);
private:
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// NOTE(review): presumably the integrator settings seen on the previous execute(), used to detect changes - confirm.
double prevTemp, prevFriction, prevStepSize;
};
/**
* This kernel is invoked by BrownianIntegrator to take one time step.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header.
*/
class CudaIntegrateBrownianStepKernel : public IntegrateBrownianStepKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to IntegrateBrownianStepKernel and the
* platform data is stored by reference. The prev* members are not given values here.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
*/
CudaIntegrateBrownianStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateBrownianStepKernel(name, platform), data(data) {
}
// Destructor; defined outside this header.
~CudaIntegrateBrownianStepKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param integrator the BrownianIntegrator this kernel will be used for
*/
void initialize(const System& system, const BrownianIntegrator& integrator);
/**
* Execute the kernel.
*
* @param context the context in which to execute this kernel
* @param integrator the BrownianIntegrator this kernel is being used for
*/
void execute(ContextImpl& context, const BrownianIntegrator& integrator);
private:
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// NOTE(review): presumably the integrator settings seen on the previous execute(), used to detect changes - confirm.
double prevTemp, prevFriction, prevStepSize;
};
/**
* This kernel is invoked by VariableVerletIntegrator to take one time step.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header.
*/
class CudaIntegrateVariableVerletStepKernel : public IntegrateVariableVerletStepKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to IntegrateVariableVerletStepKernel
* and the platform data is stored by reference. prevErrorTol is not given a value here.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
*/
CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateVariableVerletStepKernel(name, platform), data(data) {
}
// Destructor; defined outside this header.
~CudaIntegrateVariableVerletStepKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param integrator the VariableVerletIntegrator this kernel will be used for
*/
void initialize(const System& system, const VariableVerletIntegrator& integrator);
/**
* Execute the kernel.
*
* @param context the context in which to execute this kernel
* @param integrator the VariableVerletIntegrator this kernel is being used for
* @param maxTime the maximum time beyond which the simulation should not be advanced
* @return the size of the step that was taken
*/
double execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime);
private:
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// NOTE(review): presumably the error tolerance seen on the previous execute(), used to detect changes - confirm.
double prevErrorTol;
};
/**
* This kernel is invoked by VariableLangevinIntegrator to take one time step.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header.
*/
class CudaIntegrateVariableLangevinStepKernel : public IntegrateVariableLangevinStepKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to IntegrateVariableLangevinStepKernel
* and the platform data is stored by reference. The prev* members are not given values here.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
*/
CudaIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : IntegrateVariableLangevinStepKernel(name, platform), data(data) {
}
// Destructor; defined outside this header.
~CudaIntegrateVariableLangevinStepKernel();
/**
* Initialize the kernel, setting up the particle masses.
*
* @param system the System this kernel will be applied to
* @param integrator the VariableLangevinIntegrator this kernel will be used for
*/
void initialize(const System& system, const VariableLangevinIntegrator& integrator);
/**
* Execute the kernel.
*
* @param context the context in which to execute this kernel
* @param integrator the VariableLangevinIntegrator this kernel is being used for
* @param maxTime the maximum time beyond which the simulation should not be advanced
* @return the size of the step that was taken
*/
double execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime);
private:
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// NOTE(review): presumably the integrator settings seen on the previous execute(), used to detect changes - confirm.
double prevTemp, prevFriction, prevErrorTol;
};
/**
* This kernel is invoked by AndersenThermostat at the start of each time step to adjust the particle velocities.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header.
*/
class CudaApplyAndersenThermostatKernel : public ApplyAndersenThermostatKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to ApplyAndersenThermostatKernel, the
* platform data is stored by reference, and atomGroups starts out as NULL. The prev* members
* are not given values here.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
*/
CudaApplyAndersenThermostatKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : ApplyAndersenThermostatKernel(name, platform),
data(data), atomGroups(NULL) {
}
// Destructor; defined outside this header.
// NOTE(review): presumably releases atomGroups - confirm in the implementation.
~CudaApplyAndersenThermostatKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param thermostat the AndersenThermostat this kernel will be used for
*/
void initialize(const System& system, const AndersenThermostat& thermostat);
/**
* Execute the kernel.
*
* @param context the context in which to execute this kernel
*/
void execute(ContextImpl& context);
private:
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// NOTE(review): presumably the thermostat/integrator settings seen on the previous execute() - confirm.
double prevTemp, prevFrequency, prevStepSize;
// Device-side buffer, NULL until something outside this header allocates it.
// NOTE(review): presumably one group index per atom - confirm against initialize().
CUDAStream<int>* atomGroups;
};
/**
* This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header.
*/
class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to ApplyMonteCarloBarostatKernel, the
* platform data is stored by reference, hasInitializedMolecules starts out false and both
* CUDAStream pointers start out as NULL. numMolecules is not given a value here.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
*/
CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : ApplyMonteCarloBarostatKernel(name, platform), data(data),
hasInitializedMolecules(false), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
}
// Destructor; defined outside this header.
// NOTE(review): presumably releases the two CUDAStream buffers - confirm in the implementation.
~CudaApplyMonteCarloBarostatKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param barostat the MonteCarloBarostat this kernel will be used for
*/
void initialize(const System& system, const MonteCarloBarostat& barostat);
/**
* Attempt a Monte Carlo step, scaling particle positions (or cluster centers) by a specified value.
* This is called BEFORE the periodic box size is modified. It should begin by translating each particle
* or cluster into the first periodic box, so that coordinates will still be correct after the box size
* is changed.
*
* @param context the context in which to execute this kernel
* @param scale the scale factor by which to multiply particle positions
*/
void scaleCoordinates(ContextImpl& context, double scale);
/**
* Reject the most recent Monte Carlo step, restoring the particle positions to where they were before
* scaleCoordinates() was last called.
*
* @param context the context in which to execute this kernel
*/
void restoreCoordinates(ContextImpl& context);
private:
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// False after construction.
// NOTE(review): presumably flipped once the molecule buffers below have been built - confirm.
bool hasInitializedMolecules;
// NOTE(review): presumably the number of molecules described by the buffers below - confirm.
int numMolecules;
// Device-side buffers, NULL until something outside this header allocates them.
// NOTE(review): layout inferred from the names only (flattened atom list plus per-molecule start offsets) - confirm.
CUDAStream<int>* moleculeAtoms;
CUDAStream<int>* moleculeStartIndex;
};
/**
* This kernel is invoked to calculate the kinetic energy of the system.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header. No destructor is declared.
*/
class CudaCalcKineticEnergyKernel : public CalcKineticEnergyKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to CalcKineticEnergyKernel and the
* platform data is stored by reference. The masses vector starts out empty.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
*/
CudaCalcKineticEnergyKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : CalcKineticEnergyKernel(name, platform), data(data) {
}
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
*/
void initialize(const System& system);
/**
* Execute the kernel.
*
* @param context the context in which to execute this kernel
* @return the calculated kinetic energy of the system
*/
double execute(ContextImpl& context);
private:
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
// NOTE(review): presumably one mass per particle, filled in by initialize() - confirm.
std::vector<double> masses;
};
/**
* This kernel is invoked to remove center of mass motion from the system.
*
* Only the interface is declared here: apart from the inline constructor, every member function
* is defined outside this header. No destructor is declared.
*/
class CudaRemoveCMMotionKernel : public RemoveCMMotionKernel {
public:
/**
* Create the kernel. The name and platform are forwarded to RemoveCMMotionKernel and the
* platform data is stored by reference.
*
* @param name the kernel name, forwarded to the base class
* @param platform the Platform, forwarded to the base class
* @param data the CUDA platform data this kernel keeps a reference to
*/
CudaRemoveCMMotionKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data) : RemoveCMMotionKernel(name, platform), data(data) {
}
/**
* Initialize the kernel, setting up the particle masses.
*
* @param system the System this kernel will be applied to
* @param force the CMMotionRemover this kernel will be used for
*/
void initialize(const System& system, const CMMotionRemover& force);
/**
* Execute the kernel.
*
* @param context the context in which to execute this kernel
*/
void execute(ContextImpl& context);
private:
// Platform data passed to the constructor (held by reference).
CudaPlatform::PlatformData& data;
};
} // namespace OpenMM
#endif /*OPENMM_CUDAKERNELS_H_*/
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaPlatform.h"
#include "CudaKernelFactory.h"
#include "CudaKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "kernels/gputypes.h"
#include "openmm/Context.h"
#include "openmm/OpenMMException.h"
#include "openmm/System.h"
#include <sstream>
using namespace OpenMM;
using std::map;
using std::string;
using std::stringstream;
/**
 * C-linkage entry point exported from the CUDA platform library.
 *
 * Registers a newly allocated CudaPlatform with Platform::registerPlatform(), but only
 * when gpuIsAvailable() reports success; otherwise nothing is registered.
 */
extern "C" OPENMMCUDA_EXPORT void registerPlatforms() {
    // Without a usable GPU there is nothing to offer, so leave the registry untouched.
    if (!gpuIsAvailable())
        return;
    Platform::registerPlatform(new CudaPlatform());
}
/**
 * Build the CUDA platform: register one shared CudaKernelFactory for every kernel
 * name this platform implements, then publish the platform-specific properties
 * (CudaDevice, CudaUseBlockingSync) together with their default values.
 */
CudaPlatform::CudaPlatform() {
    // A single factory instance serves every kernel listed below.
    CudaKernelFactory* kernelFactory = new CudaKernelFactory();

    // Kernel names, in the order in which they are registered.
    const string supportedKernels[] = {
        CalcForcesAndEnergyKernel::Name(),
        UpdateStateDataKernel::Name(),
        ApplyConstraintsKernel::Name(),
        VirtualSitesKernel::Name(),
        CalcHarmonicBondForceKernel::Name(),
        CalcCustomBondForceKernel::Name(),
        CalcHarmonicAngleForceKernel::Name(),
        CalcCustomAngleForceKernel::Name(),
        CalcPeriodicTorsionForceKernel::Name(),
        CalcRBTorsionForceKernel::Name(),
        CalcCMAPTorsionForceKernel::Name(),
        CalcCustomTorsionForceKernel::Name(),
        CalcNonbondedForceKernel::Name(),
        CalcCustomNonbondedForceKernel::Name(),
        CalcGBSAOBCForceKernel::Name(),
        CalcGBVIForceKernel::Name(),
        CalcCustomExternalForceKernel::Name(),
        IntegrateVerletStepKernel::Name(),
        IntegrateLangevinStepKernel::Name(),
        IntegrateBrownianStepKernel::Name(),
        IntegrateVariableVerletStepKernel::Name(),
        IntegrateVariableLangevinStepKernel::Name(),
        ApplyAndersenThermostatKernel::Name(),
        ApplyMonteCarloBarostatKernel::Name(),
        CalcKineticEnergyKernel::Name(),
        RemoveCMMotionKernel::Name()
    };
    const int numSupportedKernels = sizeof(supportedKernels) / sizeof(supportedKernels[0]);
    for (int k = 0; k < numSupportedKernels; k++)
        registerKernelFactory(supportedKernels[k], kernelFactory);

    // Advertise the properties a Context may specify, then give each a default.
    const string& deviceProperty = CudaDevice();
    const string& blockingSyncProperty = CudaUseBlockingSync();
    platformProperties.push_back(deviceProperty);
    platformProperties.push_back(blockingSyncProperty);
    setPropertyDefaultValue(deviceProperty, "0");
    setPropertyDefaultValue(blockingSyncProperty, "true");
}
/**
* Report whether this platform supports double precision arithmetic.
*
* @return always false
*/
bool CudaPlatform::supportsDoublePrecision() const {
return false;
}
/**
 * Look up the value of a platform property for one Context.
 *
 * The per-context PlatformData is consulted first; any property it does not
 * hold is delegated to Platform::getPropertyValue().
 *
 * @param context   the Context whose property is requested
 * @param property  the name of the property
 * @return the value recorded for this context, or whatever the base class returns
 */
const string& CudaPlatform::getPropertyValue(const Context& context, const string& property) const {
    const PlatformData* platformData = reinterpret_cast<const PlatformData*>(getContextImpl(context).getPlatformData());
    map<string, string>::const_iterator entry = platformData->propertyValues.find(property);
    if (entry == platformData->propertyValues.end())
        return Platform::getPropertyValue(context, property);
    return entry->second;
}
/**
* Accept a request to change a property of an existing Context and ignore it.
*
* The body is empty: the value is neither validated nor stored, and no error is reported.
* NOTE(review): this looks deliberate (the properties are consumed once in contextCreated()),
* but confirm that silently dropping the request is the intended contract.
*
* @param context the Context the request refers to (unused)
* @param property the name of the property (unused)
* @param value the requested value (unused)
*/
void CudaPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
}
/**
 * Set up the per-context GPU state when a new Context is created on this platform.
 *
 * Systems containing virtual sites, or forces in any force group other than 0, are
 * rejected with an OpenMMException. Otherwise the device index and blocking-sync
 * flag are taken from the supplied properties (falling back to the platform defaults),
 * the GPU is initialized through gpuInit(), and the resulting PlatformData is attached
 * to the context.
 *
 * @param context     the ContextImpl that was just created
 * @param properties  property values specified for this context
 */
void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
    System& system = context.getSystem();

    // Refuse unsupported features before touching the GPU.
    const int particleCount = system.getNumParticles();
    for (int particle = 0; particle < particleCount; particle++) {
        if (system.isVirtualSite(particle))
            throw OpenMMException("CudaPlatform does not support virtual sites");
    }
    const int forceCount = system.getNumForces();
    for (int force = 0; force < forceCount; force++) {
        if (system.getForce(force).getForceGroup() != 0)
            throw OpenMMException("CudaPlatform does not support force groups");
    }

    // Device index: explicit property if given, platform default otherwise.
    map<string, string>::const_iterator deviceEntry = properties.find(CudaDevice());
    string deviceSetting;
    if (deviceEntry != properties.end())
        deviceSetting = deviceEntry->second;
    else
        deviceSetting = getPropertyDefaultValue(CudaDevice());
    unsigned int deviceIndex = 0;
    if (!deviceSetting.empty()) {
        stringstream parser(deviceSetting);
        parser >> deviceIndex;
    }

    // Blocking sync: only the exact string "true" enables it.
    map<string, string>::const_iterator blockingEntry = properties.find(CudaUseBlockingSync());
    string blockingSetting;
    if (blockingEntry != properties.end())
        blockingSetting = blockingEntry->second;
    else
        blockingSetting = getPropertyDefaultValue(CudaUseBlockingSync());
    const bool useBlockingSync = (blockingSetting == "true");

    _gpuContext* gpu = (_gpuContext*) gpuInit(particleCount, deviceIndex, useBlockingSync);
    context.setPlatformData(new PlatformData(gpu));
}
/**
 * Tear down the per-context GPU state created by contextCreated().
 *
 * The GPU context is shut down with gpuShutDown() first, then the PlatformData
 * object that referred to it is deleted.
 *
 * @param context  the ContextImpl being destroyed
 */
void CudaPlatform::contextDestroyed(ContextImpl& context) const {
    PlatformData* platformData = reinterpret_cast<PlatformData*>(context.getPlatformData());
    gpuShutDown(platformData->gpu);
    delete platformData;
}
/**
 * Create the per-context platform data around an already initialized GPU context.
 *
 * All feature flags start out false and all counters, methods and energies start
 * at zero. The device index and blocking-sync setting actually in use are read
 * back from the GPU context and recorded in propertyValues.
 *
 * @param gpu  the GPU context this data describes
 */
CudaPlatform::PlatformData::PlatformData(_gpuContext* gpu) :
        gpu(gpu),
        removeCM(false),
        nonbondedMethod(0),
        customNonbondedMethod(0),
        hasBonds(false),
        hasAngles(false),
        hasPeriodicTorsions(false),
        hasRB(false),
        hasNonbonded(false),
        hasCustomNonbonded(false),
        stepCount(0),
        computeForceCount(0),
        time(0.0),
        ewaldSelfEnergy(0.0),
        dispersionCoefficient(0.0) {
    // Property values are stored as strings, so format the device index as text.
    stringstream deviceIndexText;
    deviceIndexText << gpu->device;
    propertyValues[CudaPlatform::CudaDevice()] = deviceIndexText.str();

    if (gpu->useBlockingSync)
        propertyValues[CudaPlatform::CudaUseBlockingSync()] = "true";
    else
        propertyValues[CudaPlatform::CudaUseBlockingSync()] = "false";
}
/*
* Authored by: Chen, Shifu
*
* Email: chen@gmtk.org
*
* Website: http://www.gmtk.org/gsort
*
* The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include "vector_types.h"
#include "bbsort.h"
#include "bbsort_kernel.cu"
// Sort key of an int2 element: its y component. The x component is ignored here.
int getValue(int2 v){
return v.y;
}
// Sort key of any other element type: the element itself, returned by value.
// The non-template int2 overload is preferred by overload resolution for int2 arguments.
template <typename T>
T getValue(T v){
return v;
}
// Evaluate a CUDA runtime call once; if it does not return cudaSuccess, print the file, line
// and cudaGetErrorString() text to stderr and terminate the process with EXIT_FAILURE.
// The expansion is a bare { ... } block (not do { } while(0)) that declares a local named
// 'err', so take care when using it as the unbraced body of an if/else.
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} }
// Same check as CUDA_SAFE_CALL_NO_SYNC. Note that this expansion already ends in ';',
// so the usual "CUDA_SAFE_CALL(x);" produces an extra empty statement.
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
/**
 * Pack consecutive slices into buckets of at most BLOCK_SIZE elements (host side).
 *
 * Slices are visited in order and appended to the current bucket. As soon as the
 * bucket's running total exceeds BLOCK_SIZE, the current slice is moved to a fresh
 * bucket, except for slice 0, which stays where it is and closes its bucket at once.
 *
 * @param sliceCount           number of elements in each slice (read only)
 * @param sliceSize            number of slices
 * @param bucketOffset         out: start offset of each bucket; entry 0 is set to 0
 * @param bucketOfSlice        out: index of the bucket each slice was assigned to
 * @param bucketSizes          out: number of elements in each bucket
 * @param sliceOffsetInBucket  out: offset of each slice inside its bucket
 * @param bucketsCount         in/out: index of the first bucket to fill on entry;
 *                             one past the last bucket index used on return
 * @param step                 unused
 * @return true if at least one slice on its own holds more than BLOCK_SIZE elements
 */
bool assignSliceToBuckets(unsigned int* sliceCount,int sliceSize,unsigned int* bucketOffset,unsigned int* bucketOfSlice,unsigned int* bucketSizes,unsigned int* sliceOffsetInBucket,int& bucketsCount,float step)
{
    bool sawOversizedSlice = false;
    int runningTotal = 0;   // elements accumulated in the bucket currently being filled
    bucketOffset[0] = 0;

    for (int slice = 0; slice < sliceSize; ++slice) {
        if (sliceCount[slice] > BLOCK_SIZE)
            sawOversizedSlice = true;

        // Tentatively append this slice to the current bucket.
        runningTotal += sliceCount[slice];
        bucketOfSlice[slice] = bucketsCount;
        bucketSizes[bucketsCount] = runningTotal;
        sliceOffsetInBucket[slice] = runningTotal - sliceCount[slice];

        if (runningTotal <= BLOCK_SIZE)
            continue;   // still fits, nothing to undo

        if (slice == 0) {
            // The very first slice overflows by itself: keep it and close its bucket.
            bucketOffset[bucketsCount + 1] = bucketOffset[bucketsCount] + runningTotal;
            sliceOffsetInBucket[slice] = 0;
            runningTotal = 0;
            bucketsCount++;
            continue;
        }

        // Undo the tentative append and start the next bucket with this slice.
        bucketOfSlice[slice] = bucketsCount + 1;
        bucketSizes[bucketsCount] -= sliceCount[slice];
        sliceOffsetInBucket[slice] = 0;
        bucketOffset[bucketsCount + 1] = bucketOffset[bucketsCount] + runningTotal - sliceCount[slice];
        bucketsCount++;
        runningTotal = sliceCount[slice];
        bucketSizes[bucketsCount] = runningTotal;
    }

    // Turn the index of the last bucket into a count.
    bucketsCount++;
    return sawOversizedSlice;
}
/**
 * Reduce a device array to its minimum or maximum element and return that element's sort key.
 *
 * reduceMaxD / reduceMinD are launched repeatedly over a shrinking range (step is roughly
 * halved on every pass) until step reaches 0; the winning element is then copied from
 * dData[0] back to the host. Because the result is read out of dData itself, callers that
 * still need the original data should pass a scratch copy.
 *
 * @param dData   device pointer to the elements to reduce
 * @param size    number of elements in dData
 * @param result  receives getValue() of the reduced element, converted to float
 * @param isMax   true to compute the maximum, false to compute the minimum
 */
template <typename T>
void reduceMinMax(T* dData,int size,float& result,bool isMax)
{
    // First pass pairs element i with element i+step, so step is ceil(size/2).
    int step=(size%2==0)?(size/2):(size/2 +1);
    int blockSize=BLOCK_SIZE;
    int blockCount;
    int length=size;
    T originalResult;
    while(step > 0)
    {
        // Enough blocks to give each of the 'step' candidates one thread.
        if(step%BLOCK_SIZE==0)
            blockCount=step/BLOCK_SIZE;
        else
            blockCount=step/BLOCK_SIZE+1;
        if(isMax)
            reduceMaxD<<<blockCount,blockSize>>>(dData,step,length);
        else
            reduceMinD<<<blockCount,blockSize>>>(dData,step,length);
        length=step;
        // Halve again, rounding up, except that step==1 must drop to 0 to end the loop.
        step=(step%2==0 || step==1)?(step/2):(step/2 +1);
    }
    CUDA_SAFE_CALL(cudaMemcpy(&originalResult, dData, sizeof(T), cudaMemcpyDeviceToHost));
    // Convert the key straight to float. Going through int first (as this code used to)
    // discarded the fractional part of floating-point keys, which could make distinct
    // min/max values compare equal and produce a wrong value range for the caller.
    // For integral keys such as int2::y the result is unchanged.
    result=static_cast<float>(getValue(originalResult));
}
/**
 * Classify a device array as NEARLY_SORTED or DISORDERLY.
 *
 * calDifferenceD fills a temporary float buffer from dData, reduceSumD then sums that
 * buffer in place over a shrinking range, and the total (read from element 0) is compared
 * with one tenth of (maxValue - minValue) * size. A smaller total means NEARLY_SORTED.
 *
 * @param dData      device pointer to the elements to examine
 * @param size       number of elements in dData
 * @param maxValue   largest key in dData
 * @param minValue   smallest key in dData
 * @param listOrder  out: set to NEARLY_SORTED or DISORDERLY
 */
template <typename T>
void evaluateDisorder(T* dData,int size,float maxValue, float minValue, int& listOrder)
{
    // Scratch buffer on the device, one float per element.
    float* dDifferences;
    CUDA_SAFE_CALL(cudaMalloc((void**)&dDifferences, sizeof(float) * size));

    const int diffBlocks = ((size-1) % BLOCK_SIZE == 0) ? (size/BLOCK_SIZE) : (size/BLOCK_SIZE+1);
    calDifferenceD<<<diffBlocks,BLOCK_SIZE,(BLOCK_SIZE)*sizeof(T)>>>(dData,dDifferences,size);

    // Sum the scratch buffer in place; the range is roughly halved on every pass.
    int remaining = size;
    int step = (size%2==0) ? (size/2) : (size/2 +1);
    while (step > 0)
    {
        const int sumBlocks = (step%BLOCK_SIZE==0) ? (step/BLOCK_SIZE) : (step/BLOCK_SIZE+1);
        reduceSumD<<<sumBlocks,BLOCK_SIZE>>>(dDifferences,step,remaining);
        remaining = step;
        // Round up when halving, except that step==1 must drop to 0 to end the loop.
        const bool halvesExactly = (step%2==0) || (step==1);
        step = halvesExactly ? (step/2) : (step/2 +1);
    }

    float totalDifference = 0;
    CUDA_SAFE_CALL(cudaMemcpy(&totalDifference, dDifferences, sizeof(float), cudaMemcpyDeviceToHost));

    if (totalDifference < (maxValue - minValue) * size / 10)
        listOrder = NEARLY_SORTED;
    else
        listOrder = DISORDERLY;

    CUDA_SAFE_CALL(cudaFree(dDifferences));
}
template <typename T>
void bbSortBody(T* dData,int size,int listOrder/*,float sliceStep,int sliceSize, T* dTmpData, float minValue,float maxValue*/)
{
float minValue,maxValue;
T* dTmpData;
CUDA_SAFE_CALL(cudaMalloc((void**)&dTmpData, sizeof(T) * size));
CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
reduceMinMax(dTmpData,size,maxValue,true);
CUDA_SAFE_CALL(cudaMemcpy(dTmpData, dData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
reduceMinMax(dTmpData,size,minValue,false);
if(minValue == maxValue)
{
CUDA_SAFE_CALL(cudaFree(dTmpData));
return ;
}
if(listOrder == AUTO_EVALUATE )
{
evaluateDisorder(dData,size,maxValue,minValue,listOrder);
}
float sliceStep = (float) (50.0*((double)(maxValue-minValue)/(double)size));
int sliceSize = (int) ((maxValue-minValue)/sliceStep + 10);
int blockCount;
if(size%BLOCK_SIZE==0)blockCount=size/BLOCK_SIZE;
else blockCount=size/BLOCK_SIZE+1;
unsigned int* dSliceCounts;
unsigned int* dOffsetInSlice;
CUDA_SAFE_CALL(cudaMalloc((void**)&dOffsetInSlice, sizeof(unsigned int) * size));
CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceCounts, sizeof(unsigned int) * sliceSize));
CUDA_SAFE_CALL(cudaMemset(dSliceCounts,0, sizeof(int) * sliceSize));
if(listOrder == NEARLY_SORTED)
{
assignElementToSlicesNearlySortedD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize,blockCount);
}
else
assignElementToSlicesD<<<blockCount, BLOCK_SIZE>>>(dData,size,dSliceCounts,dOffsetInSlice,minValue,sliceStep,sliceSize);
unsigned int* hSliceCounts=new unsigned int[sliceSize];
CUDA_SAFE_CALL(cudaMemcpy(hSliceCounts, dSliceCounts, sizeof(unsigned int) * sliceSize, cudaMemcpyDeviceToHost));
int looseBucketSize=size/100;
unsigned int* hBucketOffsets=new unsigned int[looseBucketSize];
unsigned int* hBucketSizes=new unsigned int[looseBucketSize];
unsigned int* hBucketOfSlices=new unsigned int[sliceSize];
unsigned int* hSliceOffsetInBucket=new unsigned int[sliceSize];
int bucketsCount=0;
memset(hBucketSizes,0,sizeof(int) * looseBucketSize);
memset(hSliceOffsetInBucket,0,sizeof(unsigned int) * sliceSize);
bool overflow;
overflow = assignSliceToBuckets(hSliceCounts,sliceSize,hBucketOffsets,hBucketOfSlices,hBucketSizes,hSliceOffsetInBucket,bucketsCount,sliceStep);
unsigned int* dBucketOffsets;
unsigned int* dBucketSizes;
unsigned int* dBucketOfSlices;
unsigned int* dSliceOffsetInBucket;
CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOfSlices, sizeof(unsigned int) * sliceSize));
CUDA_SAFE_CALL(cudaMalloc((void**)&dSliceOffsetInBucket, sizeof(unsigned int) * sliceSize));
CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketOffsets, sizeof(unsigned int) * bucketsCount));
CUDA_SAFE_CALL(cudaMalloc((void**)&dBucketSizes, sizeof(unsigned int) * bucketsCount));
CUDA_SAFE_CALL(cudaMemcpy(dBucketOfSlices, hBucketOfSlices, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(dSliceOffsetInBucket, hSliceOffsetInBucket, sizeof(unsigned int) * sliceSize, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(dBucketOffsets, hBucketOffsets, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(dBucketSizes, hBucketSizes, sizeof(unsigned int) * bucketsCount, cudaMemcpyHostToDevice));
cudaBindTexture(0,tBucketOffsets,dBucketOffsets);
cudaBindTexture(0,tBucketSizes,dBucketSizes);
cudaBindTexture(0,tBucketOfSlices,dBucketOfSlices);
cudaBindTexture(0,tSliceOffsetInBucket,dSliceOffsetInBucket);
assignElementToBucketD<<<blockCount, BLOCK_SIZE>>>(dData,dTmpData,size,dOffsetInSlice,minValue,sliceStep);
CUDA_SAFE_CALL( cudaThreadSynchronize() );
bitonicSortD<<<bucketsCount, BLOCK_SIZE, sizeof(T) * BLOCK_SIZE>>>(dTmpData);
CUDA_SAFE_CALL(cudaMemcpy(dData, dTmpData, sizeof(T) * size, cudaMemcpyDeviceToDevice));
if(overflow){
for(int i=0;i<bucketsCount;i++)
{
if(hBucketSizes[i] > BLOCK_SIZE)
{
bbSort(dData + hBucketOffsets[i],hBucketSizes[i],listOrder);
}
}
}
delete hBucketOffsets;
delete hBucketOfSlices;
delete hSliceCounts;
delete hBucketSizes;
delete hSliceOffsetInBucket;
CUDA_SAFE_CALL(cudaFree(dOffsetInSlice));
CUDA_SAFE_CALL(cudaFree(dSliceCounts));
CUDA_SAFE_CALL(cudaFree(dTmpData));
cudaUnbindTexture( tBucketSizes );
CUDA_SAFE_CALL(cudaFree(dBucketSizes));
cudaUnbindTexture( tBucketOffsets );
CUDA_SAFE_CALL(cudaFree(dBucketOffsets));
cudaUnbindTexture( tBucketOfSlices );
CUDA_SAFE_CALL(cudaFree(dBucketOfSlices));
cudaUnbindTexture( tSliceOffsetInBucket );
CUDA_SAFE_CALL(cudaFree(dSliceOffsetInBucket));
}
/************************************************************************************
Uncomment your desired function definition here.
Please note that only one type of bbSort() can be used in a program, because the NVCC compiler doesn't support overloading kernel functions.
float, double, int, uint, short, and ushort are supported out of the box; if you want to use bbSort() with double,
please follow the readme.txt.
Also note that you need compute capability 1.3 (use arch=sm_13 in your compile command) to sort doubles.
*************************************************************************************/
// Explicit specialization of bbSort() for int2 elements.
// It forwards all three arguments unchanged to bbSortBody().
//   dData     - array of int2 elements to sort (passed to CUDA calls as a device pointer by bbSortBody)
//   size      - number of elements in dData
//   listOrder - one of DISORDERLY / NEARLY_SORTED / AUTO_EVALUATE
template<>
void OPENMMCUDA_EXPORT bbSort(int2* dData,int size,int listOrder)
{
bbSortBody(dData,size,listOrder);
}
//void bbSort(float* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
//void bbSort(int* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
//
//void bbSort(unsigned int* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
//
//void bbSort(double* dData,int size,int listOrder)
//{
//
// bbSortBody(dData,size,listOrder);
//}
/*
* Authored by: Chen, Shifu
*
* Email: chen@gmtk.org
*
* Website: http://www.gmtk.org/gsort
*
* The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
*
*/
#ifndef _BBSORT_H_
#define _BBSORT_H_
#include "windowsExportCuda.h"
// Threads per block used for the sort kernel launches; it is also the number of
// elements one bitonicSortD block sorts in shared memory.
#define BLOCK_SIZE 512
// Values accepted for the listOrder argument of bbSort():
//   DISORDERLY    - use the generic slice-assignment kernel
//   NEARLY_SORTED - use the slice-assignment kernel with the strided element-to-thread mapping
//   AUTO_EVALUATE - let the implementation call evaluateDisorder() to choose
#define DISORDERLY 0
#define NEARLY_SORTED 1
#define AUTO_EVALUATE 2
// Sorts `number` elements of dData in place.
// NOTE(review): only the int2 specialization is defined in the accompanying .cu file;
// other element types are commented out there.
template <typename T>
void OPENMMCUDA_EXPORT bbSort(T* dData,int number,int listOrder=AUTO_EVALUATE);
#endif // _BBSORT_H_
/*
* Authored by: Chen, Shifu
*
* Email: chen@gmtk.org
*
* Website: http://www.gmtk.org/gsort
*
* The code is distributed under BSD license, you are allowed to use, modify or sell this code, but a statement is required if you used this code any where.
*
*/
#ifndef _BBSORT_KERNEL_H_
#define _BBSORT_KERNEL_H_
#include "bbsort.h"
#include "math_constants.h"
// 1D texture references used by the sort kernels to read per-bucket and per-slice tables.
// The host sort routine binds each to a device array with cudaBindTexture before
// launching assignElementToBucketD / bitonicSortD, and unbinds it afterwards.
// Number of elements in each bucket, indexed by bucket.
texture<unsigned int, 1, cudaReadModeElementType> tBucketSizes;
// Start offset of each bucket in the output array, indexed by bucket.
texture<unsigned int, 1, cudaReadModeElementType> tBucketOffsets;
// Bucket that each slice belongs to, indexed by slice.
texture<unsigned int, 1, cudaReadModeElementType> tBucketOfSlices;
// Offset of each slice inside its bucket, indexed by slice.
texture<unsigned int, 1, cudaReadModeElementType> tSliceOffsetInBucket;
// Sort key of an int2 element: the .y component (the .x component is carried along as payload).
static __device__ int dGetValue(int2 v){
return v.y;
}
// Sort key of a scalar element: the element itself.
template <typename T>
static __device__ T dGetValue(T v){
return v;
}
// Overwrites v with the padding element used to fill unused slots in bitonicSortD.
// The key (.y) is set to 0x4fffffff.
// NOTE(review): padding only sorts to the end if real keys are smaller than 0x4fffffff - confirm with callers.
static __device__ void dPad(int2& v){
v.x=0x3fffffff;
v.y=0x4fffffff;
}
// Overwrites v with the padding value 0x7fffffff (largest positive 32-bit signed integer).
// NOTE(review): for element types other than int this is not necessarily the
// largest representable value - confirm before enabling other bbSort() types.
template <typename T>
static __device__ void dPad(T & v){
v=0x7fffffff;
}
// One step of a max-reduction: each thread whose partner (index + step) lies inside
// the array keeps, in its own slot, whichever of the two elements has the larger key.
//   dData  - array being reduced in place
//   step   - distance between an element and its partner
//   length - number of elements taking part in the reduction
template <typename T>
__global__ static void reduceMaxD(T * dData,int step,int length)
{
    const int self = blockIdx.x * blockDim.x + threadIdx.x;
    const int partner = self + step;
    if (partner >= length)
        return;

    T kept = dData[self];
    const T candidate = dData[partner];
    // On a tie the partner's element wins, exactly as the ternary form would choose.
    if (!(dGetValue(kept) > dGetValue(candidate)))
        kept = candidate;
    dData[self] = kept;
}
// One step of a min-reduction: each thread whose partner (index + step) lies inside
// the array keeps, in its own slot, whichever of the two elements has the smaller key.
//   dData  - array being reduced in place
//   step   - distance between an element and its partner
//   length - number of elements taking part in the reduction
template <typename T>
__global__ static void reduceMinD(T * dData,int step,int length)
{
    const int self = blockIdx.x * blockDim.x + threadIdx.x;
    const int partner = self + step;
    if (partner >= length)
        return;

    T kept = dData[self];
    const T candidate = dData[partner];
    // On a tie the partner's element wins, exactly as the ternary form would choose.
    if (!(dGetValue(kept) < dGetValue(candidate)))
        kept = candidate;
    dData[self] = kept;
}
// One step of a sum-reduction over floats: every thread whose partner (index + step)
// lies inside the array adds the partner's value into its own slot.
__global__ static void reduceSumD(float * dDiffData,int step,int length)
{
    const int target = blockIdx.x * blockDim.x + threadIdx.x;
    const int source = target + step;
    if (source < length)
    {
        dDiffData[target] += dDiffData[source];
    }
}
// Writes, for every element, the absolute difference between its key and the key of
// the next element in the same thread block; the last element of each block and the
// last element of the array get 0.
//   dData     - input elements
//   dDiffData - output, one float per input element
//   size      - number of elements
// Requires blockDim.x * sizeof(T) bytes of dynamic shared memory.
//
// Fixes over the previous version:
//  * out-of-range threads no longer return before __syncthreads(), so every thread
//    of the block reaches the barrier;
//  * the last valid element no longer reads sData[tid+1], a slot no thread had written
//    when the final block was only partially filled.
template <typename T>
__global__ static void calDifferenceD(T * dData,float * dDiffData,int size)
{
    extern __shared__ T sData[];
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int tid = threadIdx.x;
    const bool inRange = index < size;

    if (inRange)
        sData[tid] = dData[index];
    __syncthreads();
    if (!inRange)
        return ;

    // A neighbour exists only inside this block and inside the array.
    if (tid < blockDim.x - 1 && index + 1 < size)
        dDiffData[index] = abs(dGetValue(sData[tid+1]) - dGetValue(sData[tid]));
    else
        dDiffData[index] = 0;
}
// Exchanges the contents of a and b.
template <typename T>
__device__ inline void dSwap(T & a, T & b)
{
    T saved = a;
    a = b;
    b = saved;
}
// Sorts one bucket per thread block with an in-shared-memory bitonic network.
// Block `bid` handles the bucket whose size and start offset are read from the
// tBucketSizes / tBucketOffsets textures. The network is sized by BLOCK_SIZE, so the
// kernel expects BLOCK_SIZE threads and BLOCK_SIZE * sizeof(T) bytes of dynamic shared
// memory (this is how the visible launch site configures it).
// NOTE(review): when a bucket holds more than BLOCK_SIZE elements only its first
// BLOCK_SIZE elements are loaded here; the host code re-sorts such buckets afterwards.
template <typename T>
__global__ static void bitonicSortD(T * datas)
{
extern __shared__ T shared[];
const unsigned int bid=blockIdx.x;
const unsigned int tid = threadIdx.x;
__shared__ unsigned int count;
__shared__ unsigned int offset;
// Thread 0 fetches the bucket description once for the whole block.
if(tid == 0)
{
count=tex1Dfetch(tBucketSizes,bid);
offset=tex1Dfetch(tBucketOffsets,bid);
}
__syncthreads();
// Load the bucket; slots beyond the bucket's size are filled with the padding element.
if(tid < count)
shared[tid] = datas[tid+offset];
else
{
dPad(shared[tid]);
}
__syncthreads();
// Bitonic network: k is the size of the sequences being merged, j the compare distance.
for (unsigned int k = 2; k <= BLOCK_SIZE; k *= 2)
{
for (unsigned int j = k / 2; j>0; j /= 2)
{
unsigned int ixj = tid ^ j;
// Only the lower-indexed thread of each pair performs the compare-exchange.
if (ixj > tid)
{
// (tid & k) selects the direction: ascending when 0, descending otherwise.
if ((tid & k) == 0)
{
if (dGetValue(shared[tid]) > dGetValue(shared[ixj]))
{
dSwap(shared[tid], shared[ixj]);
}
}
else
{
if (dGetValue(shared[tid]) < dGetValue(shared[ixj]))
{
dSwap(shared[tid], shared[ixj]);
}
}
}
__syncthreads();
}
}
// Write back only the real elements; padding slots are discarded.
if(tid < count)
datas[tid+offset] = shared[tid];
}
// Assigns every element to a value slice and records its arrival order inside that slice.
// One thread per element, consecutive threads handling consecutive elements.
//   dDatas         - input elements
//   number         - number of elements
//   dSliceCounts   - per-slice counters, incremented atomically
//   dOffsetInSlice - output: counter value of the element's slice before the increment
//   minValue, step - slice index is (key - minValue) / step, converted to unsigned
//   sliceSize      - number of slices (not used by the kernel body)
template <typename T>
__global__ void assignElementToSlicesD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize)
{
    const unsigned int element = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
    if (element > number-1)
        return ;

    const unsigned int slice = ((dGetValue(dDatas[element]) - minValue) / step);
    dOffsetInSlice[element] = atomicInc(dSliceCounts + slice, 0xFFFFFFF);
}
// Variant of assignElementToSlicesD for nearly sorted input. The only difference is the
// element-to-thread mapping: thread t of block b handles element b + blockCount * t,
// so the threads of one block touch elements that are blockCount apart.
//   blockCount - number of blocks in the launch grid (stride between a block's elements)
// The remaining parameters have the same meaning as in assignElementToSlicesD.
template <typename T>
__global__ void assignElementToSlicesNearlySortedD(T* dDatas,int number,unsigned int* dSliceCounts,unsigned int* dOffsetInSlice,float minValue,float step,int sliceSize,int blockCount)
{
    const unsigned int element = blockIdx.x + blockCount * threadIdx.x;
    if (element > number-1)
        return ;

    const unsigned int slice = ((dGetValue(dDatas[element]) - minValue) / step);
    dOffsetInSlice[element] = atomicInc(dSliceCounts + slice, 0xFFFFFFF);
}
// Scatters every element to its final bucket position in dNewDatas.
// The destination is: start of the element's bucket + offset of its slice inside that
// bucket + the element's recorded position inside the slice. The three tables are read
// through the tBucketOfSlices, tBucketOffsets and tSliceOffsetInBucket textures.
template <typename T>
__global__ void assignElementToBucketD(T* dDatas,T* dNewDatas,int number,unsigned int* dOffsetInSlice,float minValue,float step)
{
    const unsigned int element = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
    if (element > number-1)
        return ;

    const unsigned int slice = ((dGetValue(dDatas[element]) - minValue) / step);
    const unsigned int bucket = tex1Dfetch(tBucketOfSlices, slice);
    const unsigned int destination = tex1Dfetch(tBucketOffsets, bucket) + tex1Dfetch(tSliceOffsetInBucket, slice) + dOffsetInSlice[element];
    dNewDatas[destination] = dDatas[element];
}
#endif // _BBSORT_KERNEL_H_
/* Code for CUDA stream compaction. Roughly based on:
Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
High Performance Graphics 2009.
Notes:
- paper recommends 128 threads/block, so this is hard coded.
- I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
complicated and performs poorly on current hardware
- I only implement the scattered- and staged-write variants of phase III, as they have reasonable
performance across most of the tested workloads in the paper. The selective variant is not
implemented.
- The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
manner. It is, however, done in a very easy to program manner, and integrated into the top of
phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
dgBlockCounts in stages.
Date: 23 Aug 2009
Author: Imran Haque (ihaque@cs.stanford.edu)
Affiliation: Stanford University
License: Public Domain
*/
#include "cudaCompact.h"
typedef unsigned int T;
// Phase 1: Count valid elements per thread block
// Hard-code 128 thd/blk
// Sums the 128 entries of `arr` and returns the total to every thread of the block.
// `arr` is modified in place (it holds partial sums afterwards).
// Must be called by all 128 threads of the block, since it contains barriers.
//
// The total is copied into a register and a final barrier is executed before
// returning. Without that barrier a caller that immediately rewrites arr[0] (as the
// move kernels do on their next loop step) could overwrite the sum while slower
// threads were still reading it.
__device__ unsigned int sumReduce128(volatile unsigned int* arr) {
    // Parallel reduce element counts
    // Assumes 128 thd/block
    if (threadIdx.x < 64) arr[threadIdx.x] += arr[threadIdx.x+64];
    __syncthreads();
    if (threadIdx.x < 32) {
        // Remaining steps stay inside the first 32 threads; the volatile qualifier
        // keeps every partial sum in memory between steps.
        arr[threadIdx.x] += arr[threadIdx.x+32];
        if (threadIdx.x < 16) arr[threadIdx.x] += arr[threadIdx.x+16];
        if (threadIdx.x < 8) arr[threadIdx.x] += arr[threadIdx.x+8];
        if (threadIdx.x < 4) arr[threadIdx.x] += arr[threadIdx.x+4];
        if (threadIdx.x < 2) arr[threadIdx.x] += arr[threadIdx.x+2];
        if (threadIdx.x < 1) arr[threadIdx.x] += arr[threadIdx.x+1];
    }
    __syncthreads();
    const unsigned int total = arr[0];
    // Make sure every thread has read the total before anyone may reuse arr.
    __syncthreads();
    return total;
}
// Phase 1 kernel: counts, per thread block, how many entries of dgValid are non-zero
// within that block's range [blockIdx.x * eltsPerBlock, (blockIdx.x + 1) * eltsPerBlock),
// clipped to len. The count is stored in dgBlockCounts[blockIdx.x].
// Hard-coded for 128 threads per block.
__global__ void countElts(unsigned int* dgBlockCounts,const unsigned int* dgValid,const size_t eltsPerBlock,const size_t len) {
    __shared__ volatile unsigned int dsCount[128];
    dsCount[threadIdx.x] = 0;

    const size_t rangeEnd = (blockIdx.x + 1) * eltsPerBlock;
    const size_t ub = (len < rangeEnd) ? len : rangeEnd;

    // Each thread tallies its own strided share of the block's range.
    for (int base = blockIdx.x * eltsPerBlock; base < rangeEnd; base += blockDim.x) {
        const unsigned int element = base + threadIdx.x;
        if (element < ub && dgValid[element]) {
            dsCount[threadIdx.x]++;
        }
    }
    __syncthreads();

    const unsigned int blockCount = sumReduce128(dsCount);
    if (threadIdx.x == 0) {
        dgBlockCounts[blockIdx.x] = blockCount;
    }
}
// Phase 2/3: Move valid elements using SIMD compaction (phase 2 is done implicitly at top of __global__ method)
// Exclusive prefix scan over 128 elements
// Assumes 128 threads
// Taken from cuda SDK "scan" sample for naive scan, with small modifications
// Exclusive prefix sum over the 128 entries of `in`.
//   in         - 128 input values
//   outAndTemp - scratch/output buffer of 2 * 128 entries; on return entries [0, 128)
//                hold the exclusive prefix sums (the loop runs 7 times for n = 128,
//                which leaves the final pass in the lower half of the buffer)
// Returns the sum of all 128 inputs. Must be called by all 128 threads of the block.
//
// The return value is computed into a register and followed by a barrier, so that a
// caller which overwrites `in` straight after the call cannot change in[127] while
// other threads are still reading it for their return value.
__device__ int exclusivePrescan128(const unsigned int* in,unsigned int* outAndTemp) {
    const int n=128;
    //TODO: this temp storage could be reduced since we write to shared memory in out anyway, and n is hardcoded
    unsigned int* temp = outAndTemp;
    int pout = 1, pin = 0;
    // load input into temp
    // This is exclusive scan, so shift right by one and set first elt to 0
    temp[pout*n + threadIdx.x] = (threadIdx.x > 0) ? in[threadIdx.x-1] : 0;
    __syncthreads();
    for (int offset = 1; offset < n; offset *= 2)
    {
        pout = 1 - pout; // swap double buffer indices
        pin = 1 - pout;
        __syncthreads();
        temp[pout*n+threadIdx.x] = temp[pin*n+threadIdx.x];
        if (threadIdx.x >= offset)
            temp[pout*n+threadIdx.x] += temp[pin*n+threadIdx.x - offset];
    }
    __syncthreads();
    // Sum of all elements = last exclusive prefix + last input.
    const int total = outAndTemp[127]+in[127];
    // Every thread must finish reading before the caller may reuse `in`.
    __syncthreads();
    return total;
}
// Compacts one batch of 128 elements: every thread whose validity flag is non-zero
// copies its element to dsCompact at the position given by the exclusive prefix sum
// of the flags. Returns the number of valid elements in the batch.
__device__ int compactSIMDPrefixSum(const T* dsData,const unsigned int* dsValid,T* dsCompact) {
    // Scratch for the scan: 2 * 128 entries, results end up in the first 128.
    __shared__ unsigned int dsLocalIndex[256];
    const int validTotal = exclusivePrescan128(dsValid, dsLocalIndex);
    if (dsValid[threadIdx.x]) {
        const unsigned int slot = dsLocalIndex[threadIdx.x];
        dsCompact[slot] = dsData[threadIdx.x];
    }
    return validTotal;
}
// Phase 2/3 kernel, staged-write variant.
// Each block first sums the counts of all preceding blocks to find where its output
// starts, then walks its input range in batches of 128: a batch is compacted into the
// shared compactBlock buffer and then copied to dgCompact with consecutive threads
// writing consecutive addresses.
//   dgData            - input elements
//   dgCompact         - output array receiving the valid elements
//   dgValid           - per-element validity flags (non-zero = keep)
//   dgBlockCounts     - per-block valid counts produced by countElts
//   eltsPerBlock      - number of input elements assigned to each block
//   len               - total number of input elements
//   dNumValidElements - receives the total number of valid elements (written by
//                       thread 0 of the last block)
// Hard-coded for 128 threads per block.
__global__ void moveValidElementsStaged(const T* dgData,T* dgCompact,const unsigned int* dgValid,const unsigned int* dgBlockCounts,size_t eltsPerBlock,size_t len,size_t* dNumValidElements) {
__shared__ T inBlock[128];
__shared__ unsigned int validBlock[128];
__shared__ T compactBlock[128];
int blockOutOffset=0;
// Sum up the blockCounts before us to find our offset
// This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
// Paper implements this as a prefix sum kernel in phase II
// May still be faster than an extra kernel invocation?
// NOTE(review): validBlock is rewritten right after sumReduce128 returns; confirm the
// helper's read of its result cannot overlap with that write.
for (int base = 0; base < blockIdx.x; base += blockDim.x) {
// Load up the count of valid elements for each block before us in batches of 128
if ((base + threadIdx.x) < blockIdx.x) {
validBlock[threadIdx.x] = dgBlockCounts[base+threadIdx.x];
} else {
validBlock[threadIdx.x] = 0;
}
__syncthreads();
// Parallel reduce these counts
// Accumulate in the final offset variable
blockOutOffset += sumReduce128(validBlock);
}
// ub = end of this block's input range, clipped to the array length.
size_t ub;
ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
// Stage one batch of flags and data; threads past the end contribute an invalid slot.
if ((base + threadIdx.x) < ub) {
validBlock[threadIdx.x] = dgValid[base+threadIdx.x];
inBlock[threadIdx.x] = dgData[base+threadIdx.x];
} else {
validBlock[threadIdx.x] = 0;
}
__syncthreads();
int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock);
// compactBlock must be completely written before it is copied out.
__syncthreads();
if (threadIdx.x < numValidBlock) {
dgCompact[blockOutOffset + threadIdx.x] = compactBlock[threadIdx.x];
}
blockOutOffset += numValidBlock;
}
// After the last block's loop, blockOutOffset equals the total number of valid elements.
if (blockIdx.x == (gridDim.x-1) && threadIdx.x == 0) {
*dNumValidElements = blockOutOffset;
}
}
// Phase 2/3 kernel, scattered-write variant.
// Same offset computation and batching as moveValidElementsStaged, but each batch is
// compacted directly into global memory: compactBlock points into dgCompact and is
// advanced by the number of valid elements after every batch.
//   dgData            - input elements
//   dgCompact         - output array receiving the valid elements
//   dgValid           - per-element validity flags (non-zero = keep)
//   dgBlockCounts     - per-block valid counts produced by countElts
//   eltsPerBlock      - number of input elements assigned to each block
//   len               - total number of input elements
//   dNumValidElements - receives the total number of valid elements (written by
//                       thread 0 of the last block)
// Hard-coded for 128 threads per block.
__global__ void moveValidElementsScattered(const T* dgData,T* dgCompact,const unsigned int* dgValid,const unsigned int* dgBlockCounts,size_t eltsPerBlock,size_t len,size_t* dNumValidElements) {
__shared__ T inBlock[128];
__shared__ unsigned int validBlock[128];
T* compactBlock=dgCompact;
size_t blockOutOffset = 0;
// Sum up the blockCounts before us to find our offset
// This is totally inefficient - lots of repeated work b/w blocks, and uneven balancing.
// Paper implements this as a prefix sum kernel in phase II
// May still be faster than an extra kernel invocation?
for (int base = 0; base < blockIdx.x; base += blockDim.x) {
// Load up the count of valid elements for each block before us in batches of 128
if ((base + threadIdx.x) < blockIdx.x) {
validBlock[threadIdx.x] = dgBlockCounts[base+threadIdx.x];
} else {
validBlock[threadIdx.x] = 0;
}
__syncthreads();
// Parallel reduce these counts
// Accumulate in the final offset variable
blockOutOffset += sumReduce128(validBlock);
}
// Point at the first output slot owned by this block.
compactBlock += blockOutOffset;
// ub = end of this block's input range, clipped to the array length.
size_t ub;
ub = (len < (blockIdx.x+1)*eltsPerBlock) ? len : ((blockIdx.x + 1)*eltsPerBlock);
for (int base = blockIdx.x * eltsPerBlock; base < (blockIdx.x+1)*eltsPerBlock; base += blockDim.x) {
// Stage one batch of flags and data; threads past the end contribute an invalid slot.
if ((base + threadIdx.x) < ub) {
validBlock[threadIdx.x] = dgValid[base+threadIdx.x];
inBlock[threadIdx.x] = dgData[base+threadIdx.x];
} else {
validBlock[threadIdx.x] = 0;
}
__syncthreads();
// NOTE(review): unlike the staged variant there is no barrier after this call, and
// validBlock is rewritten at the top of the next iteration; confirm the helper's
// return value cannot be read after that write.
int numValidBlock = compactSIMDPrefixSum(inBlock,validBlock,compactBlock);
blockOutOffset += numValidBlock;
compactBlock += numValidBlock;
}
// After the last block's loop, blockOutOffset equals the total number of valid elements.
if (blockIdx.x == (gridDim.x-1) && threadIdx.x == 0) {
*dNumValidElements = blockOutOffset;
}
}
// Prepares a compaction plan for the current CUDA device.
//   d           - plan to initialise; any previous contents are overwritten
//   stageOutput - true selects the staged-write kernel, false the scattered-write one
// The plan uses 16 thread blocks per multiprocessor and owns a device array with one
// counter per block. d.valid is set to true only if the device query and the
// allocation all succeeded; on any failure the plan is left invalid with a null
// dgBlockCounts, so compactStream() rejects it and destroyCompactionPlan() is a no-op.
void OPENMMCUDA_EXPORT planCompaction(compactionPlan& d,bool stageOutput) {
    // Start from a well-defined "invalid" state so that every early return is safe.
    d.valid = false;
    d.dgBlockCounts = 0;
    d.nThreadBlocks = 0;
    d.stageOutput = stageOutput;

    int device;
    if (cudaGetDevice(&device) != cudaSuccess)
        return;
    cudaDeviceProp deviceProp;
    if (cudaGetDeviceProperties(&deviceProp, device) != cudaSuccess)
        return;

    d.nThreadBlocks = 16*deviceProp.multiProcessorCount;
    if (cudaMalloc((void**)&(d.dgBlockCounts), d.nThreadBlocks*sizeof(unsigned int)) != cudaSuccess) {
        d.dgBlockCounts = 0;
        return;
    }
    d.valid = true;
}
// Releases the device memory owned by a valid plan and marks the plan invalid.
// Calling it again on the same plan, or on a plan that never became valid, does nothing;
// previously a second call would have freed the same device pointer twice.
void OPENMMCUDA_EXPORT destroyCompactionPlan(compactionPlan& d) {
    if (!d.valid)
        return;
    cudaFree(d.dgBlockCounts);
    d.dgBlockCounts = 0;
    d.valid = false;
}
// Copies the elements of dIn whose flag in dValid is non-zero to the front of dOut,
// preserving their order, and stores the number of copied elements in *dNumValid.
//   d         - plan created by planCompaction()
//   dOut      - device output array
//   dIn       - device input array of len elements
//   dValid    - device array of len validity flags
//   len       - number of input elements
//   dNumValid - device location receiving the number of valid elements
// Returns 0 on success and -1 if the plan is invalid (or, for an empty input, if the
// result counter could not be cleared). Kernel launches are not checked for errors.
//
// An empty input is handled up front: previously len == 0 produced numBlocks == 0 and
// the elements-per-block computation divided by zero.
int OPENMMCUDA_EXPORT compactStream(const compactionPlan& d,T* dOut,const T* dIn,const unsigned int* dValid,size_t len,size_t* dNumValid) {
    if (!d.valid) {
        return -1;
    }
    if (len == 0) {
        // Nothing to move; just report zero valid elements.
        return (cudaMemset(dNumValid, 0, sizeof(size_t)) == cudaSuccess) ? 0 : -1;
    }
    // Figure out # elements per block
    unsigned int numBlocks = d.nThreadBlocks;
    if (numBlocks*128 > len)
        numBlocks = (len+127)/128; // fewer blocks when the input would not fill them all
    const size_t eltsPerBlock = len/numBlocks + ((len % numBlocks) ? 1 : 0);
    // TODO: implement loop over blocks of 10M
    // Phase 1: Calculate number of valid elements per thread block
    countElts<<<numBlocks,128>>>(d.dgBlockCounts,dValid,eltsPerBlock,len);
    // Phase 2/3: Move valid elements using SIMD compaction
    if (d.stageOutput) {
        moveValidElementsStaged<<<numBlocks,128>>>(dIn,dOut,dValid,d.dgBlockCounts,eltsPerBlock,len,dNumValid);
    } else {
        moveValidElementsScattered<<<numBlocks,128>>>(dIn,dOut,dValid,d.dgBlockCounts,eltsPerBlock,len,dNumValid);
    }
    return 0;
}
#ifndef __OPENMM_CUDACOMPACT_H__
#define __OPENMM_CUDACOMPACT_H__
/* Code for CUDA stream compaction. Roughly based on:
Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
High Performance Graphics 2009.
Notes:
- paper recommends 128 threads/block, so this is hard coded.
- I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
complicated and performs poorly on current hardware
- I only implement the scattered- and staged-write variants of phase III, as they have reasonable
performance across most of the tested workloads in the paper. The selective variant is not
implemented.
- The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
manner. It is, however, done in a very easy to program manner, and integrated into the top of
phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
dgBlockCounts in stages.
Date: 23 Aug 2009
Author: Imran Haque (ihaque@cs.stanford.edu)
Affiliation: Stanford University
License: Public Domain
*/
#include "windowsExportCuda.h"
// State shared between planCompaction(), compactStream() and destroyCompactionPlan().
struct compactionPlan {
// True once planCompaction() has set the plan up; compactStream() refuses plans where this is false.
bool valid;
// Device array with one valid-element counter per thread block.
unsigned int* dgBlockCounts;
// Maximum number of thread blocks to launch (16 per multiprocessor of the device).
unsigned int nThreadBlocks;
// Selects the staged-write kernel (true) or the scattered-write kernel (false).
bool stageOutput;
};
// Initialises `d` for the current device; stageOutput chooses the write strategy.
extern "C"
void OPENMMCUDA_EXPORT planCompaction(compactionPlan& d,bool stageOutput=true);
// Frees the device memory owned by a valid plan.
extern "C"
void OPENMMCUDA_EXPORT destroyCompactionPlan(compactionPlan& d);
// Moves the elements of dIn whose dValid flag is non-zero to dOut and writes their
// number to *dNumValid. Returns -1 if the plan is not valid, 0 otherwise.
// dOut, dIn, dValid and dNumValid are dereferenced by device kernels.
extern "C"
int OPENMMCUDA_EXPORT compactStream(const compactionPlan& d,unsigned int* dOut,const unsigned int* dIn,const unsigned int* dValid,size_t len,size_t* dNumValid);
#endif // __OPENMM_CUDACOMPACT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
// Initialization
// Declarations only; the definitions live in other translation units.
// NOTE(review): judging by the names these clear accumulators, compute/reduce Born
// sums and generate random numbers - confirm against the definitions.
// NOTE(review): only the entries marked OPENMMCUDA_EXPORT carry the export macro from
// windowsExportCuda.h; presumably the others are not visible outside the library.
extern void OPENMMCUDA_EXPORT kClearForces(gpuContext gpu);
extern void kClearEnergy(gpuContext gpu);
extern void kClearBornSumAndForces(gpuContext gpu);
extern void kClearObcGbsaBornSum(gpuContext gpu);
extern void OPENMMCUDA_EXPORT kCalculateObcGbsaBornSum(gpuContext gpu);
extern void OPENMMCUDA_EXPORT kReduceObcGbsaBornSum(gpuContext gpu);
extern void kCalculateGBVIBornSum(gpuContext gpu);
extern void kReduceGBVIBornSum(gpuContext gpu);
extern void kClearGBVIBornSum( gpuContext gpu );
extern void kGenerateRandoms(gpuContext gpu);
// Main loop
// Declarations of the per-step routines: force calculations, thermostat, constraint
// algorithms and integrator updates (grouping inferred from the names - the
// definitions are outside this header).
extern void kCalculateCDLJObcGbsaForces1(gpuContext gpu);
extern void kCalculateCDLJGBVIForces1(gpuContext gpu);
extern void kCalculateCDLJForces(gpuContext gpu);
extern void kCalculateCMAPTorsionForces(gpuContext gpu, CUDAStream<float4>& coefficients, CUDAStream<int2>& mapPositions, CUDAStream<int4>& torsionIndices, CUDAStream<int>& torsionMaps);
extern void kCalculateCustomBondForces(gpuContext gpu);
extern void kCalculateCustomAngleForces(gpuContext gpu);
extern void kCalculateCustomTorsionForces(gpuContext gpu);
extern void kCalculateCustomExternalForces(gpuContext gpu);
extern void kCalculateCustomNonbondedForces(gpuContext gpu, bool neighborListValid);
extern void kReduceObcGbsaBornForces(gpuContext gpu);
extern void OPENMMCUDA_EXPORT kCalculateObcGbsaForces2(gpuContext gpu);
extern void kCalculateGBVIForces2(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu, CUDAStream<int>& atomGroups);
extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kApplyShake(gpuContext gpu);
extern void kApplyCCMA(gpuContext gpu);
extern void kApplySettle(gpuContext gpu);
extern void kLangevinUpdatePart1(gpuContext gpu);
extern void kLangevinUpdatePart2(gpuContext gpu);
extern void kSelectLangevinStepSize(gpuContext gpu, float maxTimeStep);
extern void kSetVelocitiesFromPositions(gpuContext gpu);
extern void kVerletUpdatePart1(gpuContext gpu);
extern void kVerletUpdatePart2(gpuContext gpu);
extern void kSelectVerletStepSize(gpuContext gpu, float maxTimeStep);
extern void kBrownianUpdatePart1(gpuContext gpu);
extern void kBrownianUpdatePart2(gpuContext gpu);
extern void kScaleAtomCoordinates(gpuContext gpu, float scale, CUDAStream<int>& moleculeAtoms, CUDAStream<int>& moleculeStartIndex);
extern void kApplyConstraints(gpuContext gpu);
// Extras
// Reduction helpers; kReduceEnergy is the only routine in this header that returns a
// value (a double - presumably the total energy; confirm against the definition).
extern void OPENMMCUDA_EXPORT kReduceForces(gpuContext gpu);
extern double kReduceEnergy(gpuContext gpu);
// Initializers
// Each module exposes a Set...Sim / Get...Sim pair taking the gpuContext.
// NOTE(review): presumably these copy the simulation constants between the context and
// the module's device-side copy (Set = upload, Get = download) - confirm against the
// definitions, which are outside this header.
extern void SetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void GetCalculateCDLJObcGbsaForces1Sim(gpuContext gpu);
extern void SetCalculateCDLJForcesSim(gpuContext gpu);
extern void GetCalculateCDLJForcesSim(gpuContext gpu);
extern void SetCalculateCustomBondForcesSim(gpuContext gpu);
extern void GetCalculateCustomBondForcesSim(gpuContext gpu);
extern void SetCalculateCustomAngleForcesSim(gpuContext gpu);
extern void GetCalculateCustomAngleForcesSim(gpuContext gpu);
extern void SetCalculateCustomTorsionForcesSim(gpuContext gpu);
extern void GetCalculateCustomTorsionForcesSim(gpuContext gpu);
extern void SetCalculateCustomExternalForcesSim(gpuContext gpu);
extern void GetCalculateCustomExternalForcesSim(gpuContext gpu);
extern void SetCalculateCustomNonbondedForcesSim(gpuContext gpu);
extern void GetCalculateCustomNonbondedForcesSim(gpuContext gpu);
extern void SetCalculateLocalForcesSim(gpuContext gpu);
extern void GetCalculateLocalForcesSim(gpuContext gpu);
extern void SetCalculateObcGbsaBornSumSim(gpuContext gpu);
extern void GetCalculateObcGbsaBornSumSim(gpuContext gpu);
extern void SetCalculateGBVIBornSumSim(gpuContext gpu);
extern void GetCalculateGBVIBornSumSim(gpuContext gpu);
extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void GetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void SetCalculateGBVIForces2Sim(gpuContext gpu);
extern void GetCalculateGBVIForces2Sim(gpuContext gpu);
extern void SetCalculateAndersenThermostatSim(gpuContext gpu);
extern void GetCalculateAndersenThermostatSim(gpuContext gpu);
extern void SetCalculatePMESim(gpuContext gpu);
extern void GetCalculatePMESim(gpuContext gpu);
extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
extern void GetForcesSim(gpuContext gpu);
extern void SetShakeHSim(gpuContext gpu);
extern void GetShakeHSim(gpuContext gpu);
extern void SetLangevinUpdateSim(gpuContext gpu);
extern void GetLangevinUpdateSim(gpuContext gpu);
extern void SetSettleSim(gpuContext gpu);
extern void GetSettleSim(gpuContext gpu);
extern void SetCCMASim(gpuContext gpu);
extern void GetCCMASim(gpuContext gpu);
extern void SetVerletUpdateSim(gpuContext gpu);
extern void GetVerletUpdateSim(gpuContext gpu);
extern void SetBrownianUpdateSim(gpuContext gpu);
extern void GetBrownianUpdateSim(gpuContext gpu);
extern void SetRandomSim(gpuContext gpu);
extern void GetRandomSim(gpuContext gpu);
// Setters for the custom forces. Each custom force type (bond, angle, torsion,
// external, nonbonded) takes a force expression, an energy expression and a vector of
// global parameter values; the external force takes one force expression per axis.
// NOTE(review): Expression<256> is declared elsewhere; the meaning of 256 is not
// visible from this header.
extern void SetCustomBondForceExpression(const Expression<256>& expression);
extern void SetCustomBondEnergyExpression(const Expression<256>& expression);
extern void SetCustomBondGlobalParams(const std::vector<float>& paramValues);
extern void SetCustomAngleForceExpression(const Expression<256>& expression);
extern void SetCustomAngleEnergyExpression(const Expression<256>& expression);
extern void SetCustomAngleGlobalParams(const std::vector<float>& paramValues);
extern void SetCustomTorsionForceExpression(const Expression<256>& expression);
extern void SetCustomTorsionEnergyExpression(const Expression<256>& expression);
extern void SetCustomTorsionGlobalParams(const std::vector<float>& paramValues);
extern void SetCustomExternalForceExpressions(const Expression<256>& expressionX, const Expression<256>& expressionY, const Expression<256>& expressionZ);
extern void SetCustomExternalEnergyExpression(const Expression<256>& expression);
extern void SetCustomExternalGlobalParams(const std::vector<float>& paramValues);
extern void SetCustomNonbondedForceExpression(const Expression<256>& expression);
extern void SetCustomNonbondedEnergyExpression(const Expression<256>& expression);
extern void SetCustomNonbondedGlobalParams(const std::vector<float>& paramValues);
// Diagnostic printers taking a caller-supplied label, a call counter and a log file.
extern void kPrintGBVI( gpuContext gpu, std::string callId, int call, FILE* log);
extern void kPrintObc( gpuContext gpu, std::string callId, int call, FILE* log);
#ifndef CUDATYPES_H
#define CUDATYPES_H
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdarg.h>
#include <limits>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <builtin_types.h>
#include <vector_functions.h>
#include "openmm/OpenMMException.h"
// Throws an OpenMM::OpenMMException carrying `s` followed by the CUDA error text when
// `status` is not cudaSuccess.
//   status - a cudaError_t expression; evaluated exactly once
//   s      - message prefix (anything accepted by the std::string constructor)
// The body is wrapped in do { } while (0) so the macro behaves as a single statement:
// it can be used in an unbraced if/else without capturing the else branch. Use it with
// a trailing semicolon.
#define RTERROR(status, s) \
    do { \
        const cudaError_t rtErrorStatus_ = (status); \
        if (rtErrorStatus_ != cudaSuccess) { \
            throw OpenMM::OpenMMException(std::string(s) + " " + cudaGetErrorString(rtErrorStatus_)); \
        } \
    } while (0)
// Checks cudaGetLastError() and, if it reports a failure, throws an
// OpenMM::OpenMMException of the form "Error: <cuda error text> launching kernel <s>".
//   s - kernel name appended to the message
// The macro expands to a braced block that declares its own local `status`.
#define LAUNCHERROR(s) \
{ \
cudaError_t status = cudaGetLastError(); \
if (status != cudaSuccess) { \
throw OpenMM::OpenMMException(std::string("Error: ") + cudaGetErrorString(status) + " launching kernel " + s); \
} \
}
// Abstract interface for objects that keep a copy of their data both on the GPU and
// on the CPU.
//   Allocate   - acquire the host and device storage
//   Deallocate - release that storage
//   Upload     - copy host data to the device
//   Download   - copy device data to the host
// The destructor is virtual so that deleting a derived object through a
// SoADeviceObject pointer runs the derived destructor.
struct SoADeviceObject {
    virtual ~SoADeviceObject() {}
    virtual void Allocate() = 0;
    virtual void Deallocate() = 0;
    virtual void Upload() = 0;
    virtual void Download() = 0;
};
// Host/device array pair holding `_subStreams` sub-arrays of `_length` elements each.
// Every sub-array occupies `_stride` elements, where `_stride` is `_length` rounded up
// to a multiple of 16. Storage is acquired in the constructor and released in the
// destructor.
// NOTE(review): the struct owns raw pointers but declares no copy constructor or
// assignment operator, so copying an instance would make two objects free the same
// storage - confirm that no caller copies streams by value.
template <typename T>
struct CUDAStream : public SoADeviceObject
{
// Number of used elements per sub-stream.
unsigned int _length;
// Number of sub-streams.
unsigned int _subStreams;
// Allocated elements per sub-stream (>= _length).
unsigned int _stride;
// Per-sub-stream pointers into the host buffer.
T** _pSysStream;
// Per-sub-stream pointers into the device buffer.
T** _pDevStream;
// Host buffer of _subStreams * _stride elements.
T* _pSysData;
// Device buffer of _subStreams * _stride elements.
T* _pDevData;
// Label used as a prefix in error messages.
std::string _name;
// The four overloads accept every int / unsigned int combination of the first two arguments.
CUDAStream(int length, int subStreams = 1, std::string name="");
CUDAStream(unsigned int length, unsigned int subStreams = 1, std::string name="");
CUDAStream(unsigned int length, int subStreams = 1, std::string name="");
CUDAStream(int length, unsigned int subStreams = 1, std::string name="");
virtual ~CUDAStream();
void Allocate();
void Deallocate();
// Host -> device copy of the whole buffer.
void Upload();
// Device -> host copy of the whole buffer.
void Download();
// Device -> device copy from another stream.
void CopyFrom(const CUDAStream<T>& src);
// Re-partitions the host data into `newstreams` sub-streams.
void Collapse(unsigned int newstreams = 1, unsigned int interleave = 1);
// Indexes the host buffer directly (no bounds check).
T& operator[](int index);
};
// Compares two float streams; defined elsewhere.
// NOTE(review): the meaning of the return value and of maxindex == 0 is not visible
// from this declaration - confirm against the definition.
float CompareStreams(CUDAStream<float>& s1, CUDAStream<float>& s2, float tolerance, unsigned int maxindex = 0);
// Constructors. All four overloads do the same thing and differ only in the
// signedness of their first two parameters: they record the length, sub-stream count
// and name, round the length up to a multiple of 16 to obtain the stride, and then
// acquire the storage through Allocate().

// (int length, unsigned int subStreams)
template <typename T>
CUDAStream<T>::CUDAStream(int length, unsigned int subStreams, std::string name)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0),
      _name(name)
{
    Allocate();
}

// (unsigned int length, int subStreams)
template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, int subStreams, std::string name)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0),
      _name(name)
{
    Allocate();
}

// (unsigned int length, unsigned int subStreams)
template <typename T>
CUDAStream<T>::CUDAStream(unsigned int length, unsigned int subStreams, std::string name)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0),
      _name(name)
{
    Allocate();
}

// (int length, int subStreams)
template <typename T>
CUDAStream<T>::CUDAStream(int length, int subStreams, std::string name)
    : _length(length),
      _subStreams(subStreams),
      _stride((length + 0xf) & 0xfffffff0),
      _name(name)
{
    Allocate();
}
// Releases the stream's storage.
// Deallocate() reports CUDA failures by throwing; an exception must not leave a
// destructor (it would terminate the program during stack unwinding, and always under
// C++11 rules), so a failure here is reported on stderr and otherwise ignored.
template <typename T>
CUDAStream<T>::~CUDAStream()
{
    try
    {
        Deallocate();
    }
    catch (...)
    {
        fprintf(stderr, "%s: failed to release CUDAStream storage\n", _name.c_str());
    }
}
// Acquires the host arrays and the device buffer (_subStreams * _stride elements each)
// and points the per-sub-stream tables at the start of every sub-stream.
// Throws OpenMM::OpenMMException if the device allocation fails. In that case the
// host arrays are released again and all pointers are reset to NULL; previously they
// leaked, because the exception leaves the constructor and the destructor never runs.
template <typename T>
void CUDAStream<T>::Allocate()
{
    _pSysStream = new T*[_subStreams];
    _pDevStream = new T*[_subStreams];
    _pSysData = new T[_subStreams * _stride];

    const cudaError_t status = cudaMalloc((void **) &_pDevData, _stride * _subStreams * sizeof(T));
    if (status != cudaSuccess)
    {
        // Undo the host allocations before reporting the failure.
        delete[] _pSysStream;
        _pSysStream = NULL;
        delete[] _pDevStream;
        _pDevStream = NULL;
        delete[] _pSysData;
        _pSysData = NULL;
        _pDevData = NULL;
    }
    RTERROR(status, (_name+": cudaMalloc in CUDAStream::Allocate failed").c_str());

    for (unsigned int i = 0; i < _subStreams; i++)
    {
        _pSysStream[i] = _pSysData + i * _stride;
        _pDevStream[i] = _pDevData + i * _stride;
    }
}
// Releases the host arrays and the device buffer and resets every pointer to NULL.
// Throws OpenMM::OpenMMException if cudaFree reports an error.
// The device pointer is cleared as well (before the error check), so calling
// Deallocate() a second time - for instance explicitly and then again from the
// destructor - no longer frees the same device memory twice.
template <typename T>
void CUDAStream<T>::Deallocate()
{
    delete[] _pSysStream;
    _pSysStream = NULL;
    delete[] _pDevStream;
    _pDevStream = NULL;
    delete[] _pSysData;
    _pSysData = NULL;

    // cudaFree on a null pointer is a no-op, which makes repeated calls harmless.
    const cudaError_t status = cudaFree(_pDevData);
    _pDevData = NULL;
    RTERROR(status, (_name+": cudaFree in CUDAStream::Deallocate failed").c_str());
}
// Copies the complete host buffer (_stride * _subStreams elements) to the device.
// Throws OpenMM::OpenMMException if the copy fails.
template <typename T>
void CUDAStream<T>::Upload()
{
    const cudaError_t status = cudaMemcpy(_pDevData,
                                          _pSysData,
                                          _stride * _subStreams * sizeof(T),
                                          cudaMemcpyHostToDevice);
    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Upload failed").c_str());
}
// Copies the complete device buffer (_stride * _subStreams elements) back to the host.
// Throws OpenMM::OpenMMException if the copy fails.
template <typename T>
void CUDAStream<T>::Download()
{
    const cudaError_t status = cudaMemcpy(_pSysData,
                                          _pDevData,
                                          _stride * _subStreams * sizeof(T),
                                          cudaMemcpyDeviceToHost);
    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Download failed").c_str());
}
// Copies src's device buffer into this stream's device buffer. The number of bytes
// copied is determined by THIS stream's stride and sub-stream count.
// NOTE(review): src is assumed to be at least as large as this stream - there is no check.
// Throws OpenMM::OpenMMException if the copy fails.
template <typename T>
void CUDAStream<T>::CopyFrom(const CUDAStream<T>& src)
{
    const cudaError_t status = cudaMemcpy(_pDevData,
                                          src._pDevData,
                                          _stride * _subStreams * sizeof(T),
                                          cudaMemcpyDeviceToDevice);
    RTERROR(status, (_name+": cudaMemcpy in CUDAStream::Copy failed").c_str());
}
// Re-partitions the host-side data from _subStreams sub-streams into
// `newstreams` sub-streams, updating _stride, _length and _subStreams to match.
// Elements are read in (element index, sub-stream) order and dealt round-robin
// across the new sub-streams. Only host data and the pointer tables are
// touched; no CUDA call is made here, so the device copy is not refreshed.
// The `interleave` parameter is not used by this implementation.
// NOTE(review): newstreams == 0 divides by zero, and the pointer tables were
// sized for the original _subStreams in Allocate(), so newstreams larger than
// that would write past them - confirm callers only ever reduce the count.
template <typename T>
void CUDAStream<T>::Collapse(unsigned int newstreams, unsigned int interleave)
{
// Scratch copy holding the data in its new layout
T* pTemp = new T[_subStreams * _stride];
unsigned int stream = 0;
unsigned int pos = 0;
unsigned int newstride = _stride * _subStreams / newstreams;
unsigned int newlength = _length * _subStreams / newstreams;
// Copy data into new format
for (unsigned int i = 0; i < _length; i++)
{
for (unsigned int j = 0; j < _subStreams; j++)
{
pTemp[stream * newstride + pos] = _pSysStream[j][i];
stream++;
// Wrap around to the first new sub-stream and advance the write position
if (stream == newstreams)
{
stream = 0;
pos++;
}
}
}
// Remap stream pointers; must happen before the copy back, which writes through them
for (unsigned int i = 0; i < newstreams; i++)
{
_pSysStream[i] = _pSysData + i * newstride;
_pDevStream[i] = _pDevData + i * newstride;
}
// Copy data back into original stream
for (unsigned int i = 0; i < newlength; i++)
for (unsigned int j = 0; j < newstreams; j++)
_pSysStream[j][i] = pTemp[j * newstride + i];
_stride = newstride;
_length = newlength;
_subStreams = newstreams;
delete[] pTemp;
}
// Returns a mutable reference to host-side element `index`, counted from the
// start of the flat host buffer. No bounds check is performed.
template <typename T>
T& CUDAStream<T>::operator[](int index)
{
    T* const element = _pSysData + index;
    return *element;
}
// Tile width and its base-2 logarithm (32 == 1 << 5).
static const unsigned int GRID = 32;
static const unsigned int GRIDBITS = 5;
// Launch-configuration defaults, one value per hardware family prefix
// (G8X / GT2XX / GF1XX - presumably successive NVIDIA GPU generations; confirm
// against the code that selects between them).
static const int G8X_BLOCKS_PER_SM = 1;
static const int GT2XX_BLOCKS_PER_SM = 1;
static const int GF1XX_BLOCKS_PER_SM = 1;
// Threads per block for the nonbonded kernels.
static const int G8X_NONBOND_THREADS_PER_BLOCK = 256;
static const int GT2XX_NONBOND_THREADS_PER_BLOCK = 320;
static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 768;
//static const int GF1XX_NONBOND_THREADS_PER_BLOCK = 768;
// Threads per block for the Born Force 2 kernels.
static const int G8X_BORNFORCE2_THREADS_PER_BLOCK = 256;
static const int GT2XX_BORNFORCE2_THREADS_PER_BLOCK = 320;
static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 768;
//static const int GF1XX_BORNFORCE2_THREADS_PER_BLOCK = 768;
// Threads per block for the SHAKE kernels.
static const int G8X_SHAKE_THREADS_PER_BLOCK = 128;
static const int GT2XX_SHAKE_THREADS_PER_BLOCK = 256;
static const int GF1XX_SHAKE_THREADS_PER_BLOCK = 512;
// Threads per block for the update kernels.
static const int G8X_UPDATE_THREADS_PER_BLOCK = 192;
static const int GT2XX_UPDATE_THREADS_PER_BLOCK = 384;
static const int GF1XX_UPDATE_THREADS_PER_BLOCK = 768;
// Threads per block for the local-forces kernels.
static const int G8X_LOCALFORCES_THREADS_PER_BLOCK = 192;
static const int GT2XX_LOCALFORCES_THREADS_PER_BLOCK = 384;
static const int GF1XX_LOCALFORCES_THREADS_PER_BLOCK = 768;
// Default threads per block.
static const int G8X_THREADS_PER_BLOCK = 256;
static const int GT2XX_THREADS_PER_BLOCK = 256;
static const int GF1XX_THREADS_PER_BLOCK = 512;
// Threads per block for the random-number kernels.
static const int G8X_RANDOM_THREADS_PER_BLOCK = 256;
static const int GT2XX_RANDOM_THREADS_PER_BLOCK = 384;
static const int GF1XX_RANDOM_THREADS_PER_BLOCK = 768;
// Nonbonded work units per SM.
static const int G8X_NONBOND_WORKUNITS_PER_SM = 220;
static const int GT2XX_NONBOND_WORKUNITS_PER_SM = 256;
static const int GF1XX_NONBOND_WORKUNITS_PER_SM = 768;
// Limits for custom-expression evaluation and tabulated functions.
static const unsigned int MAX_STACK_SIZE = 8;
static const unsigned int MAX_TABULATED_FUNCTIONS = 4;
// Pi, truncated to single precision.
static const float PI = 3.14159265358979323846f;
// Interpolation order for particle mesh Ewald (presumably the B-spline order - confirm).
static const int PME_ORDER = 5;
// Selects how nonbonded interactions are handled; stored in
// cudaGmxSimulation::nonbondedMethod and ::customNonbondedMethod.
// Enumerators are implicitly numbered from 0 in the order listed.
enum CudaNonbondedMethod
{
NO_CUTOFF,
CUTOFF,
PERIODIC,
EWALD,
PARTICLE_MESH_EWALD
};
// Opcodes for compiled custom expressions (stored in Expression::op).
// VARIABLE0..VARIABLE8 occupy the consecutive values 0..8, so a variable's
// opcode equals VARIABLE0 plus its index; the remaining enumerators follow in
// the order listed.
enum ExpressionOp {
VARIABLE0 = 0, VARIABLE1, VARIABLE2, VARIABLE3, VARIABLE4, VARIABLE5, VARIABLE6, VARIABLE7, VARIABLE8, MULTIPLY, DIVIDE, ADD, SUBTRACT, POWER, MULTIPLY_CONSTANT, POWER_CONSTANT, ADD_CONSTANT,
GLOBAL, CONSTANT, CUSTOM, CUSTOM_DERIV, NEGATE, RECIPROCAL, SQRT, EXP, LOG, SQUARE, CUBE, STEP, SIN, COS, SEC, CSC, TAN, COT, ASIN, ACOS, ATAN, SINH, COSH, TANH, ERF, ERFC,
MIN, MAX, ABS
};
// Fixed-capacity, plain-data form of a compiled expression holding at most
// SIZE operations.
template<int SIZE>
struct Expression {
int op[SIZE]; // Opcode of each operation (presumably ExpressionOp values - confirm)
float arg[SIZE]; // Optional numeric argument of each operation
int length, stackSize; // Number of operations in use, and the evaluation stack depth required
};
// Plain-data description of one simulation: launch configuration, force-field
// parameters and pointers to the per-atom and per-interaction arrays.
// NOTE(review): the pointer members appear to hold device addresses (they are
// assigned from CUDAStream::_pDevStream elsewhere in this code) - confirm
// before dereferencing any of them on the host.
struct cudaGmxSimulation {
// Constants
unsigned int atoms; // Number of atoms
unsigned int paddedNumberOfAtoms; // Padded number of atoms
unsigned int blocks; // Number of blocks to launch across linear kernels
unsigned int blocksPerSM; // Number of blocks per SM (presumably streaming multiprocessor; the original comment read "share memory")
unsigned int nonbond_blocks; // Number of blocks to launch across CDLJ and Born Force Part1
unsigned int bornForce2_blocks; // Number of blocks to launch across Born Force 2
unsigned int interaction_blocks; // Number of blocks to launch when identifying interacting tiles
unsigned int threads_per_block; // Threads per block to launch
unsigned int nonbond_threads_per_block; // Threads per block in nonbond kernel calls
unsigned int bornForce2_threads_per_block; // Threads per block in Born Force 2 kernel calls
unsigned int max_update_threads_per_block; // Maximum threads per block in update kernel calls
unsigned int update_threads_per_block; // Threads per block in update kernel calls
unsigned int bf_reduce_threads_per_block; // Threads per block in Born Force reduction calls
unsigned int bsf_reduce_threads_per_block; // Threads per block in Born Sum And Forces reduction calls
unsigned int max_shake_threads_per_block; // Maximum threads per block in shake kernel calls
unsigned int shake_threads_per_block; // Threads per block in shake kernel calls
unsigned int settle_threads_per_block; // Threads per block in SETTLE kernel calls
unsigned int ccma_threads_per_block; // Threads per block in CCMA kernel calls
unsigned int max_localForces_threads_per_block; // Maximum threads per block in local forces kernel calls
unsigned int localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int random_threads_per_block; // Threads per block in RNG kernel calls
unsigned int interaction_threads_per_block; // Threads per block when identifying interacting tiles
unsigned int custom_exception_threads_per_block; // Threads per block in custom nonbonded exception kernel calls
unsigned int customExpressionStackSize; // Stack size for evaluating custom nonbonded forces
unsigned int workUnits; // Number of work units
unsigned int* pWorkUnit; // Pointer to work units
unsigned int* pInteractingWorkUnit; // Pointer to work units that have interactions
unsigned int* pInteractionFlag; // Flags for which work units have interactions
float2* pStepSize; // The size of the previous and current time steps
float* pLangevinParameters; // Parameters used for Langevin integration
float errorTol; // Error tolerance for selecting the step size
size_t* pInteractionCount; // A count of the number of work units which have interactions
unsigned int nonbond_workBlock; // Number of work units running simultaneously per block in CDLJ and Born Force Part 1
unsigned int bornForce2_workBlock; // Number of work units running second half of Born Forces calculation
unsigned int workUnitsPerSM; // Number of workblocks per SM
unsigned int nbWorkUnitsPerBlock; // Number of work units assigned to each nonbond block
unsigned int nbWorkUnitsPerBlockRemainder; // Remainder of work units to assign across lower numbered nonbond blocks
unsigned int bf2WorkUnitsPerBlock; // Number of work units assigned to each bornForce2 block
unsigned int bf2WorkUnitsPerBlockRemainder; // Remainder of work units to assign across lower numbered bornForce2 blocks
unsigned int stride; // Atomic attributes stride
unsigned int stride2; // Atomic attributes stride x 2
unsigned int stride3; // Atomic attributes stride x 3
unsigned int stride4; // Atomic attributes stride x 4
unsigned int nonbondOutputBuffers; // Nonbond output buffers per nonbond call
unsigned int outputBuffers; // Number of output buffers
unsigned int energyOutputBuffers; // Number of energy output buffers
float bigFloat; // Floating point value used as a flag for Shaken atoms
float epsfac; // Epsilon factor for CDLJ calculations
CudaNonbondedMethod nonbondedMethod; // How to handle nonbonded interactions
CudaNonbondedMethod customNonbondedMethod; // How to handle custom nonbonded interactions
float nonbondedCutoff; // Cutoff distance for nonbonded interactions
float nonbondedCutoffSqr; // Square of the cutoff distance for nonbonded interactions
float periodicBoxSizeX; // The X dimension of the periodic box
float periodicBoxSizeY; // The Y dimension of the periodic box
float periodicBoxSizeZ; // The Z dimension of the periodic box
float invPeriodicBoxSizeX; // 1 over the X dimension of the periodic box
float invPeriodicBoxSizeY; // 1 over the Y dimension of the periodic box
float invPeriodicBoxSizeZ; // 1 over the Z dimension of the periodic box
float recipBoxSizeX; // The X dimension of the reciprocal box for Ewald summation
float recipBoxSizeY; // The Y dimension of the reciprocal box for Ewald summation
float recipBoxSizeZ; // The Z dimension of the reciprocal box for Ewald summation
float cellVolume; // Presumably the volume of the periodic cell (the original comment duplicated alphaEwald's) - TODO confirm
float alphaEwald; // Ewald parameter alpha (a.k.a. kappa)
float factorEwald; // Presumably -1 / (4 * alphaEwald * alphaEwald); the original comment omitted the operator - TODO confirm
int kmaxX; // Maximum number of reciprocal vectors in the X direction
int kmaxY; // Maximum number of reciprocal vectors in the Y direction
int kmaxZ; // Maximum number of reciprocal vectors in the Z direction
float reactionFieldK; // Constant for reaction field correction
float reactionFieldC; // Constant for reaction field correction
float probeRadius; // SASA probe radius
float surfaceAreaFactor; // ACE approximation surface area factor
float electricConstant; // ACE approximation electric constant
float forceConversionFactor; // kJ to kcal force conversion factor
float preFactor; // Born electrostatic pre-factor
float dielectricOffset; // Born dielectric offset
float alphaOBC; // OBC alpha factor
float betaOBC; // OBC beta factor
float gammaOBC; // OBC gamma factor
float deltaT; // Molecular dynamics deltaT constant
float oneOverDeltaT; // 1/deltaT
float T; // Temperature
float kT; // Boltzmann's constant times T
float noiseAmplitude; // The magnitude of the noise for Brownian dynamics
float tau; // Inverse friction for Langevin or Brownian dynamics
float tauDeltaT; // tau*deltaT
float collisionFrequency; // Collision frequency for Andersen thermostat
float2* pObcData; // Pointer to fixed Born data
int gbviBornRadiusScalingMethod; // Scaling method for GB/VI Born radii
float gbviQuinticLowerLimitFactor; // Lower limit factor for scaling of GB/VI Born radii using quintic spline
float gbviQuinticUpperBornRadiusLimit;// Upper limit for GB/VI Born radii
float4* pGBVIData; // Pointer to fixed Born data for GB/VI algorithm
float* pGBVISwitchDerivative; // Pointer to GB/VI Born switch derivatives
float2* pAttr; // Pointer to additional atom attributes (sig, eps)
float4* pCustomParams; // Pointer to atom parameters for custom nonbonded force
unsigned int customExceptions; // Number of custom nonbonded exceptions
unsigned int customParameters; // Number of parameters for custom nonbonded interactions
int4* pCustomBondID; // Atom indices for custom bonds
float4* pCustomBondParams; // Parameters for custom bonds
unsigned int customBonds; // Number of custom bonds
unsigned int customBondParameters; // Number of parameters for custom bonds
int4* pCustomAngleID1; // Atom indices for custom angles
int2* pCustomAngleID2; // Atom indices for custom angles
float4* pCustomAngleParams; // Parameters for custom angles
unsigned int customAngles; // Number of custom angles
unsigned int customAngleParameters; // Number of parameters for custom angles
int4* pCustomTorsionID1; // Atom indices for custom torsions
int4* pCustomTorsionID2; // Atom indices for custom torsions
float4* pCustomTorsionParams; // Parameters for custom torsions
unsigned int customTorsions; // Number of custom torsions
unsigned int customTorsionParameters; // Number of parameters for custom torsions
int* pCustomExternalID; // Atom indices for custom external force
float4* pCustomExternalParams; // Parameters for custom external force
unsigned int customExternals; // Number of particles for custom external force
unsigned int customExternalParameters; // Number of parameters for custom external force
float4* pTabulatedFunctionCoefficients[MAX_TABULATED_FUNCTIONS]; // The spline coefficients for each tabulated function
float4* pTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
float2* pEwaldCosSinSum; // Pointer to the cos/sin sums (ewald)
float* pTabulatedErfc; // Tabulated values for erfc()
int tabulatedErfcSize; // The number of tabulated values for erfc()
float tabulatedErfcScale; // Scale factor for the argument to erfc()
int3 pmeGridSize; // The dimensions of the grid for particle mesh Ewald
int3 pmeGroupSize; // The dimensions of the groups used in charge spreading for PME
cufftComplex* pPmeGrid; // Grid points for particle mesh Ewald
float* pPmeBsplineModuli[3]; // PME B-spline moduli, presumably one array per axis - TODO confirm
float4* pPmeBsplineTheta; // PME B-spline values (presumably the spline weights - TODO confirm)
float4* pPmeBsplineDtheta; // PME B-spline derivative values (presumably - TODO confirm)
int* pPmeAtomRange; // The range of sorted atoms at each grid point
int2* pPmeAtomGridIndex; // The grid point each atom is at
unsigned int bonds; // Number of bonds
int4* pBondID; // Bond atom and output buffer IDs
float2* pBondParameter; // Bond parameters
unsigned int bond_angles; // Number of bond angles
int4* pBondAngleID1; // Bond angle atom and first output buffer IDs
int2* pBondAngleID2; // Bond angle output buffer IDs
float2* pBondAngleParameter; // Bond angle parameters
unsigned int dihedrals; // Number of dihedrals
int4* pDihedralID1; // Dihedral IDs
int4* pDihedralID2; // Dihedral output buffer IDs
float4* pDihedralParameter; // Dihedral parameters
unsigned int rb_dihedrals; // Number of Ryckaert Bellemans dihedrals
int4* pRbDihedralID1; // Ryckaert Bellemans Dihedral IDs
int4* pRbDihedralID2; // Ryckaert Bellemans Dihedral output buffer IDs
float4* pRbDihedralParameter1; // Ryckaert Bellemans Dihedral parameters
float2* pRbDihedralParameter2; // Ryckaert Bellemans Dihedral parameters
unsigned int LJ14s; // Number of Lennard Jones 1-4 interactions
int4* pLJ14ID; // Lennard Jones 1-4 atom and output buffer IDs
float4* pLJ14Parameter; // Lennard Jones 1-4 parameters
float inverseTotalMass; // Used in linear momentum removal
unsigned int ShakeConstraints; // Total number of Shake constraints
unsigned int settleConstraints; // Total number of Settle constraints
unsigned int ccmaConstraints; // Total number of CCMA constraints.
unsigned int rigidClusters; // Total number of rigid clusters
unsigned int maxRigidClusterSize; // The size of the largest rigid cluster
unsigned int clusterShakeBlockSize; // The number of threads to process each rigid cluster
unsigned int maxShakeIterations; // Maximum shake iterations
unsigned int degreesOfFreedom; // Number of degrees of freedom in system
float shakeTolerance; // Shake tolerance
float InvMassJ; // Shake inverse mass for hydrogens
int* pNonShakeID; // Not Shaking atoms
int4* pShakeID; // Shake atoms and phase
float4* pShakeParameter; // Shake parameters
int4* pSettleID; // Settle atoms
float2* pSettleParameter; // Settle parameters
unsigned int* pExclusion; // Nonbond exclusion data
unsigned int* pExclusionIndex; // Index of exclusion data for each work unit
unsigned int bond_offset; // Offset to end of bonds
unsigned int bond_angle_offset; // Offset to end of bond angles
unsigned int dihedral_offset; // Offset to end of dihedrals
unsigned int rb_dihedral_offset; // Offset to end of Ryckaert Bellemans dihedrals
unsigned int LJ14_offset; // Offset to end of Lennard Jones 1-4 parameters
int* pAtomIndex; // The original index of each atom
float4* pGridBoundingBox; // The size of each grid cell
float4* pGridCenter; // The center of each grid cell
int2* pCcmaAtoms; // The atoms connected by each CCMA constraint
float4* pCcmaDistance; // The displacement vector (x, y, z) and constraint distance (w) for each CCMA constraint
float* pCcmaDelta1; // Workspace for CCMA
float* pCcmaDelta2; // Workspace for CCMA
int* pCcmaAtomConstraints; // The indices of constraints involving each atom
int* pCcmaNumAtomConstraints; // The number of constraints involving each atom
int* ccmaConvergedDeviceMarker; // Device memory used to communicate that CCMA has converged
float* pCcmaReducedMass; // The reduced mass for each CCMA constraint
unsigned int* pConstraintMatrixColumn; // The column of each element in the constraint matrix.
float* pConstraintMatrixValue; // The value of each element in the constraint matrix.
// Mutable stuff
float4* pPosq; // Pointer to atom positions and charges
float4* pPosqP; // Pointer to mid-integration atom positions
float4* pOldPosq; // Pointer to old atom positions
float4* pVelm4; // Pointer to atom velocity and inverse mass
float4* pForce4; // Pointer to force data
float* pEnergy; // Pointer to energy output buffer
float* pBornForce; // Pointer to Born force data
float* pBornSum; // Pointer to Born Radii calculation output buffers
float* pBornRadii; // Pointer to Born Radii
float* pObcChain; // Pointer to OBC chain data
float4* pLinearMomentum; // Pointer to linear momentum
// Random numbers
float4* pRandom4; // Pointer to 4 random numbers
float2* pRandom2; // Pointer to 2 random numbers
uint4* pRandomSeed; // Pointer to random seeds
int* pRandomPosition; // Pointer to random number positions
unsigned int randoms; // Number of randoms
unsigned int totalRandoms; // Number of randoms plus overflow.
unsigned int randomIterations; // Number of iterations before regenerating randoms
unsigned int randomFrames; // Number of frames of random numbers
};
// A bundle of three float3 vectors.
// NOTE(review): nothing in this part of the file shows how v0..v2 are used - confirm their meaning at the call sites.
struct Vectors {
float3 v0;
float3 v1;
float3 v2;
};
#endif
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cmath>
#include <map>
#include <set>
#include <algorithm>
#ifdef WIN32
#define _USE_MATH_DEFINES /* M_PI */
#include <math.h>
#include <windows.h>
#else
#include <stdint.h>
#endif
using namespace std;
#include "gputypes.h"
#include "cudaKernels.h"
#include "hilbert.h"
#include "openmm/OpenMMException.h"
#include "openmm/internal/SplineFitter.h"
#include "quern.h"
#include "Lepton.h"
#include "rng.h"
#include "../CudaForceInfo.h"
// In case we're using some primitive version of Visual Studio this will
// make sure that erf() and erfc() are defined.
#include "openmm/internal/MSVC_erfc.h"
#include "openmm/internal/windowsExport.h"
using OpenMM::OpenMMException;
using Lepton::Operation;
/**
 * A candidate SHAKE cluster: one central atom plus up to three peripheral
 * atoms, all of which must share the same distance and the same inverse mass.
 * The cluster is flagged invalid as soon as an added atom breaks that rule.
 */
struct ShakeCluster {
    int centralID;            // Index of the central atom (-1 until set)
    int peripheralID[3];      // Indices of the peripheral atoms; only the first `size` are meaningful
    int size;                 // Number of peripheral atoms stored so far (0..3)
    bool valid;               // False once the cluster has been found unusable
    float distance;           // Distance shared by every peripheral atom
    float centralInvMass, peripheralInvMass;

    /**
     * Create an empty, valid cluster with no central atom. Every member is
     * initialized so that addAtom() never reads an indeterminate `size`.
     */
    ShakeCluster() : centralID(-1), size(0), valid(true), distance(0.0f), centralInvMass(0.0f), peripheralInvMass(0.0f) {
        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
    }

    /**
     * Create an empty, valid cluster around the given central atom.
     * Initializers are listed in member declaration order.
     */
    ShakeCluster(int centralID, float invMass) : centralID(centralID), size(0), valid(true), distance(0.0f), centralInvMass(invMass), peripheralInvMass(0.0f) {
        peripheralID[0] = peripheralID[1] = peripheralID[2] = -1;
    }

    /**
     * Add a peripheral atom. The cluster becomes invalid (and the atom is not
     * stored) if it already holds three atoms, or if `dist` or `invMass`
     * differs from the values recorded for the atoms added earlier. The
     * comparisons are deliberately exact.
     */
    void addAtom(int id, float dist, float invMass) {
        if (size == 3 || (size > 0 && dist != distance) || (size > 0 && invMass != peripheralInvMass))
            valid = false;
        else {
            peripheralID[size++] = id;
            distance = dist;
            peripheralInvMass = invMass;
        }
    }
};
// A constraint between two atoms, identified by their indices.
// `distance2` is presumably the squared constraint distance - confirm at the call sites.
struct Constraint
{
    int atom1, atom2;
    float distance2;

    // Stores the three values unchanged.
    Constraint(int first, int second, float dist2)
        : atom1(first), atom2(second), distance2(dist2)
    {
    }
};
/**
 * Strict-weak-ordering comparator over positions in a constraint list.
 *
 * The arguments x and y are positions in `constraints`; each maps to a
 * constraint index, and constraints are ordered by their first atom, with ties
 * broken by their second atom.
 *
 * The three vectors are held by reference, so they must outlive the comparator.
 * The typedefs below provide what the std::binary_function base used to supply;
 * that base is deprecated in C++11 and removed in C++17.
 */
struct ConstraintOrderer {
    typedef int first_argument_type;
    typedef int second_argument_type;
    typedef bool result_type;

    const std::vector<int>& atom1;
    const std::vector<int>& atom2;
    const std::vector<int>& constraints;

    ConstraintOrderer(const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& constraints) : atom1(atom1), atom2(atom2), constraints(constraints) {
    }

    // const-qualified: comparing does not modify the comparator, and standard
    // algorithms are free to invoke it through a const object.
    bool operator()(int x, int y) const {
        const int ix = constraints[x];
        const int iy = constraints[y];
        if (atom1[ix] != atom1[iy])
            return atom1[ix] < atom1[iy];
        return atom2[ix] < atom2[iy];
    }
};
// Bookkeeping for one molecule, stored as lists of integer indices.
struct Molecule {
vector<int> atoms; // Presumably indices of the atoms in this molecule - confirm where it is filled in
vector<int> constraints; // Presumably indices of the constraints in this molecule - confirm
vector<vector<int> > groups; // One inner list of indices per group; what a group represents is not shown here
};
// File-local physical constants and default parameters.
// NOTE(review): units are not stated in this part of the file; confirm before reusing any value.
static const float dielectricOffset = 0.009f;
static const float probeRadius = 0.14f;
static const float forceConversionFactor = 0.4184f;
// Earlier candidate values for surfaceAreaFactor are kept below for reference.
//static const float surfaceAreaFactor = -6.0f * 0.06786f * forceConversionFactor * 1000.0f; // PI * 4.0f * 0.0049f * 1000.0f;
//static const float surfaceAreaFactor = -6.0f * PI * 4.0f * 0.0049f * 1000.0f;
static const float surfaceAreaFactor = -6.0f*PI*0.0216f*1000.0f*0.4184f;
//static const float surfaceAreaFactor = -1.7035573959e+001;
//static const float surfaceAreaFactor = -166.03185f;
//static const float surfaceAreaFactor = 1.0f;
// OBC alpha, beta and gamma factors.
static const float alphaOBC = 1.0f;
static const float betaOBC = 0.8f;
static const float gammaOBC = 4.85f;
// NOTE(review): the name suggests a kcal/mol to kJ/nm conversion, but the value is negative - confirm the sign is intended.
static const float kcalMolTokJNM = -0.4184f;
static const float electricConstant = -166.03185f;
static const float defaultInnerDielectric = 1.0f;
static const float defaultSolventDielectric = 78.3f;
static const float KILO = 1e3; // Thousand
static const float BOLTZMANN = 1.380658e-23f; // Boltzmann constant (J/K)
static const float AVOGADRO = 6.0221367e23f; // Avogadro constant (1/mol)
static const float RGAS = BOLTZMANN * AVOGADRO; // Gas constant (J/(mol K))
static const float BOLTZ = (RGAS / KILO); // Gas constant (kJ/(mol K))
// Set to 1 to print every bonded parameter to stdout as it is loaded.
#define DUMP_PARAMETERS 0
/**
 * Convert a parsed Lepton program into the fixed-size Expression<SIZE> form.
 *
 * @param gpu               context whose tabulatedFunctions table is used to resolve custom functions
 * @param expression        the original expression text, used only in error messages
 * @param program           the compiled Lepton program to translate
 * @param variables         names of the local variables; only the first nine can be
 *                          referenced (opcodes VARIABLE0..VARIABLE8)
 * @param globalParamNames  names of the global parameters; a reference is encoded as GLOBAL
 *                          with the parameter's index as argument
 * @param maxStackSize      raised to this program's stack size if that is larger
 * @return the translated expression; unused op/arg slots are zero
 * @throws OpenMMException if the program has more than SIZE operations, refers to an unknown
 *         variable or tabulated function, or contains an operation with no opcode here
 */
template <int SIZE>
static Expression<SIZE> createExpression(gpuContext gpu, const string& expression, const Lepton::ExpressionProgram& program, const vector<string>& variables,
        const vector<string>& globalParamNames, unsigned int& maxStackSize) {
    // Value-initialize so every slot that is not explicitly written holds zero
    // instead of an indeterminate value.
    Expression<SIZE> exp = Expression<SIZE>();
    const int numOperations = program.getNumOperations();
    if (numOperations > SIZE)
        throw OpenMMException("Expression contains too many operations: "+expression);
    exp.length = numOperations;
    exp.stackSize = program.getStackSize();
    if (exp.stackSize > (int) maxStackSize)
        maxStackSize = exp.stackSize;

    // VARIABLE0..VARIABLE8 are consecutive opcodes, so only the first nine
    // local variables can be addressed directly.
    const size_t numLocalVariables = (variables.size() < 9 ? variables.size() : 9);

    for (int i = 0; i < numOperations; i++) {
        const Operation& op = program.getOperation(i);
        switch (op.getId()) {
            case Operation::CONSTANT:
                exp.op[i] = CONSTANT;
                exp.arg[i] = (float) dynamic_cast<const Operation::Constant*>(&op)->getValue();
                break;
            case Operation::VARIABLE: {
                const string varName = op.getName();
                size_t local = 0;
                while (local < numLocalVariables && varName != variables[local])
                    local++;
                if (local < numLocalVariables)
                    exp.op[i] = VARIABLE0 + (int) local;
                else {
                    // Not a local variable: it must be a global parameter.
                    size_t global = 0;
                    while (global < globalParamNames.size() && varName != globalParamNames[global])
                        global++;
                    if (global == globalParamNames.size())
                        throw OpenMMException("Unknown variable '"+varName+"' in expression: "+expression);
                    exp.op[i] = GLOBAL;
                    exp.arg[i] = (float) global;
                }
                break;
            }
            case Operation::CUSTOM: {
                exp.op[i] = dynamic_cast<const Operation::Custom*>(&op)->getDerivOrder()[0] == 0 ? CUSTOM : CUSTOM_DERIV;
                const string functionName = op.getName();
                unsigned int function = 0;
                while (function < MAX_TABULATED_FUNCTIONS && functionName != gpu->tabulatedFunctions[function].name)
                    function++;
                // Previously an unmatched name left the argument unset; report it instead.
                if (function == MAX_TABULATED_FUNCTIONS)
                    throw OpenMMException("Unknown function '"+functionName+"' in expression: "+expression);
                exp.arg[i] = (float) function;
                break;
            }
            case Operation::ADD:        exp.op[i] = ADD;        break;
            case Operation::SUBTRACT:   exp.op[i] = SUBTRACT;   break;
            case Operation::MULTIPLY:   exp.op[i] = MULTIPLY;   break;
            case Operation::DIVIDE:     exp.op[i] = DIVIDE;     break;
            case Operation::POWER:      exp.op[i] = POWER;      break;
            case Operation::NEGATE:     exp.op[i] = NEGATE;     break;
            case Operation::SQRT:       exp.op[i] = SQRT;       break;
            case Operation::EXP:        exp.op[i] = EXP;        break;
            case Operation::LOG:        exp.op[i] = LOG;        break;
            case Operation::SIN:        exp.op[i] = SIN;        break;
            case Operation::COS:        exp.op[i] = COS;        break;
            case Operation::SEC:        exp.op[i] = SEC;        break;
            case Operation::CSC:        exp.op[i] = CSC;        break;
            case Operation::TAN:        exp.op[i] = TAN;        break;
            case Operation::COT:        exp.op[i] = COT;        break;
            case Operation::ASIN:       exp.op[i] = ASIN;       break;
            case Operation::ACOS:       exp.op[i] = ACOS;       break;
            case Operation::ATAN:       exp.op[i] = ATAN;       break;
            case Operation::SINH:       exp.op[i] = SINH;       break;
            case Operation::COSH:       exp.op[i] = COSH;       break;
            case Operation::TANH:       exp.op[i] = TANH;       break;
            case Operation::ERF:        exp.op[i] = ERF;        break;
            case Operation::ERFC:       exp.op[i] = ERFC;       break;
            case Operation::STEP:       exp.op[i] = STEP;       break;
            case Operation::SQUARE:     exp.op[i] = SQUARE;     break;
            case Operation::CUBE:       exp.op[i] = CUBE;       break;
            case Operation::RECIPROCAL: exp.op[i] = RECIPROCAL; break;
            case Operation::ADD_CONSTANT:
                exp.op[i] = ADD_CONSTANT;
                exp.arg[i] = (float) dynamic_cast<const Operation::AddConstant*>(&op)->getValue();
                break;
            case Operation::MULTIPLY_CONSTANT:
                exp.op[i] = MULTIPLY_CONSTANT;
                exp.arg[i] = (float) dynamic_cast<const Operation::MultiplyConstant*>(&op)->getValue();
                break;
            case Operation::POWER_CONSTANT:
                exp.op[i] = POWER_CONSTANT;
                exp.arg[i] = (float) dynamic_cast<const Operation::PowerConstant*>(&op)->getValue();
                break;
            case Operation::MIN:        exp.op[i] = MIN;        break;
            case Operation::MAX:        exp.op[i] = MAX;        break;
            case Operation::ABS:        exp.op[i] = ABS;        break;
            default:
                // An operation with no opcode here would otherwise leave op[i] unset.
                throw OpenMMException("Unsupported operation in expression: "+expression);
        }
    }
    return exp;
}
// Loads the harmonic-bond tables: allocates the BondID and BondParameter
// streams, records them on the context, fills one entry per bond and uploads
// both to the device. Each bond also claims the next output-buffer slot of
// both of its atoms (first atom, then second).
extern "C"
void gpuSetBondParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& length, const vector<float>& k)
{
    const int numBonds = atom1.size();
    gpu->sim.bonds = numBonds;

    CUDAStream<int4>* idStream = new CUDAStream<int4>(numBonds, 1, "BondID");
    gpu->psBondID = idStream;
    gpu->sim.pBondID = idStream->_pDevStream[0];

    CUDAStream<float2>* paramStream = new CUDAStream<float2>(numBonds, 1, "BondParameter");
    gpu->psBondParameter = paramStream;
    gpu->sim.pBondParameter = paramStream->_pDevStream[0];

    for (int bond = 0; bond < numBonds; ++bond)
    {
        int4& id = idStream->_pSysData[bond];
        float2& param = paramStream->_pSysData[bond];

        id.x = atom1[bond];
        id.y = atom2[bond];
        param.x = length[bond];
        param.y = k[bond];

        // Claim one output-buffer slot on each atom, in x-then-y order.
        id.z = gpu->pOutputBufferCounter[id.x]++;
        id.w = gpu->pOutputBufferCounter[id.y]++;
#if (DUMP_PARAMETERS == 1)
        cout << bond << " "
             << id.x << " " << id.y << " " << id.z << " " << id.w << " "
             << param.x << " " << param.y << endl;
#endif
    }

    idStream->Upload();
    paramStream->Upload();
}
// Loads the harmonic-angle tables: allocates the BondAngleID1, BondAngleID2
// and BondAngleParameter streams, records them on the context, fills one entry
// per angle and uploads all three. ID1 holds the three atoms plus the first
// atom's output-buffer slot; ID2 holds the slots of the second and third atoms.
extern "C"
void gpuSetBondAngleParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3,
        const vector<float>& angle, const vector<float>& k)
{
    const int numAngles = atom1.size();
    gpu->sim.bond_angles = numAngles;

    CUDAStream<int4>* atomStream = new CUDAStream<int4>(numAngles, 1, "BondAngleID1");
    gpu->psBondAngleID1 = atomStream;
    gpu->sim.pBondAngleID1 = atomStream->_pDevStream[0];

    CUDAStream<int2>* bufferStream = new CUDAStream<int2>(numAngles, 1, "BondAngleID2");
    gpu->psBondAngleID2 = bufferStream;
    gpu->sim.pBondAngleID2 = bufferStream->_pDevStream[0];

    CUDAStream<float2>* paramStream = new CUDAStream<float2>(numAngles, 1, "BondAngleParameter");
    gpu->psBondAngleParameter = paramStream;
    gpu->sim.pBondAngleParameter = paramStream->_pDevStream[0];

    for (int a = 0; a < numAngles; ++a)
    {
        int4& atoms = atomStream->_pSysData[a];
        int2& buffers = bufferStream->_pSysData[a];
        float2& param = paramStream->_pSysData[a];

        atoms.x = atom1[a];
        atoms.y = atom2[a];
        atoms.z = atom3[a];
        param.x = angle[a];
        param.y = k[a];

        // Claim one output-buffer slot per atom, in x, y, z order.
        atoms.w = gpu->pOutputBufferCounter[atoms.x]++;
        buffers.x = gpu->pOutputBufferCounter[atoms.y]++;
        buffers.y = gpu->pOutputBufferCounter[atoms.z]++;
#if (DUMP_PARAMETERS == 1)
        cout << a << " "
             << atoms.x << " " << atoms.y << " " << atoms.z << " " << atoms.w << " "
             << buffers.x << " " << buffers.y << " "
             << param.x << " " << param.y << endl;
#endif
    }

    atomStream->Upload();
    bufferStream->Upload();
    paramStream->Upload();
}
// Loads the periodic-dihedral tables: allocates the DihedralID1, DihedralID2
// and DihedralParameter streams, records them on the context, fills one entry
// per dihedral and uploads all three. ID1 holds the four atoms, ID2 the
// output-buffer slot claimed on each of them. The parameter entry stores
// (k, phase, periodicity) in x, y, z; its w component is not written here.
extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
        const vector<float>& k, const vector<float>& phase, const vector<int>& periodicity)
{
    const int numDihedrals = atom1.size();
    gpu->sim.dihedrals = numDihedrals;

    CUDAStream<int4>* atomStream = new CUDAStream<int4>(numDihedrals, 1, "DihedralID1");
    gpu->psDihedralID1 = atomStream;
    gpu->sim.pDihedralID1 = atomStream->_pDevStream[0];

    CUDAStream<int4>* bufferStream = new CUDAStream<int4>(numDihedrals, 1, "DihedralID2");
    gpu->psDihedralID2 = bufferStream;
    gpu->sim.pDihedralID2 = bufferStream->_pDevStream[0];

    CUDAStream<float4>* paramStream = new CUDAStream<float4>(numDihedrals, 1, "DihedralParameter");
    gpu->psDihedralParameter = paramStream;
    gpu->sim.pDihedralParameter = paramStream->_pDevStream[0];

    for (int d = 0; d < numDihedrals; ++d)
    {
        int4& atoms = atomStream->_pSysData[d];
        int4& buffers = bufferStream->_pSysData[d];
        float4& param = paramStream->_pSysData[d];

        atoms.x = atom1[d];
        atoms.y = atom2[d];
        atoms.z = atom3[d];
        atoms.w = atom4[d];
        param.x = k[d];
        param.y = phase[d];
        param.z = (float) periodicity[d];

        // Claim one output-buffer slot per atom, in x, y, z, w order.
        buffers.x = gpu->pOutputBufferCounter[atoms.x]++;
        buffers.y = gpu->pOutputBufferCounter[atoms.y]++;
        buffers.z = gpu->pOutputBufferCounter[atoms.z]++;
        buffers.w = gpu->pOutputBufferCounter[atoms.w]++;
#if (DUMP_PARAMETERS == 1)
        cout << d << " "
             << atoms.x << " " << atoms.y << " " << atoms.z << " " << atoms.w << " "
             << buffers.x << " " << buffers.y << " " << buffers.z << " " << buffers.w << " "
             << param.x << " " << param.y << " " << param.z << endl;
#endif
    }

    atomStream->Upload();
    bufferStream->Upload();
    paramStream->Upload();
}
// Allocates the four RB-dihedral streams (atom indices, output-buffer slots and
// the six coefficients c0..c5 split across a float4 and a float2), fills them
// from the supplied vectors and uploads them to the device.
// One entry is created per element of atom1; all other vectors are indexed with
// the same position.
extern "C"
void gpuSetRbDihedralParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<int>& atom3, const vector<int>& atom4,
        const vector<float>& c0, const vector<float>& c1, const vector<float>& c2, const vector<float>& c3, const vector<float>& c4, const vector<float>& c5)
{
    const int numTorsions = atom1.size();
    gpu->sim.rb_dihedrals = numTorsions;

    // Atom indices of each torsion.
    CUDAStream<int4>* atomIds = new CUDAStream<int4>(numTorsions, 1, "RbDihedralID1");
    gpu->psRbDihedralID1 = atomIds;
    gpu->sim.pRbDihedralID1 = atomIds->_pDevStream[0];

    // Output-buffer slot assigned to each of the four atoms.
    CUDAStream<int4>* bufferIds = new CUDAStream<int4>(numTorsions, 1, "RbDihedralID2");
    gpu->psRbDihedralID2 = bufferIds;
    gpu->sim.pRbDihedralID2 = bufferIds->_pDevStream[0];

    // Coefficients c0..c3.
    CUDAStream<float4>* coeffsLow = new CUDAStream<float4>(numTorsions, 1, "RbDihedralParameter1");
    gpu->psRbDihedralParameter1 = coeffsLow;
    gpu->sim.pRbDihedralParameter1 = coeffsLow->_pDevStream[0];

    // Coefficients c4 and c5.
    CUDAStream<float2>* coeffsHigh = new CUDAStream<float2>(numTorsions, 1, "RbDihedralParameter2");
    gpu->psRbDihedralParameter2 = coeffsHigh;
    gpu->sim.pRbDihedralParameter2 = coeffsHigh->_pDevStream[0];

    for (int t = 0; t < numTorsions; ++t)
    {
        int4& ids = (*atomIds)[t];
        ids.x = atom1[t];
        ids.y = atom2[t];
        ids.z = atom3[t];
        ids.w = atom4[t];

        float4& low = (*coeffsLow)[t];
        low.x = c0[t];
        low.y = c1[t];
        low.z = c2[t];
        low.w = c3[t];

        float2& high = (*coeffsHigh)[t];
        high.x = c4[t];
        high.y = c5[t];

        // Take the current per-atom output-buffer counter for each of the four
        // atoms (in x, y, z, w order) and post-increment it.
        int4& slots = bufferIds->_pSysData[t];
        slots.x = gpu->pOutputBufferCounter[atomIds->_pSysData[t].x]++;
        slots.y = gpu->pOutputBufferCounter[atomIds->_pSysData[t].y]++;
        slots.z = gpu->pOutputBufferCounter[atomIds->_pSysData[t].z]++;
        slots.w = gpu->pOutputBufferCounter[atomIds->_pSysData[t].w]++;
#if (DUMP_PARAMETERS == 1)
        cout <<
            t << " " <<
            (*atomIds)[t].x << " " <<
            (*atomIds)[t].y << " " <<
            (*atomIds)[t].z << " " <<
            (*atomIds)[t].w << " " <<
            (*bufferIds)[t].x << " " <<
            (*bufferIds)[t].y << " " <<
            (*bufferIds)[t].z << " " <<
            (*bufferIds)[t].w << " " <<
            (*coeffsLow)[t].x << " " <<
            (*coeffsLow)[t].y << " " <<
            (*coeffsLow)[t].z << " " <<
            (*coeffsLow)[t].w << " " <<
            (*coeffsHigh)[t].x << " " <<
            (*coeffsHigh)[t].y <<
            endl;
#endif
    }

    atomIds->Upload();
    bufferIds->Upload();
    coeffsLow->Upload();
    coeffsHigh->Upload();
}
// Allocates and fills the LJ14 streams and uploads them to the device.
//
// For each pair i the ID stream receives (atom1, atom2, slot1, slot2), where the
// slots are the current values of gpu->pOutputBufferCounter for the two atoms
// (each post-incremented). The parameter stream receives:
//   x = c6^2/c12            (0 when c12 == 0)
//   y = (c12/c6)^(1/6)      (1 when c12 == 0)
//   z = epsfac*fudge*q1*q2
// The w component of the parameter stream is not written here.
extern "C"
void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const vector<int>& atom1, const vector<int>& atom2,
        const vector<float>& c6, const vector<float>& c12, const vector<float>& q1, const vector<float>& q2)
{
    int LJ14s = atom1.size();
    float scale = epsfac * fudge;

    gpu->sim.LJ14s = LJ14s;
    CUDAStream<int4>* psLJ14ID = new CUDAStream<int4>(LJ14s, 1, "LJ14ID");
    gpu->psLJ14ID = psLJ14ID;
    gpu->sim.pLJ14ID = psLJ14ID->_pDevStream[0];
    CUDAStream<float4>* psLJ14Parameter = new CUDAStream<float4>(LJ14s, 1, "LJ14Parameter");
    gpu->psLJ14Parameter = psLJ14Parameter;
    gpu->sim.pLJ14Parameter = psLJ14Parameter->_pDevStream[0];

    for (int i = 0; i < LJ14s; i++)
    {
        (*psLJ14ID)[i].x = atom1[i];
        (*psLJ14ID)[i].y = atom2[i];
        psLJ14ID->_pSysData[i].z = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].x]++;
        psLJ14ID->_pSysData[i].w = gpu->pOutputBufferCounter[psLJ14ID->_pSysData[i].y]++;
        float p0, p1, p2;
        if (c12[i] == 0.0f)
        {
            // Avoid dividing by zero when there is no c12 term.
            p0 = 0.0f;
            p1 = 1.0f;
        }
        else
        {
            // NOTE(review): c6 == 0 with c12 != 0 still divides by zero in p1 - confirm callers never pass that.
            p0 = c6[i] * c6[i] / c12[i];
            p1 = pow(c12[i] / c6[i], 1.0f / 6.0f);
        }
        p2 = scale * q1[i] * q2[i];
        (*psLJ14Parameter)[i].x = p0;
        (*psLJ14Parameter)[i].y = p1;
        (*psLJ14Parameter)[i].z = p2;
        // The dump must live inside the loop: i, p0, p1 and p2 are only in scope here.
#if (DUMP_PARAMETERS == 1)
        cout <<
            i << " " <<
            (*psLJ14ID)[i].x << " " <<
            (*psLJ14ID)[i].y << " " <<
            (*psLJ14ID)[i].z << " " <<
            (*psLJ14ID)[i].w << " " <<
            (*psLJ14Parameter)[i].x << " " <<
            (*psLJ14Parameter)[i].y << " " <<
            (*psLJ14Parameter)[i].z << " " <<
            p0 << " " <<
            p1 << " " <<
            p2 << " " <<
            endl;
#endif
    }

    psLJ14ID->Upload();
    psLJ14Parameter->Upload();
}
// Stores the exclusion lists on the context. If exclusions were already
// recorded, the new lists must match them (same number of atoms, same list
// length per atom, and every new entry present in the recorded list);
// otherwise an OpenMMException is thrown.
extern "C" void OPENMMCUDA_EXPORT setExclusions(gpuContext gpu, const vector<vector<int> >& exclusions) {
    if (gpu->exclusions.size() > 0) {
        bool identical = (exclusions.size() == gpu->exclusions.size());
        for (size_t atom = 0; identical && atom < exclusions.size(); ++atom) {
            const vector<int>& requested = exclusions[atom];
            if (requested.size() != gpu->exclusions[atom].size()) {
                identical = false;
                break;
            }
            // Every requested exclusion has to be found in the recorded list.
            for (size_t k = 0; k < requested.size(); ++k) {
                if (find(gpu->exclusions[atom].begin(), gpu->exclusions[atom].end(), requested[k]) == gpu->exclusions[atom].end()) {
                    identical = false;
                    break;
                }
            }
        }
        if (!identical)
            throw OpenMMException("All nonbonded forces must have identical sets of exceptions");
    }
    gpu->exclusions = exclusions;
}
// Records the nonbonded method and epsfac, registers the exclusions, and writes
// the per-atom charge (psPosq4.w) and the two values derived from c6/c12
// (psSigEps2.x/.y). Entries between gpu->natoms and the padded atom count are
// filled with placeholder values before both streams are uploaded.
extern "C"
void gpuSetCoulombParameters(gpuContext gpu, float epsfac, const vector<int>& atom, const vector<float>& c6, const vector<float>& c12, const vector<float>& q,
        const vector<char>& symbol, const vector<vector<int> >& exclusions, CudaNonbondedMethod method)
{
    const unsigned int numAtoms = c6.size();
    gpu->sim.epsfac = epsfac;
    gpu->sim.nonbondedMethod = method;
    if (numAtoms > 0)
        setExclusions(gpu, exclusions);

    const bool haveSymbols = (symbol.size() > 0);
    for (unsigned int a = 0; a < numAtoms; ++a)
    {
        // Defaults used unless both c6 and c12 are strictly positive.
        float sigValue = 0.5f;
        float epsValue = 0.0f;
        if ((c6[a] > 0.0f) && (c12[a] > 0.0f))
        {
            sigValue = 0.5f * pow(c12[a] / c6[a], 1.0f / 6.0f);
            epsValue = c6[a] * sqrt(1.0f / c12[a]);
        }
        if (haveSymbols)
            gpu->pAtomSymbol[a] = symbol[a];
        (*gpu->psPosq4)[a].w = q[a];
        (*gpu->psSigEps2)[a].x = sigValue;
        (*gpu->psSigEps2)[a].y = epsValue;
    }

    // Dummy out extra atom data
    unsigned int pad = gpu->natoms;
    while (pad < gpu->sim.paddedNumberOfAtoms)
    {
        const float coordinate = 100000.0f + pad * 10.0f;
        (*gpu->psPosq4)[pad].x = coordinate;
        (*gpu->psPosq4)[pad].y = coordinate;
        (*gpu->psPosq4)[pad].z = coordinate;
        (*gpu->psPosq4)[pad].w = 0.0f;
        (*gpu->psSigEps2)[pad].x = 0.0f;
        (*gpu->psSigEps2)[pad].y = 0.0f;
        ++pad;
    }

    gpu->psPosq4->Upload();
    gpu->psSigEps2->Upload();
}
// Stores the nonbonded cutoff, its square, and the two reaction-field
// constants derived from the cutoff and the solvent dielectric.
// Throws OpenMMException if a different non-zero cutoff was already stored.
extern "C"
void gpuSetNonbondedCutoff(gpuContext gpu, float cutoffDistance, float solventDielectric)
{
    const bool cutoffAlreadySet = (gpu->sim.nonbondedCutoff != 0.0f);
    if (cutoffAlreadySet && gpu->sim.nonbondedCutoff != cutoffDistance)
        throw OpenMMException("All nonbonded forces must use the same cutoff");

    gpu->sim.nonbondedCutoff = cutoffDistance;
    gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;

    // Both constants share the denominator (2*eps + 1).
    const float denominator = 2.0f*solventDielectric+1.0f;
    gpu->sim.reactionFieldK = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/denominator;
    gpu->sim.reactionFieldC = (1.0f / cutoffDistance)*(3.0f*solventDielectric)/denominator;
}
// Registers (or replaces) the tabulated function stored in slot `index`.
//
// The function is sampled at values.size() evenly spaced points between min and
// max. A natural spline is fitted through the samples and, for each of the
// values.size()-1 intervals, a float4 of (value[i], value[i+1], deriv[i]/6,
// deriv[i+1]/6) is uploaded to the device.
//
// Throws OpenMMException if index is outside [0, MAX_TABULATED_FUNCTIONS) or if
// fewer than two values are supplied (no interval can be formed, and the grid
// spacing would divide by zero).
extern "C"
void gpuSetTabulatedFunction(gpuContext gpu, int index, const string& name, const vector<double>& values, double min, double max)
{
    if (index < 0 || index >= MAX_TABULATED_FUNCTIONS) {
        stringstream str;
        str << "Only " << MAX_TABULATED_FUNCTIONS << " tabulated functions are supported";
        throw OpenMMException(str.str());
    }
    // Validate before touching any state so a bad call leaves the slot intact.
    const int numValues = (int) values.size();
    if (numValues < 2)
        throw OpenMMException("A tabulated function must contain at least two values");

    if (gpu->tabulatedFunctions[index].coefficients != NULL) {
        delete gpu->tabulatedFunctions[index].coefficients;
        // Do not leave a dangling pointer behind if the allocation below throws.
        gpu->tabulatedFunctions[index].coefficients = NULL;
    }
    CUDAStream<float4>* coeff = new CUDAStream<float4>(numValues-1, 1, "TabulatedFunction");
    gpu->tabulatedFunctions[index].coefficients = coeff;
    gpu->sim.pTabulatedFunctionCoefficients[index] = coeff->_pDevData;
    gpu->tabulatedFunctions[index].name = name;
    gpu->tabulatedFunctions[index].min = min;
    gpu->tabulatedFunctions[index].max = max;
    gpu->tabulatedFunctionsChanged = true;

    // Compute the spline coefficients.

    vector<double> x(numValues), derivs;
    for (int i = 0; i < numValues; i++)
        x[i] = min+i*(max-min)/(numValues-1);
    OpenMM::SplineFitter::createNaturalSpline(x, values, derivs);
    for (int i = 0; i < numValues-1; i++)
        (*coeff)[i] = make_float4((float) values[i], (float) values[i+1], (float) (derivs[i]/6.0), (float) (derivs[i+1]/6.0));
    coeff->Upload();
}
// Sets up the single supported CustomBondForce: allocates and uploads the bond
// ID and per-bond parameter streams, raises gpu->pOutputBufferCounter where this
// force needs more buffers per atom, and builds the energy and force
// expressions (the force is the derivative of the energy with respect to "r").
// Throws OpenMMException for more than four per-bond parameters, more than
// eight global parameters, or a second call on the same context.
extern "C"
void gpuSetCustomBondParameters(gpuContext gpu, const vector<int>& bondAtom1, const vector<int>& bondAtom2, const vector<vector<double> >& bondParams,
            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
{
    if (paramNames.size() > 4)
        throw OpenMMException("CudaPlatform only supports four per-bond parameters for custom bond forces");
    if (globalParamNames.size() > 8)
        throw OpenMMException("CudaPlatform only supports eight global parameters for custom bond forces");
    if (gpu->psCustomBondID != NULL)
        throw OpenMMException("CudaPlatform only supports a single CustomBondForce per System");

    const int numBonds = (int) bondAtom1.size();
    gpu->sim.customBonds = bondAtom1.size();
    gpu->sim.customBondParameters = paramNames.size();
    gpu->psCustomBondID = new CUDAStream<int4>(gpu->sim.customBonds, 1, "CustomBondId");
    gpu->sim.pCustomBondID = gpu->psCustomBondID->_pDevData;
    gpu->psCustomBondParams = new CUDAStream<float4>(gpu->sim.customBonds, 1, "CustomBondParams");
    gpu->sim.pCustomBondParams = gpu->psCustomBondParams->_pDevData;

    // Number of force buffers this force alone needs for each atom.
    vector<int> buffersUsed(gpu->natoms, 0);
    for (int bond = 0; bond < numBonds; ++bond) {
        int4& ids = (*gpu->psCustomBondID)[bond];
        ids.x = bondAtom1[bond];
        ids.y = bondAtom2[bond];
        ids.z = buffersUsed[bondAtom1[bond]]++;
        ids.w = buffersUsed[bondAtom2[bond]]++;

        // Copy up to four parameters; components without a value are left untouched.
        const vector<double>& values = bondParams[bond];
        const size_t numValues = values.size();
        float4& packed = (*gpu->psCustomBondParams)[bond];
        if (numValues > 0)
            packed.x = (float) values[0];
        if (numValues > 1)
            packed.y = (float) values[1];
        if (numValues > 2)
            packed.z = (float) values[2];
        if (numValues > 3)
            packed.w = (float) values[3];
    }
    gpu->psCustomBondID->Upload();
    gpu->psCustomBondParams->Upload();

    // Grow the shared per-atom buffer count where this force needs more.
    for (int atom = 0; atom < (int) buffersUsed.size(); ++atom) {
        if (buffersUsed[atom] > (int) gpu->pOutputBufferCounter[atom])
            gpu->pOutputBufferCounter[atom] = buffersUsed[atom];
    }

    // Create the Expressions.

    vector<string> variables(1, "r");
    for (size_t p = 0; p < paramNames.size(); ++p)
        variables.push_back(paramNames[p]);
    SetCustomBondEnergyExpression(createExpression<256>(
            gpu, energyExp,
            Lepton::Parser::parse(energyExp).optimize().createProgram(),
            variables, globalParamNames, gpu->sim.customExpressionStackSize));
    SetCustomBondForceExpression(createExpression<256>(
            gpu, energyExp,
            Lepton::Parser::parse(energyExp).differentiate("r").optimize().createProgram(),
            variables, globalParamNames, gpu->sim.customExpressionStackSize));
}
// Sets up the single supported CustomAngleForce: allocates and uploads the two
// angle ID streams and the per-angle parameter stream, raises
// gpu->pOutputBufferCounter where this force needs more buffers per atom, and
// builds the energy and force expressions (the force is the derivative of the
// energy with respect to "theta").
// Throws OpenMMException for more than four per-angle parameters, more than
// eight global parameters, or a second call on the same context.
extern "C"
void gpuSetCustomAngleParameters(gpuContext gpu, const vector<int>& angleAtom1, const vector<int>& angleAtom2, const vector<int>& angleAtom3, const vector<vector<double> >& angleParams,
            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
{
    if (paramNames.size() > 4)
        throw OpenMMException("CudaPlatform only supports four per-angle parameters for custom angle forces");
    if (globalParamNames.size() > 8)
        throw OpenMMException("CudaPlatform only supports eight global parameters for custom angle forces");
    if (gpu->psCustomAngleID1 != NULL)
        throw OpenMMException("CudaPlatform only supports a single CustomAngleForce per System");

    const int numAngles = (int) angleAtom1.size();
    gpu->sim.customAngles = angleAtom1.size();
    gpu->sim.customAngleParameters = paramNames.size();
    gpu->psCustomAngleID1 = new CUDAStream<int4>(gpu->sim.customAngles, 1, "CustomAngleId1");
    gpu->sim.pCustomAngleID1 = gpu->psCustomAngleID1->_pDevData;
    gpu->psCustomAngleID2 = new CUDAStream<int2>(gpu->sim.customAngles, 1, "CustomAngleId2");
    gpu->sim.pCustomAngleID2 = gpu->psCustomAngleID2->_pDevData;
    gpu->psCustomAngleParams = new CUDAStream<float4>(gpu->sim.customAngles, 1, "CustomAngleParams");
    gpu->sim.pCustomAngleParams = gpu->psCustomAngleParams->_pDevData;

    // Number of force buffers this force alone needs for each atom.
    vector<int> buffersUsed(gpu->natoms, 0);
    for (int angle = 0; angle < numAngles; ++angle) {
        // ID1 holds the three atoms plus the buffer slot of the first atom;
        // ID2 holds the buffer slots of the second and third atoms.
        int4& first = (*gpu->psCustomAngleID1)[angle];
        int2& second = (*gpu->psCustomAngleID2)[angle];
        first.x = angleAtom1[angle];
        first.y = angleAtom2[angle];
        first.z = angleAtom3[angle];
        first.w = buffersUsed[angleAtom1[angle]]++;
        second.x = buffersUsed[angleAtom2[angle]]++;
        second.y = buffersUsed[angleAtom3[angle]]++;

        // Copy up to four parameters; components without a value are left untouched.
        const vector<double>& values = angleParams[angle];
        const size_t numValues = values.size();
        float4& packed = (*gpu->psCustomAngleParams)[angle];
        if (numValues > 0)
            packed.x = (float) values[0];
        if (numValues > 1)
            packed.y = (float) values[1];
        if (numValues > 2)
            packed.z = (float) values[2];
        if (numValues > 3)
            packed.w = (float) values[3];
    }
    gpu->psCustomAngleID1->Upload();
    gpu->psCustomAngleID2->Upload();
    gpu->psCustomAngleParams->Upload();

    // Grow the shared per-atom buffer count where this force needs more.
    for (int atom = 0; atom < (int) buffersUsed.size(); ++atom) {
        if (buffersUsed[atom] > (int) gpu->pOutputBufferCounter[atom])
            gpu->pOutputBufferCounter[atom] = buffersUsed[atom];
    }

    // Create the Expressions.

    vector<string> variables(1, "theta");
    for (size_t p = 0; p < paramNames.size(); ++p)
        variables.push_back(paramNames[p]);
    SetCustomAngleEnergyExpression(createExpression<256>(
            gpu, energyExp,
            Lepton::Parser::parse(energyExp).optimize().createProgram(),
            variables, globalParamNames, gpu->sim.customExpressionStackSize));
    SetCustomAngleForceExpression(createExpression<256>(
            gpu, energyExp,
            Lepton::Parser::parse(energyExp).differentiate("theta").optimize().createProgram(),
            variables, globalParamNames, gpu->sim.customExpressionStackSize));
}
// Sets up the single supported CustomTorsionForce: allocates and uploads the
// atom ID stream, the buffer-slot stream and the per-torsion parameter stream,
// raises gpu->pOutputBufferCounter where this force needs more buffers per
// atom, and builds the energy and force expressions (the force is the
// derivative of the energy with respect to "theta").
// Throws OpenMMException for more than four per-torsion parameters, more than
// eight global parameters, or a second call on the same context.
extern "C"
void gpuSetCustomTorsionParameters(gpuContext gpu, const vector<int>& torsionAtom1, const vector<int>& torsionAtom2, const vector<int>& torsionAtom3, const vector<int>& torsionAtom4, const vector<vector<double> >& torsionParams,
            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
{
    if (paramNames.size() > 4)
        throw OpenMMException("CudaPlatform only supports four per-torsion parameters for custom torsion forces");
    if (globalParamNames.size() > 8)
        throw OpenMMException("CudaPlatform only supports eight global parameters for custom torsion forces");
    if (gpu->psCustomTorsionID1 != NULL)
        throw OpenMMException("CudaPlatform only supports a single CustomTorsionForce per System");

    const int numTorsions = (int) torsionAtom1.size();
    gpu->sim.customTorsions = torsionAtom1.size();
    gpu->sim.customTorsionParameters = paramNames.size();
    gpu->psCustomTorsionID1 = new CUDAStream<int4>(gpu->sim.customTorsions, 1, "CustomTorsionId1");
    gpu->sim.pCustomTorsionID1 = gpu->psCustomTorsionID1->_pDevData;
    gpu->psCustomTorsionID2 = new CUDAStream<int4>(gpu->sim.customTorsions, 1, "CustomTorsionId2");
    gpu->sim.pCustomTorsionID2 = gpu->psCustomTorsionID2->_pDevData;
    gpu->psCustomTorsionParams = new CUDAStream<float4>(gpu->sim.customTorsions, 1, "CustomTorsionParams");
    gpu->sim.pCustomTorsionParams = gpu->psCustomTorsionParams->_pDevData;

    // Number of force buffers this force alone needs for each atom.
    vector<int> buffersUsed(gpu->natoms, 0);
    for (int torsion = 0; torsion < numTorsions; ++torsion) {
        int4& atoms = (*gpu->psCustomTorsionID1)[torsion];
        atoms.x = torsionAtom1[torsion];
        atoms.y = torsionAtom2[torsion];
        atoms.z = torsionAtom3[torsion];
        atoms.w = torsionAtom4[torsion];

        // Buffer slots are claimed in atom1..atom4 order.
        int4& slots = (*gpu->psCustomTorsionID2)[torsion];
        slots.x = buffersUsed[torsionAtom1[torsion]]++;
        slots.y = buffersUsed[torsionAtom2[torsion]]++;
        slots.z = buffersUsed[torsionAtom3[torsion]]++;
        slots.w = buffersUsed[torsionAtom4[torsion]]++;

        // Copy up to four parameters; components without a value are left untouched.
        const vector<double>& values = torsionParams[torsion];
        const size_t numValues = values.size();
        float4& packed = (*gpu->psCustomTorsionParams)[torsion];
        if (numValues > 0)
            packed.x = (float) values[0];
        if (numValues > 1)
            packed.y = (float) values[1];
        if (numValues > 2)
            packed.z = (float) values[2];
        if (numValues > 3)
            packed.w = (float) values[3];
    }
    gpu->psCustomTorsionID1->Upload();
    gpu->psCustomTorsionID2->Upload();
    gpu->psCustomTorsionParams->Upload();

    // Grow the shared per-atom buffer count where this force needs more.
    for (int atom = 0; atom < (int) buffersUsed.size(); ++atom) {
        if (buffersUsed[atom] > (int) gpu->pOutputBufferCounter[atom])
            gpu->pOutputBufferCounter[atom] = buffersUsed[atom];
    }

    // Create the Expressions.

    vector<string> variables(1, "theta");
    for (size_t p = 0; p < paramNames.size(); ++p)
        variables.push_back(paramNames[p]);
    SetCustomTorsionEnergyExpression(createExpression<256>(
            gpu, energyExp,
            Lepton::Parser::parse(energyExp).optimize().createProgram(),
            variables, globalParamNames, gpu->sim.customExpressionStackSize));
    SetCustomTorsionForceExpression(createExpression<256>(
            gpu, energyExp,
            Lepton::Parser::parse(energyExp).differentiate("theta").optimize().createProgram(),
            variables, globalParamNames, gpu->sim.customExpressionStackSize));
}
// Sets up the single supported CustomExternalForce: allocates and uploads the
// atom index stream and the per-atom parameter stream, then builds the energy
// expression and three force expressions (derivatives of the energy with
// respect to "x", "y" and "z").
// Throws OpenMMException for more than four per-particle parameters, more than
// eight global parameters, or a second call on the same context.
extern "C"
void gpuSetCustomExternalParameters(gpuContext gpu, const vector<int>& atomIndex, const vector<vector<double> >& atomParams,
            const string& energyExp, const vector<string>& paramNames, const vector<string>& globalParamNames)
{
    if (paramNames.size() > 4)
        throw OpenMMException("CudaPlatform only supports four per-particle parameters for custom external forces");
    if (globalParamNames.size() > 8)
        throw OpenMMException("CudaPlatform only supports eight global parameters for custom external forces");
    if (gpu->psCustomExternalID != NULL)
        throw OpenMMException("CudaPlatform only supports a single CustomExternalForce per System");

    const int numEntries = (int) atomIndex.size();
    gpu->sim.customExternals = atomIndex.size();
    gpu->sim.customExternalParameters = paramNames.size();
    gpu->psCustomExternalID = new CUDAStream<int>(gpu->sim.customExternals, 1, "CustomExternalId");
    gpu->sim.pCustomExternalID = gpu->psCustomExternalID->_pDevData;
    gpu->psCustomExternalParams = new CUDAStream<float4>(gpu->sim.customExternals, 1, "CustomExternalParams");
    gpu->sim.pCustomExternalParams = gpu->psCustomExternalParams->_pDevData;

    for (int entry = 0; entry < numEntries; ++entry) {
        (*gpu->psCustomExternalID)[entry] = atomIndex[entry];

        // Copy up to four parameters; components without a value are left untouched.
        const vector<double>& values = atomParams[entry];
        const size_t numValues = values.size();
        float4& packed = (*gpu->psCustomExternalParams)[entry];
        if (numValues > 0)
            packed.x = (float) values[0];
        if (numValues > 1)
            packed.y = (float) values[1];
        if (numValues > 2)
            packed.z = (float) values[2];
        if (numValues > 3)
            packed.w = (float) values[3];
    }
    gpu->psCustomExternalID->Upload();
    gpu->psCustomExternalParams->Upload();

    // Create the Expressions.

    vector<string> variables;
    variables.push_back("x");
    variables.push_back("y");
    variables.push_back("z");
    for (size_t p = 0; p < paramNames.size(); ++p)
        variables.push_back(paramNames[p]);
    SetCustomExternalEnergyExpression(createExpression<256>(
            gpu, energyExp,
            Lepton::Parser::parse(energyExp).optimize().createProgram(),
            variables, globalParamNames, gpu->sim.customExpressionStackSize));
    SetCustomExternalForceExpressions(
            createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("x").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize),
            createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("y").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize),
            createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp).differentiate("z").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
}
// Sets up the custom nonbonded force: records the cutoff and method, registers
// the exclusions, uploads the per-atom parameters and the range information of
// every tabulated function registered so far, and builds the energy and force
// expressions (the force is the derivative of the energy with respect to "r").
//
// The expression variables are the per-atom parameter names suffixed with "1"
// and then "2", each group padded with empty names to four entries, followed
// by "r".
//
// Throws OpenMMException if a different non-zero cutoff was already stored, or
// for more than four per-atom parameters or more than eight global parameters.
extern "C"
void gpuSetCustomNonbondedParameters(gpuContext gpu, const vector<vector<double> >& parameters, const vector<vector<int> >& exclusions,
            CudaNonbondedMethod method, float cutoffDistance, const string& energyExp,
            const vector<string>& paramNames, const vector<string>& globalParamNames)
{
    if (gpu->sim.nonbondedCutoff != 0.0f && gpu->sim.nonbondedCutoff != cutoffDistance)
        throw OpenMMException("All nonbonded forces must use the same cutoff");
    if (paramNames.size() > 4)
        throw OpenMMException("CudaPlatform only supports four per-atom parameters for custom nonbonded forces");
    if (globalParamNames.size() > 8)
        throw OpenMMException("CudaPlatform only supports eight global parameters for custom nonbonded forces");
    gpu->sim.nonbondedCutoff = cutoffDistance;
    gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
    gpu->sim.customNonbondedMethod = method;
    gpu->sim.customParameters = paramNames.size();
    setExclusions(gpu, exclusions);
    gpu->psCustomParams = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "CustomParams");
    gpu->sim.pCustomParams = gpu->psCustomParams->_pDevData;
    for (int i = 0; i < (int) parameters.size(); i++) {
        // Copy up to four parameters; components without a value are left untouched.
        if (parameters[i].size() > 0)
            (*gpu->psCustomParams)[i].x = (float) parameters[i][0];
        if (parameters[i].size() > 1)
            (*gpu->psCustomParams)[i].y = (float) parameters[i][1];
        if (parameters[i].size() > 2)
            (*gpu->psCustomParams)[i].z = (float) parameters[i][2];
        if (parameters[i].size() > 3)
            (*gpu->psCustomParams)[i].w = (float) parameters[i][3];
    }
    gpu->psCustomParams->Upload();

    // This class serves as a placeholder for custom functions in expressions.

    class FunctionPlaceholder : public Lepton::CustomFunction {
    public:
        int getNumArguments() const {
            return 1;
        }
        double evaluate(const double* arguments) const {
            return 0.0;
        }
        double evaluateDerivative(const double* arguments, const int* derivOrder) const {
            return 0.0;
        }
        CustomFunction* clone() const {
            return new FunctionPlaceholder();
        }
    };

    // Record the tabulated functions, which were previously set with calls to gpuSetTabulatedFunction().

    // The placeholder lives on the stack so it is released even when parsing or
    // expression creation below throws. It is only referenced through the
    // `functions` map, which does not outlive this function.
    FunctionPlaceholder placeholder;
    map<string, Lepton::CustomFunction*> functions;
    gpu->psTabulatedFunctionParams = new CUDAStream<float4>(MAX_TABULATED_FUNCTIONS, 1, "TabulatedFunctionRange");
    gpu->sim.pTabulatedFunctionParams = gpu->psTabulatedFunctionParams->_pDevData;
    for (int i = 0; i < MAX_TABULATED_FUNCTIONS; i++) {
        gpuTabulatedFunction& func = gpu->tabulatedFunctions[i];
        if (func.coefficients != NULL) {
            // (min, max, intervals per unit of the argument, index of the last interval)
            (*gpu->psTabulatedFunctionParams)[i] = make_float4((float) func.min, (float) func.max, (float) (func.coefficients->_length/(func.max-func.min)), (float) (func.coefficients->_length-1));
            functions[func.name] = &placeholder;
        }
    }
    gpu->psTabulatedFunctionParams->Upload();

    // Create the Expressions.

    vector<string> variables;
    for (int j = 1; j < 3; j++) {
        for (int i = 0; i < (int) paramNames.size(); i++) {
            stringstream name;
            name << paramNames[i] << j;
            variables.push_back(name.str());
        }
        // Pad each atom's group to four names so the second group always starts at index 4.
        for (int i = paramNames.size(); i < 4; i++)
            variables.push_back("");
    }
    variables.push_back("r");
    SetCustomNonbondedEnergyExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp, functions).optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
    SetCustomNonbondedForceExpression(createExpression<256>(gpu, energyExp, Lepton::Parser::parse(energyExp, functions).differentiate("r").optimize().createProgram(), variables, globalParamNames, gpu->sim.customExpressionStackSize));
}
// Builds a 2048-entry table of erfc() sampled at evenly spaced arguments from 0
// up to (but not including) alphaEwald*nonbondedCutoff, stores the table size
// and scale on gpu->sim, and uploads the table to the device.
static void tabulateErfc(gpuContext gpu)
{
    const int numEntries = 2048;
    gpu->sim.tabulatedErfcSize = numEntries;
    gpu->sim.tabulatedErfcScale = numEntries/(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff);

    CUDAStream<float>* table = new CUDAStream<float>(numEntries, 1, "TabulatedErfc");
    gpu->psTabulatedErfc = table;
    gpu->sim.pTabulatedErfc = table->_pDevData;

    int entry = 0;
    while (entry < numEntries)
    {
        (*table)[entry] = (float) erfc(entry*(gpu->sim.alphaEwald*gpu->sim.nonbondedCutoff)/numEntries);
        ++entry;
    }
    table->Upload();
}
// Stores the Ewald parameters (alpha, the derived factor -1/(4*alpha^2) and the
// three kmax values), allocates the cos/sin sum stream with
// (2*kmaxX-1)*(2*kmaxY-1)*(2*kmaxZ-1) entries, and tabulates erfc().
extern "C"
void gpuSetEwaldParameters(gpuContext gpu, float alpha, int kmaxx, int kmaxy, int kmaxz)
{
    // Wave-vector limits.
    gpu->sim.kmaxX = kmaxx;
    gpu->sim.kmaxY = kmaxy;
    gpu->sim.kmaxZ = kmaxz;

    // Splitting parameter and the factor derived from it.
    gpu->sim.alphaEwald = alpha;
    gpu->sim.factorEwald = -1 / (4*alpha*alpha);

    gpu->psEwaldCosSinSum = new CUDAStream<float2>(
            (gpu->sim.kmaxX*2-1) * (gpu->sim.kmaxY*2-1) * (gpu->sim.kmaxZ*2-1),
            1, "EwaldCosSinSum");
    gpu->sim.pEwaldCosSinSum = gpu->psEwaldCosSinSum->_pDevStream[0];

    // Needs alphaEwald (set above) and the previously stored nonbonded cutoff.
    tabulateErfc(gpu);
}
// Configures PME: stores alpha and the grid/group sizes, creates the 3D
// complex-to-complex cuFFT plan, allocates the grid, B-spline and atom-range
// streams, tabulates erfc(), and computes and uploads the B-spline moduli for
// each of the three grid dimensions.
//
// Throws OpenMMException if the cuFFT plan cannot be created.
extern "C"
void gpuSetPMEParameters(gpuContext gpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ)
{
    gpu->sim.alphaEwald = alpha;
    int3 gridSize = make_int3(gridSizeX, gridSizeY, gridSizeZ);
    gpu->sim.pmeGridSize = gridSize;
    int3 groupSize = make_int3(2, 4, 4);
    gpu->sim.pmeGroupSize = groupSize;
    // A failed plan would only surface later as a broken FFT, so report it here.
    if (cufftPlan3d(&gpu->fftplan, gridSize.x, gridSize.y, gridSize.z, CUFFT_C2C) != CUFFT_SUCCESS)
        throw OpenMMException("Error creating the cuFFT plan for PME");
    gpu->psPmeGrid = new CUDAStream<cufftComplex>(gridSize.x*gridSize.y*gridSize.z, 1, "PmeGrid");
    gpu->sim.pPmeGrid = gpu->psPmeGrid->_pDevData;
    gpu->psPmeBsplineModuli[0] = new CUDAStream<float>(gridSize.x, 1, "PmeBsplineModuli0");
    gpu->sim.pPmeBsplineModuli[0] = gpu->psPmeBsplineModuli[0]->_pDevData;
    gpu->psPmeBsplineModuli[1] = new CUDAStream<float>(gridSize.y, 1, "PmeBsplineModuli1");
    gpu->sim.pPmeBsplineModuli[1] = gpu->psPmeBsplineModuli[1]->_pDevData;
    gpu->psPmeBsplineModuli[2] = new CUDAStream<float>(gridSize.z, 1, "PmeBsplineModuli2");
    gpu->sim.pPmeBsplineModuli[2] = gpu->psPmeBsplineModuli[2]->_pDevData;
    gpu->psPmeBsplineTheta = new CUDAStream<float4>(PME_ORDER*gpu->natoms, 1, "PmeBsplineTheta");
    gpu->sim.pPmeBsplineTheta = gpu->psPmeBsplineTheta->_pDevData;
    gpu->psPmeBsplineDtheta = new CUDAStream<float4>(PME_ORDER*gpu->natoms, 1, "PmeBsplineDtheta");
    gpu->sim.pPmeBsplineDtheta = gpu->psPmeBsplineDtheta->_pDevData;
    gpu->psPmeAtomRange = new CUDAStream<int>(gridSize.x*gridSize.y*gridSize.z+1, 1, "PmeAtomRange");
    gpu->sim.pPmeAtomRange = gpu->psPmeAtomRange->_pDevData;
    gpu->psPmeAtomGridIndex = new CUDAStream<int2>(gpu->natoms, 1, "PmeAtomGridIndex");
    gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;
    tabulateErfc(gpu);

    // Initialize the b-spline moduli.

    int maxSize = max(max(gridSize.x, gridSize.y), gridSize.z);
    // bsplines_data is written at indices 1..PME_ORDER below, so it needs at
    // least PME_ORDER+1 entries even when every grid dimension is smaller.
    int splineSize = maxSize;
    if (splineSize < PME_ORDER+1)
        splineSize = PME_ORDER+1;
    vector<double> data(PME_ORDER);
    vector<double> bsplines_data(splineSize, 0.0);
    data[PME_ORDER-1] = 0.0;
    data[1] = 0.0;
    data[0] = 1.0;
    // Build the spline weights order by order, up to order PME_ORDER-1.
    for (int i = 3; i < PME_ORDER; i++)
    {
        double div = 1.0/(i-1.0);
        data[i-1] = 0.0;
        for (int j = 1; j < (i-1); j++)
            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
        data[0] = div*data[0];
    }

    // Final recursion step, giving the weights of order PME_ORDER.

    double div = 1.0/(PME_ORDER-1);
    data[PME_ORDER-1] = 0.0;
    for (int i = 1; i < (PME_ORDER-1); i++)
        data[PME_ORDER-i-1] = div*(i*data[PME_ORDER-i-2]+(PME_ORDER-i)*data[PME_ORDER-i-1]);
    data[0] = div*data[0];
    for (int i = 1; i <= PME_ORDER; i++)
        bsplines_data[i] = data[i-1];

    // Evaluate the actual bspline moduli for X/Y/Z.

    for(int dim = 0; dim < 3; dim++)
    {
        int ndata = (dim == 0 ? gridSize.x : dim == 1 ? gridSize.y : gridSize.z);
        // Squared magnitude of the discrete Fourier transform of the weights.
        for (int i = 0; i < ndata; i++)
        {
            double sc = 0.0;
            double ss = 0.0;
            for (int j = 0; j < ndata; j++)
            {
                double arg = (2.0*M_PI*i*j)/ndata;
                sc += bsplines_data[j]*cos(arg);
                ss += bsplines_data[j]*sin(arg);
            }
            (*gpu->psPmeBsplineModuli[dim])[i] = (float) (sc*sc+ss*ss);
        }
        // Replace near-zero moduli with the average of their neighbours. The
        // neighbour indices wrap around so the first and last entries never
        // read outside the array.
        for (int i = 0; i < ndata; i++)
        {
            if ((*gpu->psPmeBsplineModuli[dim])[i] < 1.0e-7)
            {
                const int prev = (i+ndata-1)%ndata;
                const int next = (i+1)%ndata;
                (*gpu->psPmeBsplineModuli[dim])[i] = ((*gpu->psPmeBsplineModuli[dim])[prev]+(*gpu->psPmeBsplineModuli[dim])[next])*0.5f;
            }
        }
        gpu->psPmeBsplineModuli[dim]->Upload();
    }
}
// Stores the periodic box dimensions together with the values derived from
// them: the inverse of each size, 2*PI divided by each size, and the box volume.
extern "C"
void gpuSetPeriodicBoxSize(gpuContext gpu, float xsize, float ysize, float zsize)
{
    // X axis
    gpu->sim.periodicBoxSizeX = xsize;
    gpu->sim.invPeriodicBoxSizeX = 1.0f/xsize;
    gpu->sim.recipBoxSizeX = 2.0f*PI/gpu->sim.periodicBoxSizeX;

    // Y axis
    gpu->sim.periodicBoxSizeY = ysize;
    gpu->sim.invPeriodicBoxSizeY = 1.0f/ysize;
    gpu->sim.recipBoxSizeY = 2.0f*PI/gpu->sim.periodicBoxSizeY;

    // Z axis
    gpu->sim.periodicBoxSizeZ = zsize;
    gpu->sim.invPeriodicBoxSizeZ = 1.0f/zsize;
    gpu->sim.recipBoxSizeZ = 2.0f*PI/gpu->sim.periodicBoxSizeZ;

    gpu->sim.cellVolume = gpu->sim.periodicBoxSizeX*gpu->sim.periodicBoxSizeY*gpu->sim.periodicBoxSizeZ;
}
// Enables GBSA and writes the per-atom OBC data: x = radius - dielectricOffset,
// y = scale * x, and the charge into psPosq4.w. Padding entries beyond the real
// atoms get placeholder values. Uploads the Born radii, OBC data and posq
// streams and stores the prefactor computed from the two dielectrics.
extern "C"
void gpuSetObcParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<float>& radius, const vector<float>& scale, const vector<float>& charge)
{
    const unsigned int numAtoms = radius.size();
    gpu->bIncludeGBSA = true;

    unsigned int a = 0;
    while (a < numAtoms)
    {
        (*gpu->psObcData)[a].x = radius[a] - dielectricOffset;
        // The scaled value is computed from the offset radius stored just above.
        (*gpu->psObcData)[a].y = scale[a] * (*gpu->psObcData)[a].x;
        (*gpu->psPosq4)[a].w = charge[a];
#if (DUMP_PARAMETERS == 1)
        cout <<
            a << " " <<
            (*gpu->psObcData)[a].x << " " <<
            (*gpu->psObcData)[a].y;
#endif
        ++a;
    }

    // Dummy out extra atom data
    for (unsigned int pad = numAtoms; pad < gpu->sim.paddedNumberOfAtoms; ++pad)
    {
        (*gpu->psBornRadii)[pad] = 0.2f;
        (*gpu->psObcData)[pad].x = 0.01f;
        (*gpu->psObcData)[pad].y = 0.01f;
    }

    gpu->psBornRadii->Upload();
    gpu->psObcData->Upload();
    gpu->psPosq4->Upload();
    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
}
// Enables GB/VI and writes the per-atom data: x = radius, y = scaled radius,
// z = tau*gamma with tau = 1/innerDielectric - 1/solventDielectric, w = 1, and a
// switch derivative of 1. Padding entries beyond the real atoms get placeholder
// values. Also stores the quintic-scaling settings and the prefactor computed
// from the two dielectrics, and uploads the affected streams.
//
// The diagnostic output is controlled by the file-level DUMP_PARAMETERS switch;
// this function must not redefine or undefine that macro, because doing so
// changes every DUMP_PARAMETERS check that follows it in the file.
extern "C"
void gpuSetGBVIParameters(gpuContext gpu, float innerDielectric, float solventDielectric, const vector<int>& atom, const vector<float>& radius,
        const vector<float>& gamma, const vector<float>& scaledRadii, int bornRadiusScalingMethod, float quinticLowerLimitFactor,
        float quinticUpperBornRadiusLimit )
{
    unsigned int atoms = atom.size();

    gpu->bIncludeGBVI = true;
    double tau = ((1.0f/innerDielectric)-(1.0f/solventDielectric));
    gpu->sim.gbviQuinticLowerLimitFactor = quinticLowerLimitFactor;
    gpu->sim.gbviQuinticUpperBornRadiusLimit = quinticUpperBornRadiusLimit;
    gpu->sim.gbviBornRadiusScalingMethod = bornRadiusScalingMethod;

    for (unsigned int i = 0; i < atoms; i++)
    {
        (*gpu->psGBVIData)[i].x = radius[i];
        (*gpu->psGBVIData)[i].y = scaledRadii[i];
        (*gpu->psGBVIData)[i].z = (float) (tau*gamma[i]);
        (*gpu->psGBVIData)[i].w = 1.0f;
        (*gpu->psGBVISwitchDerivative)[i] = 1.0f;

#if (DUMP_PARAMETERS == 1)
        (void) fprintf( stderr,"GBVI param: %5u R=%15.7e scaledR=%15.7e R-S=%15.7e gamma*tau=%15.7e bornRadiusScaleFactor=%15.7e\n",
                        i, (*gpu->psGBVIData)[i].x, (*gpu->psGBVIData)[i].y, (*gpu->psGBVIData)[i].x - (*gpu->psGBVIData)[i].y,
                        (*gpu->psGBVIData)[i].z, (*gpu->psGBVIData)[i].w );
#endif
    }

    // Dummy out extra atom data
    for (unsigned int i = atoms; i < gpu->sim.paddedNumberOfAtoms; i++)
    {
        (*gpu->psBornRadii)[i] = 0.2f;
        (*gpu->psGBVIData)[i].x = 0.01f;
        (*gpu->psGBVIData)[i].y = 0.01f;
        (*gpu->psGBVIData)[i].z = 0.01f;
        (*gpu->psGBVIData)[i].w = 1.00f;
        (*gpu->psGBVISwitchDerivative)[i] = 1.0f;
    }

    gpu->psBornRadii->Upload();
    gpu->psGBVIData->Upload();
    gpu->psGBVISwitchDerivative->Upload();
    // NOTE(review): psObcData is not written by this function; confirm this upload is intended.
    gpu->psObcData->Upload();
    gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;

#if (DUMP_PARAMETERS == 1)
    (void) fprintf( stderr, "gpuSetGBVIParameters: preFactor=%14.6e elecCnstnt=%.4f frcCnvrsnFctr=%.4f tau=%.4f.\n",
                    gpu->sim.preFactor, 2.0f*electricConstant, gpu->sim.forceConversionFactor, ((1.0f/innerDielectric)-(1.0f/solventDielectric)) );
#endif
}
// Marks a cluster as invalid and flags its central atom and all of its
// peripheral atoms in invalidForShake. Any peripheral atom that is itself the
// key of a still-valid cluster in allClusters has that cluster invalidated
// recursively.
static void markShakeClusterInvalid(ShakeCluster& cluster, map<int, ShakeCluster>& allClusters, vector<bool>& invalidForShake)
{
    // Clear the flag first; the recursion below only descends into clusters
    // that are still marked valid.
    cluster.valid = false;
    invalidForShake[cluster.centralID] = true;

    int member = 0;
    while (member < cluster.size) {
        invalidForShake[cluster.peripheralID[member]] = true;
        map<int, ShakeCluster>::iterator linked = allClusters.find(cluster.peripheralID[member]);
        const bool linkedIsValid = (linked != allClusters.end()) && linked->second.valid;
        if (linkedIsValid)
            markShakeClusterInvalid(linked->second, allClusters, invalidForShake);
        ++member;
    }
}
/**
 * Distribute a list of distance constraints over three constraint algorithms and build the
 * host/device streams each of them needs:
 *   1. SETTLE - closed triangles of three atoms in which every atom has exactly two constraints.
 *   2. SHAKE  - a central atom with up to three peripheral atoms, each peripheral atom having
 *               exactly one constraint.
 *   3. CCMA   - every remaining constraint whose first atom was not claimed by SETTLE or SHAKE.
 *
 * @param gpu                  context whose psSettle*, psShake*, psCcma* and psConstraintMatrix*
 *                             streams and matching gpu->sim fields are (re)assigned here
 * @param atom1                index of the first atom of each constraint
 * @param atom2                index of the second atom of each constraint
 * @param distance             constrained distance of each constraint
 * @param invMass1             inverse mass paired with atom1[i] for each constraint
 * @param invMass2             inverse mass paired with atom2[i] for each constraint
 * @param constraintTolerance  stored in gpu->sim.shakeTolerance
 *
 * @throws OpenMMException if a SETTLE triangle does not have two equal distances.
 *
 * NOTE(review): the streams are allocated with `new` and assigned to the gpu-> members without
 * releasing any previous value; presumably this function is called once per context - confirm.
 * NOTE(review): the CCMA section reads gpu->psBondAngleID1 / gpu->psBondAngleParameter when
 * gpu->sim.bond_angles > 0, so angle parameters presumably have to be set first - confirm.
 */
extern "C"
void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const vector<int>& atom2, const vector<float>& distance,
const vector<float>& invMass1, const vector<float>& invMass2, float constraintTolerance)
{
// Create a vector for recording which atoms are handled by SHAKE (or SETTLE).
vector<bool> isShakeAtom(gpu->natoms, false);
// Find how many constraints each atom is involved in.
vector<int> constraintCount(gpu->natoms, 0);
for (int i = 0; i < (int)atom1.size(); i++) {
constraintCount[atom1[i]]++;
constraintCount[atom2[i]]++;
}
// Identify clusters of three atoms that can be treated with SETTLE. First, for every
// atom that might be part of such a cluster, make a list of the two other atoms it is
// connected to.
// settleConstraints[a] maps each partner atom of a to the constrained distance between them.
vector<map<int, float> > settleConstraints(gpu->natoms);
for (int i = 0; i < (int)atom1.size(); i++) {
if (constraintCount[atom1[i]] == 2 && constraintCount[atom2[i]] == 2) {
settleConstraints[atom1[i]][atom2[i]] = distance[i];
settleConstraints[atom2[i]][atom1[i]] = distance[i];
}
}
// Now remove the ones that don't actually form closed loops of three atoms.
// Each surviving triangle is recorded once, under its lowest atom index.
vector<int> settleClusters;
for (int i = 0; i < (int)settleConstraints.size(); i++) {
if (settleConstraints[i].size() == 2) {
int partner1 = settleConstraints[i].begin()->first;
int partner2 = (++settleConstraints[i].begin())->first;
if (settleConstraints[partner1].size() != 2 || settleConstraints[partner2].size() != 2 ||
settleConstraints[partner1].find(partner2) == settleConstraints[partner1].end())
settleConstraints[i].clear();
else if (i < partner1 && i < partner2)
settleClusters.push_back(i);
}
else
settleConstraints[i].clear();
}
// Record the actual SETTLE clusters.
CUDAStream<int4>* psSettleID = new CUDAStream<int4>((int) settleClusters.size(), 1, "SettleID");
gpu->psSettleID = psSettleID;
gpu->sim.pSettleID = psSettleID->_pDevStream[0];
CUDAStream<float2>* psSettleParameter = new CUDAStream<float2>((int) settleClusters.size(), 1, "SettleParameter");
gpu->psSettleParameter = psSettleParameter;
gpu->sim.pSettleParameter = psSettleParameter->_pDevStream[0];
gpu->sim.settleConstraints = settleClusters.size();
for (int i = 0; i < (int)settleClusters.size(); i++) {
// NOTE: inside this loop the locals atom1/atom2 are single atom indices that shadow the
// vector parameters of the same names.
int atom1 = settleClusters[i];
int atom2 = settleConstraints[atom1].begin()->first;
int atom3 = (++settleConstraints[atom1].begin())->first;
float dist12 = settleConstraints[atom1].find(atom2)->second;
float dist13 = settleConstraints[atom1].find(atom3)->second;
float dist23 = settleConstraints[atom2].find(atom3)->second;
// The central atom is the one whose two distances are equal (exact float comparison).
// Stored layout: ID = (central, other, other); parameter.x = the repeated distance,
// parameter.y = the distance between the two non-central atoms.
if (dist12 == dist13) { // atom1 is the central atom
(*psSettleID)[i].x = atom1;
(*psSettleID)[i].y = atom2;
(*psSettleID)[i].z = atom3;
(*psSettleParameter)[i].x = dist12;
(*psSettleParameter)[i].y = dist23;
}
else if (dist12 == dist23) { // atom2 is the central atom
(*psSettleID)[i].x = atom2;
(*psSettleID)[i].y = atom1;
(*psSettleID)[i].z = atom3;
(*psSettleParameter)[i].x = dist12;
(*psSettleParameter)[i].y = dist13;
}
else if (dist13 == dist23) { // atom3 is the central atom
(*psSettleID)[i].x = atom3;
(*psSettleID)[i].y = atom1;
(*psSettleID)[i].z = atom2;
(*psSettleParameter)[i].x = dist13;
(*psSettleParameter)[i].y = dist12;
}
else
throw OpenMMException("Two of the three distances constrained with SETTLE must be the same.");
isShakeAtom[atom1] = true;
isShakeAtom[atom2] = true;
isShakeAtom[atom3] = true;
}
psSettleID->Upload();
psSettleParameter->Upload();
// Spread the SETTLE clusters over gpu->sim.blocks, clamped to [1, max_shake_threads_per_block].
gpu->sim.settle_threads_per_block = (gpu->sim.settleConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.settle_threads_per_block > gpu->sim.max_shake_threads_per_block)
gpu->sim.settle_threads_per_block = gpu->sim.max_shake_threads_per_block;
if (gpu->sim.settle_threads_per_block < 1)
gpu->sim.settle_threads_per_block = 1;
// Find clusters consisting of a central atom with up to three peripheral atoms.
map<int, ShakeCluster> clusters;
vector<bool> invalidForShake(gpu->natoms, false);
for (int i = 0; i < (int)atom1.size(); i++) {
// Only atom1[i] is tested here; atom2[i] is not checked.
if (isShakeAtom[atom1[i]])
continue; // This is being taken care of with SETTLE.
// Determine which is the central atom.
// Preference order: atom1 if it has several constraints, else atom2 if it has several,
// else whichever of the two has the lower index.
bool firstIsCentral;
if (constraintCount[atom1[i]] > 1)
firstIsCentral = true;
else if (constraintCount[atom2[i]] > 1)
firstIsCentral = false;
else if (atom1[i] < atom2[i])
firstIsCentral = true;
else
firstIsCentral = false;
int centralID, peripheralID;
float centralInvMass, peripheralInvMass;
if (firstIsCentral) {
centralID = atom1[i];
peripheralID = atom2[i];
centralInvMass = invMass1[i];
peripheralInvMass = invMass2[i];
}
else {
centralID = atom2[i];
peripheralID = atom1[i];
centralInvMass = invMass2[i];
peripheralInvMass = invMass1[i];
}
// Add it to the cluster.
if (clusters.find(centralID) == clusters.end()) {
clusters[centralID] = ShakeCluster(centralID, centralInvMass);
}
ShakeCluster& cluster = clusters[centralID];
cluster.addAtom(peripheralID, distance[i], peripheralInvMass);
// A peripheral atom with more than one constraint, or any atom already ruled out, makes the
// whole cluster (and any cluster centred on the peripheral atom) unusable for SHAKE.
if (constraintCount[peripheralID] != 1 || invalidForShake[atom1[i]] || invalidForShake[atom2[i]]) {
markShakeClusterInvalid(cluster, clusters, invalidForShake);
map<int, ShakeCluster>::iterator otherCluster = clusters.find(peripheralID);
if (otherCluster != clusters.end() && otherCluster->second.valid)
markShakeClusterInvalid(otherCluster->second, clusters, invalidForShake);
}
}
// Second pass: a cluster stays valid only if none of its atoms was flagged invalid and it
// contains every constraint of its central atom. Count the survivors.
int validShakeClusters = 0;
for (map<int, ShakeCluster>::iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
ShakeCluster& cluster = iter->second;
if (cluster.valid) {
cluster.valid = !invalidForShake[cluster.centralID] && cluster.size == constraintCount[cluster.centralID];
for (int i = 0; i < cluster.size; i++)
if (invalidForShake[cluster.peripheralID[i]])
cluster.valid = false;
if (cluster.valid)
++validShakeClusters;
}
}
// Fill in the Cuda streams.
CUDAStream<int4>* psShakeID = new CUDAStream<int4>(validShakeClusters, 1, "ShakeID");
gpu->psShakeID = psShakeID;
gpu->sim.pShakeID = psShakeID->_pDevStream[0];
CUDAStream<float4>* psShakeParameter = new CUDAStream<float4>(validShakeClusters, 1, "ShakeParameter");
gpu->psShakeParameter = psShakeParameter;
gpu->sim.pShakeParameter = psShakeParameter->_pDevStream[0];
gpu->sim.ShakeConstraints = validShakeClusters;
int index = 0;
for (map<int, ShakeCluster>::const_iterator iter = clusters.begin(); iter != clusters.end(); ++iter) {
const ShakeCluster& cluster = iter->second;
if (!cluster.valid)
continue;
// ID = (central, peripheral0, peripheral1 or -1, peripheral2 or -1).
(*psShakeID)[index].x = cluster.centralID;
(*psShakeID)[index].y = cluster.peripheralID[0];
(*psShakeID)[index].z = cluster.size > 1 ? cluster.peripheralID[1] : -1;
(*psShakeID)[index].w = cluster.size > 2 ? cluster.peripheralID[2] : -1;
// Parameter = (central inverse mass, 0.5/(sum of inverse masses), distance squared,
// peripheral inverse mass). A single distance and peripheral inverse mass is stored per cluster.
(*psShakeParameter)[index].x = cluster.centralInvMass;
(*psShakeParameter)[index].y = 0.5f/(cluster.centralInvMass+cluster.peripheralInvMass);
(*psShakeParameter)[index].z = cluster.distance*cluster.distance;
(*psShakeParameter)[index].w = cluster.peripheralInvMass;
isShakeAtom[cluster.centralID] = true;
isShakeAtom[cluster.peripheralID[0]] = true;
if (cluster.size > 1)
isShakeAtom[cluster.peripheralID[1]] = true;
if (cluster.size > 2)
isShakeAtom[cluster.peripheralID[2]] = true;
++index;
}
psShakeID->Upload();
psShakeParameter->Upload();
gpu->sim.shakeTolerance = constraintTolerance;
// Spread the SHAKE clusters over gpu->sim.blocks, clamped to [1, max_shake_threads_per_block].
gpu->sim.shake_threads_per_block = (gpu->sim.ShakeConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.shake_threads_per_block > gpu->sim.max_shake_threads_per_block)
gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
if (gpu->sim.shake_threads_per_block < 1)
gpu->sim.shake_threads_per_block = 1;
// Find connected constraints for CCMA.
// ccmaConstraints holds indices into the input vectors; again only atom1[i] is tested.
vector<int> ccmaConstraints;
for (unsigned i = 0; i < atom1.size(); i++)
if (!isShakeAtom[atom1[i]])
ccmaConstraints.push_back(i);
// Record the connections between constraints.
// From here on a "CCMA index" is a position in ccmaConstraints, not in the input vectors.
int numCCMA = (int) ccmaConstraints.size();
vector<vector<int> > atomConstraints(gpu->natoms);
for (int i = 0; i < numCCMA; i++) {
atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
}
// Two constraints are linked when they share an atom.
vector<vector<int> > linkedConstraints(numCCMA);
for (unsigned atom = 0; atom < atomConstraints.size(); atom++) {
for (unsigned i = 0; i < atomConstraints[atom].size(); i++)
for (unsigned j = 0; j < i; j++) {
int c1 = atomConstraints[atom][i];
int c2 = atomConstraints[atom][j];
linkedConstraints[c1].push_back(c2);
linkedConstraints[c2].push_back(c1);
}
}
// NOTE(review): maxLinks is computed but not used again within this function.
int maxLinks = 0;
for (unsigned i = 0; i < linkedConstraints.size(); i++)
maxLinks = max(maxLinks, (int) linkedConstraints[i].size());
int maxAtomConstraints = 0;
for (unsigned i = 0; i < atomConstraints.size(); i++)
maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
// Compute the constraint coupling matrix
// atomAngles[b] lists the bond-angle terms whose .y atom (used below as the vertex) is b.
vector<vector<int> > atomAngles(gpu->natoms);
for (int i = 0; i < (int) gpu->sim.bond_angles; i++)
atomAngles[(*gpu->psBondAngleID1)[i].y].push_back(i);
// matrix[row] is a sparse row: a list of (column, value) pairs.
vector<vector<pair<int, double> > > matrix(numCCMA);
if (numCCMA > 0) {
// All ordered pairs (j, k) are visited, and for each connected pair all constraints are
// scanned again for a closing third edge.
for (int j = 0; j < numCCMA; j++) {
for (int k = 0; k < numCCMA; k++) {
if (j == k) {
matrix[j].push_back(pair<int, double>(j, 1.0));
continue;
}
double scale;
int cj = ccmaConstraints[j];
int ck = ccmaConstraints[k];
int atomj0 = atom1[cj];
int atomj1 = atom2[cj];
int atomk0 = atom1[ck];
int atomk1 = atom2[ck];
// atomb is the atom shared by constraints j and k; atoma and atomc are their other ends.
// scale is the shared atom's inverse mass divided by the sum of constraint j's inverse masses.
int atoma, atomb, atomc;
if (atomj0 == atomk0) {
atoma = atomj1;
atomb = atomj0;
atomc = atomk1;
scale = invMass1[cj]/(invMass1[cj]+invMass2[cj]);
}
else if (atomj1 == atomk1) {
atoma = atomj0;
atomb = atomj1;
atomc = atomk0;
scale = invMass2[cj]/(invMass1[cj]+invMass2[cj]);
}
else if (atomj0 == atomk1) {
atoma = atomj1;
atomb = atomj0;
atomc = atomk0;
scale = invMass1[cj]/(invMass1[cj]+invMass2[cj]);
}
else if (atomj1 == atomk0) {
atoma = atomj0;
atomb = atomj1;
atomc = atomk1;
scale = invMass2[cj]/(invMass1[cj]+invMass2[cj]);
}
else
continue; // These constraints are not connected.
// Look for a third constraint forming a triangle with these two.
// If found, the matrix entry uses the law-of-cosines expression for the angle at atomb.
bool foundConstraint = false;
for (int m = 0; m < numCCMA; m++) {
int other = ccmaConstraints[m];
if ((atom1[other] == atoma && atom2[other] == atomc) || (atom1[other] == atomc && atom2[other] == atoma)) {
double d1 = distance[cj];
double d2 = distance[ck];
double d3 = distance[other];
matrix[j].push_back(pair<int, double>(k, scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2)));
foundConstraint = true;
break;
}
}
if (!foundConstraint) {
// We didn't find one, so look for an angle force field term.
// The angle parameter .x is multiplied by PI/180 before taking the cosine, i.e. it is
// treated as degrees. If no matching angle exists either, no (j, k) entry is added.
const vector<int>& angleCandidates = atomAngles[atomb];
for (vector<int>::const_iterator iter = angleCandidates.begin(); iter != angleCandidates.end(); iter++) {
int4 atoms = (*gpu->psBondAngleID1)[*iter];
if ((atoms.x == atoma && atoms.z == atomc) || (atoms.z == atoma && atoms.x == atomc)) {
double angle = (*gpu->psBondAngleParameter)[*iter].x;
matrix[j].push_back(pair<int, double>(k, scale*cos(angle*PI/180.0)));
break;
}
}
}
}
}
// Invert it using QR.
// Flatten the sparse rows into row-start / column-index / value arrays for QUERN.
vector<int> matrixRowStart;
vector<int> matrixColIndex;
vector<double> matrixValue;
for (int i = 0; i < numCCMA; i++) {
matrixRowStart.push_back(matrixValue.size());
for (int j = 0; j < (int) matrix[i].size(); j++) {
pair<int, double> element = matrix[i][j];
matrixColIndex.push_back(element.first);
matrixValue.push_back(element.second);
}
}
matrixRowStart.push_back(matrixValue.size());
int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
double *qValue, *rValue;
// NOTE(review): `result` is assigned by each QUERN call below but never inspected.
int result = QUERN_compute_qr(numCCMA, numCCMA, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
&qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
vector<double> rhs(numCCMA);
// From here on `matrix` is reused to hold the thresholded, distance-scaled inverse.
matrix.clear();
matrix.resize(numCCMA);
for (int i = 0; i < numCCMA; i++) {
// Extract column i of the inverse matrix.
for (int j = 0; j < numCCMA; j++)
rhs[j] = (i == j ? 1.0 : 0.0);
result = QUERN_multiply_with_q_transpose(numCCMA, qRowStart, qColIndex, qValue, &rhs[0]);
result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
for (int j = 0; j < numCCMA; j++) {
double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
// Only elements with magnitude above 0.05 are kept.
// NOTE(review): this relies on a floating-point overload of abs() being visible here; if only
// the integer abs() is in scope the argument would be truncated - confirm the includes.
if (abs(value) > 0.05)
matrix[j].push_back(pair<int, double>(i, value));
}
}
QUERN_free_result(qRowStart, qColIndex, qValue);
QUERN_free_result(rRowStart, rColIndex, rValue);
}
// One extra slot per row is reserved for the end marker written in the fill loop below.
int maxRowElements = 0;
for (unsigned i = 0; i < matrix.size(); i++)
maxRowElements = max(maxRowElements, (int) matrix[i].size());
maxRowElements++;
// Sort the constraints.
// constraintOrder[newPos] = CCMA index; inverseOrder[CCMA index] = newPos.
vector<int> constraintOrder(numCCMA);
for (int i = 0; i < numCCMA; ++i)
constraintOrder[i] = i;
sort(constraintOrder.begin(), constraintOrder.end(), ConstraintOrderer(atom1, atom2, ccmaConstraints));
vector<int> inverseOrder(numCCMA);
for (int i = 0; i < numCCMA; ++i)
inverseOrder[constraintOrder[i]] = i;
// Rewrite the column indices so that they refer to sorted positions.
for (int i = 0; i < (int)matrix.size(); ++i)
for (int j = 0; j < (int)matrix[i].size(); ++j)
matrix[i][j].first = inverseOrder[matrix[i][j].first];
// Fill in the CUDA streams.
CUDAStream<int2>* psCcmaAtoms = new CUDAStream<int2>(numCCMA, 1, "CcmaAtoms");
gpu->psCcmaAtoms = psCcmaAtoms;
gpu->sim.pCcmaAtoms = psCcmaAtoms->_pDevData;
CUDAStream<float4>* psCcmaDistance = new CUDAStream<float4>(numCCMA, 1, "CcmaDistance");
gpu->psCcmaDistance = psCcmaDistance;
gpu->sim.pCcmaDistance = psCcmaDistance->_pDevData;
CUDAStream<int>* psCcmaAtomConstraints = new CUDAStream<int>(gpu->natoms*maxAtomConstraints, 1, "CcmaAtomConstraints");
gpu->psCcmaAtomConstraints = psCcmaAtomConstraints;
gpu->sim.pCcmaAtomConstraints = psCcmaAtomConstraints->_pDevData;
CUDAStream<int>* psCcmaNumAtomConstraints = new CUDAStream<int>(gpu->natoms, 1, "CcmaAtomConstraintsIndex");
gpu->psCcmaNumAtomConstraints = psCcmaNumAtomConstraints;
gpu->sim.pCcmaNumAtomConstraints = psCcmaNumAtomConstraints->_pDevData;
// psCcmaDelta1 / psCcmaDelta2 are only allocated here; this function neither fills nor uploads them.
CUDAStream<float>* psCcmaDelta1 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta1");
gpu->psCcmaDelta1 = psCcmaDelta1;
gpu->sim.pCcmaDelta1 = psCcmaDelta1->_pDevData;
CUDAStream<float>* psCcmaDelta2 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta2");
gpu->psCcmaDelta2 = psCcmaDelta2;
gpu->sim.pCcmaDelta2 = psCcmaDelta2->_pDevData;
CUDAStream<float>* psCcmaReducedMass = new CUDAStream<float>(numCCMA, 1, "CcmaReducedMass");
gpu->psCcmaReducedMass = psCcmaReducedMass;
gpu->sim.pCcmaReducedMass = psCcmaReducedMass->_pDevData;
CUDAStream<unsigned int>* psConstraintMatrixColumn = new CUDAStream<unsigned int>(numCCMA*maxRowElements, 1, "ConstraintMatrixColumn");
gpu->psConstraintMatrixColumn = psConstraintMatrixColumn;
gpu->sim.pConstraintMatrixColumn = psConstraintMatrixColumn->_pDevData;
CUDAStream<float>* psConstraintMatrixValue = new CUDAStream<float>(numCCMA*maxRowElements, 1, "ConstraintMatrixValue");
gpu->psConstraintMatrixValue = psConstraintMatrixValue;
gpu->sim.pConstraintMatrixValue = psConstraintMatrixValue->_pDevData;
// Mapped host memory for the convergence marker, its device-side pointer, and an event.
// NOTE(review): the return codes of these three CUDA calls are not checked.
cudaHostAlloc((void**) &gpu->ccmaConvergedHostMarker, sizeof(int), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**) &gpu->sim.ccmaConvergedDeviceMarker, (void*) gpu->ccmaConvergedHostMarker, 0);
cudaEventCreate(&gpu->ccmaEvent);
gpu->sim.ccmaConstraints = numCCMA;
for (int i = 0; i < numCCMA; i++) {
// i is the sorted position; index is the CCMA index; c indexes the input vectors.
int index = constraintOrder[i];
int c = ccmaConstraints[index];
(*psCcmaAtoms)[i].x = atom1[c];
(*psCcmaAtoms)[i].y = atom2[c];
// Only the .w component of the distance stream is written here.
(*psCcmaDistance)[i].w = distance[c];
(*psCcmaReducedMass)[i] = 0.5f/(invMass1[c]+invMass2[c]);
// Element j of row i is stored at i + j*numCCMA.
for (unsigned int j = 0; j < matrix[index].size(); j++) {
(*psConstraintMatrixColumn)[i+j*numCCMA] = matrix[index][j].first;
(*psConstraintMatrixValue)[i+j*numCCMA] = (float) matrix[index][j].second;
}
// The slot after the last element gets column index numCCMA, which is not a valid
// column - presumably an end-of-row marker for the kernel; confirm against the consumer.
(*psConstraintMatrixColumn)[i+matrix[index].size()*numCCMA] = numCCMA;
}
for (unsigned int i = 0; i < atomConstraints.size(); i++) {
(*psCcmaNumAtomConstraints)[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
// Entry is (sorted position + 1) when atom i is the constraint's first atom, and the
// negative of that when it is the second; entry j of atom i is stored at i + j*natoms.
bool forward = (atom1[ccmaConstraints[atomConstraints[i][j]]] == i);
(*psCcmaAtomConstraints)[i+j*gpu->natoms] = (forward ? inverseOrder[atomConstraints[i][j]]+1 : -inverseOrder[atomConstraints[i][j]]-1);
}
}
psCcmaAtoms->Upload();
psCcmaDistance->Upload();
psCcmaReducedMass->Upload();
psCcmaAtomConstraints->Upload();
psCcmaNumAtomConstraints->Upload();
psConstraintMatrixColumn->Upload();
psConstraintMatrixValue->Upload();
gpu->sim.ccma_threads_per_block = (gpu->sim.ccmaConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.ccma_threads_per_block > gpu->sim.threads_per_block)
gpu->sim.ccma_threads_per_block = gpu->sim.threads_per_block;
// NOTE(review): the lower bound here is gpu->sim.blocks, whereas the SETTLE and SHAKE
// equivalents above clamp to 1 - confirm this is intentional.
if (gpu->sim.ccma_threads_per_block < gpu->sim.blocks)
gpu->sim.ccma_threads_per_block = gpu->sim.blocks;
}
/**
 * Allocate the per-atom, random-number and linear-momentum streams that every simulation
 * needs, and record their device pointers in gpu->sim.
 *
 * Reads gpu->natoms, gpu->sim.blocks and gpu->sim.random_threads_per_block, so those must be
 * set before this is called.
 *
 * @param gpu  the context to populate
 * @return always 1
 */
extern "C"
int gpuAllocateInitialBuffers(gpuContext gpu)
{
    // The padded atom count is the atom count rounded up to a multiple of GRID.
    gpu->sim.atoms = gpu->natoms;
    gpu->sim.paddedNumberOfAtoms = ((gpu->sim.atoms + GRID - 1) >> GRIDBITS) << GRIDBITS;
    gpu->sim.degreesOfFreedom = 3 * gpu->sim.atoms - 6;
    gpu->gpAtomTable = NULL;
    gpu->gAtomTypes = 0;

    // Positions. The stride of this stream defines the stride constants; they are computed
    // once here (the original assigned the same values twice).
    gpu->psPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "Posq");
    gpu->sim.pPosq = gpu->psPosq4->_pDevStream[0];
    gpu->sim.stride = gpu->psPosq4->_stride;
    gpu->sim.stride2 = 2 * gpu->sim.stride;
    gpu->sim.stride3 = 3 * gpu->sim.stride;
    gpu->sim.stride4 = 4 * gpu->sim.stride;

    // Remaining per-atom streams, all sized by the padded atom count.
    gpu->psPosqP4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "PosqP");
    gpu->sim.pPosqP = gpu->psPosqP4->_pDevStream[0];
    gpu->psOldPosq4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "OldPosq");
    gpu->sim.pOldPosq = gpu->psOldPosq4->_pDevStream[0];
    gpu->psVelm4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "Velm");
    gpu->sim.pVelm4 = gpu->psVelm4->_pDevStream[0];
    gpu->psBornRadii = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "BornRadii");
    gpu->sim.pBornRadii = gpu->psBornRadii->_pDevStream[0];
    gpu->psObcChain = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "ObcChain");
    gpu->sim.pObcChain = gpu->psObcChain->_pDevStream[0];
    gpu->psSigEps2 = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "SigEps2");
    gpu->sim.pAttr = gpu->psSigEps2->_pDevStream[0];
    gpu->psObcData = new CUDAStream<float2>(gpu->sim.paddedNumberOfAtoms, 1, "ObcData");
    gpu->sim.pObcData = gpu->psObcData->_pDevStream[0];
    gpu->psGBVIData = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, 1, "GBVIData");
    gpu->sim.pGBVIData = gpu->psGBVIData->_pDevStream[0];
    gpu->psGBVISwitchDerivative = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, 1, "psGBVISwitchDerivative");
    gpu->sim.pGBVISwitchDerivative = gpu->psGBVISwitchDerivative->_pDevStream[0];

    // Step size starts at (0, 0).
    gpu->psStepSize = new CUDAStream<float2>(1, 1, "StepSize");
    gpu->sim.pStepSize = gpu->psStepSize->_pDevStream[0];
    (*gpu->psStepSize)[0] = make_float2(0.0f, 0.0f);
    gpu->psStepSize->Upload();

    gpu->psLangevinParameters = new CUDAStream<float>(3, 1, "LangevinParameters");
    gpu->sim.pLangevinParameters = gpu->psLangevinParameters->_pDevStream[0];
    gpu->pAtomSymbol = new unsigned char[gpu->natoms];

    // The atom index stream starts out as the identity mapping.
    gpu->psAtomIndex = new CUDAStream<int>(gpu->sim.paddedNumberOfAtoms, 1, "AtomIndex");
    gpu->sim.pAtomIndex = gpu->psAtomIndex->_pDevStream[0];
    for (int i = 0; i < (int) gpu->sim.paddedNumberOfAtoms; i++)
        (*gpu->psAtomIndex)[i] = i;
    gpu->psAtomIndex->Upload();
    gpu->posCellOffsets.resize(gpu->natoms, make_int3(0, 0, 0));
    gpu->sim.outputBuffers = 0;

    // Determine randoms
    gpu->seed = 1;
    gpu->sim.randomFrames = 20;
    gpu->sim.randomIterations = gpu->sim.randomFrames;
    gpu->sim.randoms = gpu->sim.randomFrames * gpu->sim.paddedNumberOfAtoms;
    gpu->sim.totalRandoms = gpu->sim.randoms + gpu->sim.paddedNumberOfAtoms;
    gpu->psRandom4 = new CUDAStream<float4>(gpu->sim.totalRandoms, 1, "Random4");
    gpu->psRandom2 = new CUDAStream<float2>(gpu->sim.totalRandoms, 1, "Random2");
    gpu->psRandomPosition = new CUDAStream<int>(gpu->sim.blocks, 1, "RandomPosition");
    gpu->psRandomSeed = new CUDAStream<uint4>(gpu->sim.blocks * gpu->sim.random_threads_per_block, 1, "RandomSeed");
    gpu->sim.pRandom4 = gpu->psRandom4->_pDevStream[0];
    gpu->sim.pRandom2 = gpu->psRandom2->_pDevStream[0];
    gpu->sim.pRandomPosition = gpu->psRandomPosition->_pDevStream[0];
    gpu->sim.pRandomSeed = gpu->psRandomSeed->_pDevStream[0];

    // Allocate and clear linear momentum buffer
    gpu->psLinearMomentum = new CUDAStream<float4>(gpu->sim.blocks, 1, "LinearMomentum");
    gpu->sim.pLinearMomentum = gpu->psLinearMomentum->_pDevStream[0];
    for (int i = 0; i < (int) gpu->sim.blocks; i++)
    {
        float4& momentum = (*gpu->psLinearMomentum)[i];
        momentum.x = 0.0f;
        momentum.y = 0.0f;
        momentum.z = 0.0f;
        momentum.w = 0.0f;
    }
    gpu->psLinearMomentum->Upload();
    return 1;
}
/**
 * Copy atom coordinates into the position stream and upload it to the device.
 *
 * Only the x, y and z components of each entry are written; the w component is left as is.
 * Also sets gpu->bRecalculateBornRadii so the Born radii are recomputed.
 *
 * @param gpu  the context to update
 * @param x    x coordinate per atom (at least gpu->natoms entries are read)
 * @param y    y coordinate per atom
 * @param z    z coordinate per atom
 */
extern "C"
void gpuSetPositions(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
{
    const int numAtoms = gpu->natoms;
    for (int atom = 0; atom < numAtoms; ++atom)
    {
        float4& pos = (*gpu->psPosq4)[atom];
        pos.x = x[atom];
        pos.y = y[atom];
        pos.z = z[atom];
    }
    gpu->psPosq4->Upload();

    // set flag to recalculate Born radii
    gpu->bRecalculateBornRadii = true;
}
/**
 * Copy atom velocities into the velocity stream and upload it to the device.
 *
 * Only the x, y and z components of each entry are written; the w component (which
 * gpuSetMass fills with the inverse mass) is left as is.
 *
 * @param gpu  the context to update
 * @param x    x velocity component per atom (at least gpu->natoms entries are read)
 * @param y    y velocity component per atom
 * @param z    z velocity component per atom
 */
extern "C"
void gpuSetVelocities(gpuContext gpu, const vector<float>& x, const vector<float>& y, const vector<float>& z)
{
    const int numAtoms = gpu->natoms;
    for (int atom = 0; atom < numAtoms; ++atom)
    {
        float4& vel = (*gpu->psVelm4)[atom];
        vel.x = x[atom];
        vel.y = y[atom];
        vel.z = z[atom];
    }
    gpu->psVelm4->Upload();
}
/**
 * Store each atom's inverse mass in the w component of the velocity stream, record the
 * inverse of the total mass in gpu->sim.inverseTotalMass, and upload the stream.
 *
 * @param gpu   the context to update
 * @param mass  mass per atom (at least gpu->natoms entries are read)
 */
extern "C"
void gpuSetMass(gpuContext gpu, const vector<float>& mass)
{
    float massSum = 0.0f;
    const int numAtoms = gpu->natoms;
    for (int atom = 0; atom < numAtoms; ++atom)
    {
        const float m = mass[atom];
        (*gpu->psVelm4)[atom].w = 1.0f/m;
        massSum += m;
    }
    gpu->sim.inverseTotalMass = 1.0f / massSum;
    gpu->psVelm4->Upload();
}
/**
 * Reset the random-number state: zero every per-block random position, fill the seed stream
 * from an RNG seeded from gpu->seed, upload both streams, then call gpuSetConstants and
 * kGenerateRandoms.
 *
 * The disabled (`#if 0`) srand()/rand() variant of the seeding loop has been removed; it was
 * never compiled.
 *
 * @param gpu  the context whose random streams are (re)initialized
 */
extern "C"
void gpuInitializeRandoms(gpuContext gpu)
{
    for (int i = 0; i < (int) gpu->sim.blocks; i++)
    {
        (*gpu->psRandomPosition)[i] = 0;
    }

    // Combine the seed with its shifted bitwise complement to form the RNG seed.
    int seed = gpu->seed | ((gpu->seed ^ 0xffffffff) << 16);
    RNG rng(seed);

    // One uint4 seed per entry: blocks * random_threads_per_block entries in total.
    const int numSeeds = (int) (gpu->sim.blocks * gpu->sim.random_threads_per_block);
    for (int i = 0; i < numSeeds; i++)
    {
        (*gpu->psRandomSeed)[i].x = rng.rand_int();
        (*gpu->psRandomSeed)[i].y = rng.rand_int();
        (*gpu->psRandomSeed)[i].z = rng.rand_int();
        (*gpu->psRandomSeed)[i].w = rng.rand_int();
    }

    gpu->psRandomPosition->Upload();
    gpu->psRandomSeed->Upload();
    gpuSetConstants(gpu);
    kGenerateRandoms(gpu);
}
/**
 * Report whether at least one CUDA device can be used.
 *
 * @return true only if cudaGetDeviceCount() succeeds and reports a positive device count
 */
extern "C"
bool OPENMMCUDA_EXPORT gpuIsAvailable()
{
    // Start from 0 and check the status: if the query fails (for example when no driver is
    // installed) the count must not be read from an uninitialized variable.
    int deviceCount = 0;
    cudaError_t status = cudaGetDeviceCount(&deviceCount);
    return (status == cudaSuccess && deviceCount > 0);
}
extern "C"
void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
{
gpuContext gpu = new _gpuContext;
int LRFSize = 0;
int SMCount = 0;
int SMMajor = 0;
int SMMinor = 0;
// Select which device to use
int currentDevice;
cudaError_t status = cudaGetDevice(&currentDevice);
RTERROR(status, "Error getting CUDA device")
if (device != currentDevice)
cudaSetDevice(device); // Ignore errors
status = cudaGetDevice(&gpu->device);
RTERROR(status, "Error getting CUDA device")
status = cudaSetDeviceFlags(cudaDeviceMapHost+(useBlockingSync ? cudaDeviceBlockingSync : cudaDeviceScheduleAuto));
RTERROR(status, "Error setting device flags")
gpu->useBlockingSync = useBlockingSync;
// Determine kernel call configuration
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, currentDevice);
// Determine SM version
if (deviceProp.major == 1)
{
switch (deviceProp.minor)
{
case 0:
case 1:
gpu->sm_version = SM_10;
gpu->sim.workUnitsPerSM = G8X_NONBOND_WORKUNITS_PER_SM;
break;
default:
gpu->sm_version = SM_12;
gpu->sim.workUnitsPerSM = GT2XX_NONBOND_WORKUNITS_PER_SM;
break;
}
}
else
{
gpu->sm_version = SM_20;
gpu->sim.workUnitsPerSM = GF1XX_NONBOND_WORKUNITS_PER_SM;
}
if (deviceProp.regsPerBlock == 8192)
{
gpu->sim.nonbond_threads_per_block = G8X_NONBOND_THREADS_PER_BLOCK;
gpu->sim.bornForce2_threads_per_block = G8X_BORNFORCE2_THREADS_PER_BLOCK;
gpu->sim.max_shake_threads_per_block = G8X_SHAKE_THREADS_PER_BLOCK;
gpu->sim.max_update_threads_per_block = G8X_UPDATE_THREADS_PER_BLOCK;
gpu->sim.max_localForces_threads_per_block = G8X_LOCALFORCES_THREADS_PER_BLOCK;
gpu->sim.threads_per_block = G8X_THREADS_PER_BLOCK;
gpu->sim.random_threads_per_block = G8X_RANDOM_THREADS_PER_BLOCK;
gpu->blocksPerSM = G8X_BLOCKS_PER_SM;
}
else if (deviceProp.regsPerBlock <= 16384)
{
gpu->sim.nonbond_threads_per_block = GT2XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.bornForce2_threads_per_block = GT2XX_BORNFORCE2_THREADS_PER_BLOCK;
gpu->sim.max_shake_threads_per_block = GT2XX_SHAKE_THREADS_PER_BLOCK;
gpu->sim.max_update_threads_per_block = GT2XX_UPDATE_THREADS_PER_BLOCK;
gpu->sim.max_localForces_threads_per_block = GT2XX_LOCALFORCES_THREADS_PER_BLOCK;
gpu->sim.threads_per_block = GT2XX_THREADS_PER_BLOCK;
gpu->sim.random_threads_per_block = GT2XX_RANDOM_THREADS_PER_BLOCK;
gpu->blocksPerSM = GT2XX_BLOCKS_PER_SM;
}
else
{
gpu->sim.nonbond_threads_per_block = GF1XX_NONBOND_THREADS_PER_BLOCK;
gpu->sim.bornForce2_threads_per_block = GF1XX_BORNFORCE2_THREADS_PER_BLOCK;
gpu->sim.max_shake_threads_per_block = GF1XX_SHAKE_THREADS_PER_BLOCK;
gpu->sim.max_update_threads_per_block = GF1XX_UPDATE_THREADS_PER_BLOCK;
gpu->sim.max_localForces_threads_per_block = GF1XX_LOCALFORCES_THREADS_PER_BLOCK;
gpu->sim.threads_per_block = GF1XX_THREADS_PER_BLOCK;
gpu->sim.random_threads_per_block = GF1XX_RANDOM_THREADS_PER_BLOCK;
gpu->blocksPerSM = GF1XX_BLOCKS_PER_SM;
}
gpu->sim.nonbond_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
gpu->sim.bornForce2_blocks = deviceProp.multiProcessorCount*gpu->blocksPerSM;
gpu->sim.blocks = deviceProp.multiProcessorCount;
gpu->sharedMemoryPerBlock = deviceProp.sharedMemPerBlock;
gpu->sim.shake_threads_per_block = gpu->sim.max_shake_threads_per_block;
gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
gpu->natoms = numAtoms;
gpuAllocateInitialBuffers(gpu);
gpu->iterations = 0;
gpu->sim.update_threads_per_block = (gpu->natoms + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.update_threads_per_block > gpu->sim.max_update_threads_per_block)
gpu->sim.update_threads_per_block = gpu->sim.max_update_threads_per_block;
if (gpu->sim.update_threads_per_block < gpu->psLangevinParameters->_length)
gpu->sim.update_threads_per_block = gpu->psLangevinParameters->_length;
gpu->sim.bf_reduce_threads_per_block = gpu->sim.update_threads_per_block;
gpu->sim.bsf_reduce_threads_per_block = (gpu->sim.stride4 + gpu->natoms + gpu->sim.blocks - 1) / gpu->sim.blocks;
gpu->sim.bsf_reduce_threads_per_block = ((gpu->sim.bsf_reduce_threads_per_block + (GRID - 1)) / GRID) * GRID;
if (gpu->sim.bsf_reduce_threads_per_block > gpu->sim.threads_per_block)
gpu->sim.bsf_reduce_threads_per_block = gpu->sim.threads_per_block;
if (gpu->sim.bsf_reduce_threads_per_block < 1)
gpu->sim.bsf_reduce_threads_per_block = 1;
// Initialize constants to reasonable values
gpu->sim.probeRadius = probeRadius;
gpu->sim.surfaceAreaFactor = surfaceAreaFactor;
gpu->sim.electricConstant = electricConstant;
gpu->sim.nonbondedMethod = NO_CUTOFF;
gpu->sim.nonbondedCutoff = 0.0f;
gpu->sim.nonbondedCutoffSqr = 0.0f;
gpu->sim.bigFloat = 99999999.0f;
gpu->sim.forceConversionFactor = forceConversionFactor;
gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/defaultInnerDielectric)-(1.0f/defaultSolventDielectric))*gpu->sim.forceConversionFactor;
gpu->sim.dielectricOffset = dielectricOffset;
gpu->sim.alphaOBC = alphaOBC;
gpu->sim.betaOBC = betaOBC;
gpu->sim.gammaOBC = gammaOBC;
gpu->sim.maxShakeIterations = 15;
gpu->sim.shakeTolerance = 1.0e-04f * 2.0f;
gpu->sim.InvMassJ = 9.920635e-001f;
gpu->grid = GRID;
gpu->bCalculateCM = false;
gpu->bRemoveCM = false;
gpu->bRecalculateBornRadii = true;
gpu->bIncludeGBSA = false;
gpu->bIncludeGBVI = false;
gpuInitializeRandoms(gpu);
// To be determined later
gpu->psLJ14ID = NULL;
gpu->psForce4 = NULL;
gpu->psEnergy = NULL;
gpu->sim.pForce4 = NULL;
gpu->psBornForce = NULL;
gpu->sim.pBornForce = NULL;
gpu->psBornSum = NULL;
gpu->sim.pBornSum = NULL;
gpu->psBondID = NULL;
gpu->psBondParameter = NULL;
gpu->psBondAngleID1 = NULL;
gpu->psBondAngleID2 = NULL;
gpu->psBondAngleParameter = NULL;
gpu->psDihedralID1 = NULL;
gpu->psDihedralID2 = NULL;
gpu->psDihedralParameter = NULL;
gpu->psRbDihedralID1 = NULL;
gpu->psRbDihedralID2 = NULL;
gpu->psRbDihedralParameter1 = NULL;
gpu->psRbDihedralParameter2 = NULL;
gpu->psLJ14ID = NULL;
gpu->psLJ14Parameter = NULL;
gpu->psCustomParams = NULL;
gpu->psCustomBondID = NULL;
gpu->psCustomBondParams = NULL;
gpu->psCustomAngleID1 = NULL;
gpu->psCustomAngleID2 = NULL;
gpu->psCustomAngleParams = NULL;
gpu->psCustomTorsionID1 = NULL;
gpu->psCustomTorsionID2 = NULL;
gpu->psCustomTorsionParams = NULL;
gpu->psCustomExternalID = NULL;
gpu->psCustomExternalParams = NULL;
gpu->psEwaldCosSinSum = NULL;
gpu->psTabulatedErfc = NULL;
gpu->psPmeGrid = NULL;
gpu->psPmeBsplineModuli[0] = NULL;
gpu->psPmeBsplineModuli[1] = NULL;
gpu->psPmeBsplineModuli[2] = NULL;
gpu->psPmeBsplineTheta = NULL;
gpu->psPmeBsplineDtheta = NULL;
gpu->psPmeAtomRange = NULL;
gpu->psPmeAtomGridIndex = NULL;
gpu->psShakeID = NULL;
gpu->psShakeParameter = NULL;
gpu->psSettleID = NULL;
gpu->psSettleParameter = NULL;
gpu->psExclusion = NULL;
gpu->psExclusionIndex = NULL;
gpu->psWorkUnit = NULL;
gpu->psInteractingWorkUnit = NULL;
gpu->psInteractionFlag = NULL;
gpu->psInteractionCount = NULL;
gpu->psGridBoundingBox = NULL;
gpu->psGridCenter = NULL;
gpu->psCcmaAtoms = NULL;
gpu->psCcmaDistance = NULL;
gpu->psCcmaAtomConstraints = NULL;
gpu->psCcmaNumAtomConstraints = NULL;
gpu->psCcmaDelta1 = NULL;
gpu->psCcmaDelta2 = NULL;
gpu->psCcmaReducedMass = NULL;
gpu->psConstraintMatrixColumn = NULL;
gpu->psConstraintMatrixValue = NULL;
gpu->psTabulatedFunctionParams = NULL;
for (int i = 0; i < MAX_TABULATED_FUNCTIONS; i++)
gpu->tabulatedFunctions[i].coefficients = NULL;
gpu->sim.customExpressionStackSize = 0;
gpu->sim.customBonds = 0;
gpu->sim.customAngles = 0;
gpu->sim.customTorsions = 0;
// Initialize output buffer before reading parameters
gpu->pOutputBufferCounter = new unsigned int[gpu->sim.paddedNumberOfAtoms];
memset(gpu->pOutputBufferCounter, 0, gpu->sim.paddedNumberOfAtoms * sizeof(unsigned int));
return (void*)gpu;
}
/**
 * Configure the context for Langevin integration.
 *
 * Stores the step size, error tolerance, tau and temperature in gpu->sim, uploads the three
 * derived Langevin coefficients (velocity scale, force scale, noise scale), and refreshes the
 * step-size stream: .y is always set to deltaT, while .x is set to deltaT only if it is
 * still zero.
 *
 * @param gpu          the context to update
 * @param tau          stored in gpu->sim.tau and used in the coefficient formulas below
 * @param deltaT       integration step size
 * @param temperature  stored in gpu->sim.T; kT is BOLTZ times this value
 * @param errorTol     stored in gpu->sim.errorTol
 */
extern "C"
void gpuSetLangevinIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature, float errorTol) {
    gpu->sim.deltaT = deltaT;
    gpu->sim.oneOverDeltaT = 1.0f/deltaT;
    gpu->sim.errorTol = errorTol;
    gpu->sim.tau = tau;
    gpu->sim.T = temperature;
    gpu->sim.kT = BOLTZ * gpu->sim.T;

    // Derived coefficients, computed in double precision and stored as float.
    double velocityScale = exp(-deltaT/tau);
    double forceScale = (1-velocityScale)*tau;
    double noiseScale = sqrt(2*gpu->sim.kT/tau)*sqrt(0.5*(1-velocityScale*velocityScale)*tau);
    (*gpu->psLangevinParameters)[0] = (float) velocityScale;
    (*gpu->psLangevinParameters)[1] = (float) forceScale;
    (*gpu->psLangevinParameters)[2] = (float) noiseScale;
    gpu->psLangevinParameters->Upload();

    // Fetch the current step-size entry from the device before modifying it.
    gpu->psStepSize->Download();
    float2& step = (*gpu->psStepSize)[0];
    if (step.x == 0)
        step.x = deltaT;
    step.y = deltaT;
    gpu->psStepSize->Upload();
}
/**
 * Configure the context for Verlet integration.
 *
 * Stores the step size and error tolerance in gpu->sim and refreshes the step-size stream:
 * .y is always set to deltaT, while .x is set to deltaT only if it is still zero.
 *
 * @param gpu       the context to update
 * @param deltaT    integration step size
 * @param errorTol  stored in gpu->sim.errorTol
 */
extern "C"
void gpuSetVerletIntegrationParameters(gpuContext gpu, float deltaT, float errorTol) {
    gpu->sim.errorTol = errorTol;
    gpu->sim.deltaT = deltaT;
    gpu->sim.oneOverDeltaT = 1.0f/deltaT;

    // Fetch the current step-size entry from the device before modifying it.
    gpu->psStepSize->Download();
    float2& step = (*gpu->psStepSize)[0];
    if (step.x == 0)
        step.x = deltaT;
    step.y = deltaT;
    gpu->psStepSize->Upload();
}
/**
 * Configure the context for Brownian integration.
 *
 * Stores the step size, tau, their product, the temperature and the noise amplitude
 * sqrt(2*kT*deltaT*tau) in gpu->sim, then refreshes the step-size stream: .y is always set
 * to deltaT, while .x is set to deltaT only if it is still zero.
 *
 * @param gpu          the context to update
 * @param tau          stored in gpu->sim.tau
 * @param deltaT       integration step size
 * @param temperature  stored in gpu->sim.T; kT is BOLTZ times this value
 */
extern "C"
void gpuSetBrownianIntegrationParameters(gpuContext gpu, float tau, float deltaT, float temperature) {
    gpu->sim.deltaT = deltaT;
    gpu->sim.oneOverDeltaT = 1.0f/deltaT;
    gpu->sim.tau = tau;
    gpu->sim.tauDeltaT = gpu->sim.deltaT * gpu->sim.tau;
    gpu->sim.T = temperature;
    gpu->sim.kT = BOLTZ * gpu->sim.T;
    gpu->sim.noiseAmplitude = sqrt(2.0f*gpu->sim.kT*deltaT*tau);

    // Fetch the current step-size entry from the device before modifying it.
    gpu->psStepSize->Download();
    float2& step = (*gpu->psStepSize)[0];
    if (step.x == 0)
        step.x = deltaT;
    step.y = deltaT;
    gpu->psStepSize->Upload();
}
/**
 * Store the Andersen thermostat settings in gpu->sim.
 *
 * @param gpu                 the context to update
 * @param temperature         stored in gpu->sim.T; kT is BOLTZ times this value
 * @param collisionFrequency  stored in gpu->sim.collisionFrequency
 */
extern "C"
void gpuSetAndersenThermostatParameters(gpuContext gpu, float temperature, float collisionFrequency) {
    gpu->sim.collisionFrequency = collisionFrequency;
    gpu->sim.T = temperature;
    gpu->sim.kT = BOLTZ * gpu->sim.T;
}
extern "C"
void gpuShutDown(gpuContext gpu)
{
    // Release everything owned by the context, delete the context itself, and
    // finally tear down the CUDA thread state. The release order matches the
    // original implementation. Deleting a null pointer is a no-op, so guards
    // are kept only where one pointer stands in for a whole group (the other
    // members of the group are not known to be valid when the guard pointer
    // is NULL).

    // Host memory arrays.
    delete[] gpu->pOutputBufferCounter;
    delete[] gpu->gpAtomTable;
    delete[] gpu->pAtomSymbol;

    // Core per-atom streams.
    delete gpu->psPosq4;
    delete gpu->psPosqP4;
    delete gpu->psOldPosq4;
    delete gpu->psVelm4;
    delete gpu->psForce4;
    delete gpu->psEnergy;
    delete gpu->psSigEps2;

    // Custom force streams; each group is released only if its parameter stream exists.
    delete gpu->psCustomParams;
    if (gpu->psCustomBondParams != NULL)
    {
        delete gpu->psCustomBondID;
        delete gpu->psCustomBondParams;
    }
    if (gpu->psCustomAngleParams != NULL)
    {
        delete gpu->psCustomAngleID1;
        delete gpu->psCustomAngleID2;
        delete gpu->psCustomAngleParams;
    }
    if (gpu->psCustomTorsionParams != NULL)
    {
        delete gpu->psCustomTorsionID1;
        delete gpu->psCustomTorsionID2;
        delete gpu->psCustomTorsionParams;
    }
    if (gpu->psCustomExternalParams != NULL)
    {
        delete gpu->psCustomExternalID;
        delete gpu->psCustomExternalParams;
    }

    // Ewald / PME streams. The FFT plan is destroyed only when a PME grid exists.
    delete gpu->psEwaldCosSinSum;
    if (gpu->psPmeGrid != NULL)
    {
        delete gpu->psPmeGrid;
        for (int axis = 0; axis < 3; axis++)
            delete gpu->psPmeBsplineModuli[axis];
        delete gpu->psPmeBsplineTheta;
        delete gpu->psPmeBsplineDtheta;
        delete gpu->psPmeAtomRange;
        delete gpu->psPmeAtomGridIndex;
        cufftDestroy(gpu->fftplan);
    }
    delete gpu->psTabulatedErfc;

    // Implicit solvent streams.
    delete gpu->psObcData;
    delete gpu->psGBVIData;
    delete gpu->psGBVISwitchDerivative;
    delete gpu->psObcChain;
    delete gpu->psBornForce;
    delete gpu->psBornRadii;
    delete gpu->psBornSum;

    // Bonded interaction streams.
    delete gpu->psBondID;
    delete gpu->psBondParameter;
    delete gpu->psBondAngleID1;
    delete gpu->psBondAngleID2;
    delete gpu->psBondAngleParameter;
    delete gpu->psDihedralID1;
    delete gpu->psDihedralID2;
    delete gpu->psDihedralParameter;
    delete gpu->psRbDihedralID1;
    delete gpu->psRbDihedralID2;
    delete gpu->psRbDihedralParameter1;
    delete gpu->psRbDihedralParameter2;
    delete gpu->psLJ14ID;
    delete gpu->psLJ14Parameter;

    // SHAKE / SETTLE constraint streams.
    delete gpu->psShakeID;
    delete gpu->psShakeParameter;
    delete gpu->psSettleID;
    delete gpu->psSettleParameter;

    // Exclusion and work unit streams.
    delete gpu->psExclusion;
    delete gpu->psExclusionIndex;
    delete gpu->psWorkUnit;
    delete gpu->psInteractingWorkUnit;
    delete gpu->psInteractionFlag;
    delete gpu->psInteractionCount;

    // Integration, random number and bookkeeping streams.
    delete gpu->psStepSize;
    delete gpu->psLangevinParameters;
    delete gpu->psRandom4;
    delete gpu->psRandom2;
    delete gpu->psRandomPosition;
    delete gpu->psRandomSeed;
    delete gpu->psLinearMomentum;
    delete gpu->psAtomIndex;
    delete gpu->psGridBoundingBox;
    delete gpu->psGridCenter;

    // CCMA constraint streams and the associated event.
    delete gpu->psCcmaAtoms;
    delete gpu->psCcmaDistance;
    delete gpu->psCcmaAtomConstraints;
    delete gpu->psCcmaNumAtomConstraints;
    delete gpu->psCcmaDelta1;
    delete gpu->psCcmaDelta2;
    delete gpu->psCcmaReducedMass;
    cudaEventDestroy(gpu->ccmaEvent);
    delete gpu->psConstraintMatrixColumn;
    delete gpu->psConstraintMatrixValue;

    // Tabulated function streams.
    delete gpu->psTabulatedFunctionParams;
    for (int fn = 0; fn < MAX_TABULATED_FUNCTIONS; fn++)
        delete gpu->tabulatedFunctions[fn].coefficients;

    // Stream compaction plan, if one was created.
    if (gpu->compactPlan.valid)
        destroyCompactionPlan(gpu->compactPlan);

    // Wrap up: the context itself, then the CUDA thread state.
    delete gpu;
    cudaThreadExit();
}
extern "C"
int gpuBuildOutputBuffers(gpuContext gpu)
{
    // Decide how many force output buffers are needed, allocate the force,
    // energy and Born streams accordingly, compute the bonded parameter
    // offsets, and remap the output buffer indices stored in the bonded ID
    // streams. Always returns 1.

    // Start from one nonbonded output buffer per warp. For small systems it is
    // more efficient to use one buffer per block of GRID atoms instead.
    gpu->sim.nonbondOutputBuffers = gpu->sim.nonbond_blocks * gpu->sim.nonbond_threads_per_block / GRID;
    gpu->bOutputBufferPerWarp = (gpu->sim.nonbondOutputBuffers < gpu->sim.paddedNumberOfAtoms/GRID);
    if (!gpu->bOutputBufferPerWarp)
    {
        gpu->sim.nonbondOutputBuffers = gpu->sim.paddedNumberOfAtoms / GRID;
    }

    // The total buffer count must cover the nonbonded buffers and the largest
    // per-atom counter value.
    if (gpu->sim.outputBuffers < gpu->sim.nonbondOutputBuffers)
        gpu->sim.outputBuffers = gpu->sim.nonbondOutputBuffers;
    unsigned int bufferCount = gpu->sim.outputBuffers;
    for (unsigned int atom = 0; atom < gpu->sim.paddedNumberOfAtoms; atom++)
    {
        if (gpu->pOutputBufferCounter[atom] > bufferCount)
            bufferCount = gpu->pOutputBufferCounter[atom];
    }
    gpu->sim.outputBuffers = bufferCount;

    // Note: this uses the localForces_threads_per_block value in effect before
    // it is recomputed further down.
    gpu->sim.energyOutputBuffers = max(gpu->sim.nonbond_threads_per_block, gpu->sim.localForces_threads_per_block)*gpu->sim.blocks;

    // Allocate the output streams and publish their device pointers.
    gpu->psForce4 = new CUDAStream<float4>(gpu->sim.paddedNumberOfAtoms, bufferCount, "Force");
    gpu->psEnergy = new CUDAStream<float>(gpu->sim.energyOutputBuffers, 1, "Energy");
    gpu->psBornForce = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornForce");
    gpu->psBornSum = new CUDAStream<float>(gpu->sim.paddedNumberOfAtoms, gpu->sim.nonbondOutputBuffers, "BornSum");
    gpu->sim.pForce4 = gpu->psForce4->_pDevStream[0];
    gpu->sim.pEnergy = gpu->psEnergy->_pDevStream[0];
    gpu->sim.pBornForce = gpu->psBornForce->_pDevStream[0];
    gpu->sim.pBornSum = gpu->psBornSum->_pDevStream[0];

    // Determine local energy parameter offsets for bonded interactions: each
    // offset is the running sum of the strides of the parameter streams.
    gpu->sim.bond_offset = gpu->psBondParameter->_stride;
    gpu->sim.bond_angle_offset = gpu->sim.bond_offset + gpu->psBondAngleParameter->_stride;
    gpu->sim.dihedral_offset = gpu->sim.bond_angle_offset + gpu->psDihedralParameter->_stride;
    gpu->sim.rb_dihedral_offset = gpu->sim.dihedral_offset + gpu->psRbDihedralParameter1->_stride;
    gpu->sim.LJ14_offset = gpu->sim.rb_dihedral_offset + gpu->psLJ14Parameter->_stride;

    // Threads per block for the local forces kernel: rounded up to a multiple
    // of 16, then clamped to [1, max_localForces_threads_per_block].
    gpu->sim.localForces_threads_per_block = (max(gpu->sim.LJ14_offset, gpu->sim.customBonds) / gpu->sim.blocks + 15) & 0xfffffff0;
    if (gpu->sim.localForces_threads_per_block > gpu->sim.max_localForces_threads_per_block)
        gpu->sim.localForces_threads_per_block = gpu->sim.max_localForces_threads_per_block;
    if (gpu->sim.localForces_threads_per_block < 1)
        gpu->sim.localForces_threads_per_block = 1;

    // Flip local force output buffers: every stored buffer index b becomes
    // (bufferCount - 1) - b.
    const int lastBuffer = bufferCount - 1;
    for (int n = 0; n < (int) gpu->sim.bonds; n++)
    {
        (*gpu->psBondID)[n].z = lastBuffer - (*gpu->psBondID)[n].z;
        (*gpu->psBondID)[n].w = lastBuffer - (*gpu->psBondID)[n].w;
    }
    for (int n = 0; n < (int) gpu->sim.bond_angles; n++)
    {
        (*gpu->psBondAngleID1)[n].w = lastBuffer - (*gpu->psBondAngleID1)[n].w;
        (*gpu->psBondAngleID2)[n].x = lastBuffer - (*gpu->psBondAngleID2)[n].x;
        (*gpu->psBondAngleID2)[n].y = lastBuffer - (*gpu->psBondAngleID2)[n].y;
    }
    for (int n = 0; n < (int) gpu->sim.dihedrals; n++)
    {
        (*gpu->psDihedralID2)[n].x = lastBuffer - (*gpu->psDihedralID2)[n].x;
        (*gpu->psDihedralID2)[n].y = lastBuffer - (*gpu->psDihedralID2)[n].y;
        (*gpu->psDihedralID2)[n].z = lastBuffer - (*gpu->psDihedralID2)[n].z;
        (*gpu->psDihedralID2)[n].w = lastBuffer - (*gpu->psDihedralID2)[n].w;
    }
    for (int n = 0; n < (int) gpu->sim.rb_dihedrals; n++)
    {
        (*gpu->psRbDihedralID2)[n].x = lastBuffer - (*gpu->psRbDihedralID2)[n].x;
        (*gpu->psRbDihedralID2)[n].y = lastBuffer - (*gpu->psRbDihedralID2)[n].y;
        (*gpu->psRbDihedralID2)[n].z = lastBuffer - (*gpu->psRbDihedralID2)[n].z;
        (*gpu->psRbDihedralID2)[n].w = lastBuffer - (*gpu->psRbDihedralID2)[n].w;
    }
    for (int n = 0; n < (int) gpu->sim.LJ14s; n++)
    {
        (*gpu->psLJ14ID)[n].z = lastBuffer - (*gpu->psLJ14ID)[n].z;
        (*gpu->psLJ14ID)[n].w = lastBuffer - (*gpu->psLJ14ID)[n].w;
    }

    // Push the remapped ID streams to the device.
    gpu->psBondID->Upload();
    gpu->psBondAngleID1->Upload();
    gpu->psBondAngleID2->Upload();
    gpu->psDihedralID2->Upload();
    gpu->psRbDihedralID2->Upload();
    gpu->psLJ14ID->Upload();
    return 1;
}
extern "C"
int gpuBuildThreadBlockWorkList(gpuContext gpu)
{
    // Allocate the work unit related streams, split the work units across the
    // nonbonded and Born force kernels, and fill the work unit list with one
    // entry per pair of atom blocks (x >= y). Returns the number of work units.
    const unsigned int paddedAtoms = gpu->sim.paddedNumberOfAtoms;
    const unsigned int blockSize = gpu->grid;
    const unsigned int numBlocks = (paddedAtoms + (blockSize - 1)) / blockSize;
    const unsigned int numTiles = numBlocks * (numBlocks + 1) / 2;

    // One entry per tile: work units, interacting work units, interaction flags.
    gpu->psWorkUnit = new CUDAStream<unsigned int>(numTiles, 1u, "WorkUnit");
    gpu->sim.pWorkUnit = gpu->psWorkUnit->_pDevStream[0];
    gpu->psInteractingWorkUnit = new CUDAStream<unsigned int>(numTiles, 1u, "InteractingWorkUnit");
    gpu->sim.pInteractingWorkUnit = gpu->psInteractingWorkUnit->_pDevStream[0];
    gpu->psInteractionFlag = new CUDAStream<unsigned int>(numTiles, 1u, "InteractionFlag");
    gpu->sim.pInteractionFlag = gpu->psInteractionFlag->_pDevStream[0];

    // A single counter of interacting work units.
    gpu->psInteractionCount = new CUDAStream<size_t>(1, 1u, "InteractionCount");
    gpu->sim.pInteractionCount = gpu->psInteractionCount->_pDevStream[0];

    // One entry per atom block: bounding box and center.
    gpu->psGridBoundingBox = new CUDAStream<float4>(numBlocks, 1u, "GridBoundingBox");
    gpu->sim.pGridBoundingBox = gpu->psGridBoundingBox->_pDevStream[0];
    gpu->psGridCenter = new CUDAStream<float4>(numBlocks, 1u, "GridCenter");
    gpu->sim.pGridCenter = gpu->psGridCenter->_pDevStream[0];

    gpu->sim.nonbond_workBlock = gpu->sim.nonbond_threads_per_block / GRID;
    gpu->sim.bornForce2_workBlock = gpu->sim.bornForce2_threads_per_block / GRID;
    gpu->sim.workUnits = numTiles;

    // Initialize the plan for doing stream compaction.
    planCompaction(gpu->compactPlan);

    // Note: the block counts are deliberately left untouched here. An earlier
    // adjustment that raised nonbond_blocks / bornForce2_blocks for extra large
    // molecules (to avoid overflowing the SM work unit buffers) is disabled.

    // Even split of the tiles across blocks, with the remainder kept separately.
    gpu->sim.nbWorkUnitsPerBlock = numTiles / gpu->sim.nonbond_blocks;
    gpu->sim.nbWorkUnitsPerBlockRemainder = numTiles - gpu->sim.nonbond_blocks * gpu->sim.nbWorkUnitsPerBlock;
    gpu->sim.bf2WorkUnitsPerBlock = numTiles / gpu->sim.bornForce2_blocks;
    gpu->sim.bf2WorkUnitsPerBlockRemainder = numTiles - gpu->sim.bornForce2_blocks * gpu->sim.bf2WorkUnitsPerBlock;

    // Launch geometry for the interaction kernel: 64 threads per block and at
    // most 8 blocks per gpu->sim.blocks.
    gpu->sim.interaction_threads_per_block = 64;
    gpu->sim.interaction_blocks = (gpu->sim.workUnits + gpu->sim.interaction_threads_per_block - 1) / gpu->sim.interaction_threads_per_block;
    if (gpu->sim.interaction_blocks > 8*gpu->sim.blocks)
        gpu->sim.interaction_blocks = 8*gpu->sim.blocks;

    // Decrease thread count for extra small molecules to spread computation
    // across entire chip.
    const int nonbondActive = gpu->sim.nonbond_blocks * gpu->sim.nonbond_workBlock;
    if (nonbondActive > (int) numTiles)
    {
        int balancedWorkBlock = (numTiles + gpu->sim.nonbond_blocks - 1) / gpu->sim.nonbond_blocks;
        gpu->sim.nonbond_threads_per_block = balancedWorkBlock * GRID;
        gpu->sim.nonbond_workBlock = balancedWorkBlock;
    }
    const int bornForce2Active = gpu->sim.bornForce2_blocks * gpu->sim.bornForce2_workBlock;
    if (bornForce2Active > (int) numTiles)
    {
        int balancedWorkBlock = (numTiles + gpu->sim.bornForce2_blocks - 1) / gpu->sim.bornForce2_blocks;
        gpu->sim.bornForce2_threads_per_block = balancedWorkBlock * GRID;
        gpu->sim.bornForce2_workBlock = balancedWorkBlock;
    }

    // Enumerate the tiles row by row. Each entry packs the x block index from
    // bit 17 upward and the y block index from bit 2 upward; the low bits are
    // left clear.
    unsigned int* pWorkList = gpu->psWorkUnit->_pSysData;
    unsigned int next = 0;
    for (unsigned int row = 0; row < numBlocks; row++)
        for (unsigned int col = row; col < numBlocks; col++)
            pWorkList[next++] = (col << 17) | (row << 2);

    // Initially every work unit counts as interacting.
    (*gpu->psInteractionCount)[0] = gpu->sim.workUnits;
    gpu->psInteractionCount->Upload();
    gpu->psWorkUnit->Upload();
    gpuSetConstants(gpu);
    return numTiles;
}
extern "C"
void OPENMMCUDA_EXPORT gpuBuildExclusionList(gpuContext gpu)
{
    // Build the exclusion data used by the nonbonded work units.
    //
    // The work list (built by gpuBuildThreadBlockWorkList) holds one entry per
    // pair of atom blocks (x >= y); the entry for (x, y) lives at index
    // x + y*dim - y*(y+1)/2. This routine
    //   1. sets bit 0 of every work unit that contains at least one exclusion,
    //   2. gives each flagged work unit a slot of `grid` words in the exclusion
    //      stream (psExclusionIndex holds the slot offsets),
    //   3. clears, in those words, the bit of every excluded atom pair, and
    //   4. clears the bits of every pair involving a padding atom.
    // All bit masks are built from unsigned values: the bit offset can be as
    // large as grid-1, and shifting a signed 1 into the sign bit is not
    // portable.
    const unsigned int atoms = gpu->sim.paddedNumberOfAtoms;
    const unsigned int grid = gpu->grid;
    const unsigned int dim = atoms/grid;
    unsigned int* pWorkList = gpu->psWorkUnit->_pSysData;

    // Mark which work units have exclusions.
    for (int atom1 = 0; atom1 < (int)gpu->exclusions.size(); ++atom1)
    {
        int x = atom1/grid;
        for (int j = 0; j < (int)gpu->exclusions[atom1].size(); ++j)
        {
            int atom2 = gpu->exclusions[atom1][j];
            int y = atom2/grid;
            int cell = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
            pWorkList[cell] |= 1;
        }
    }

    // If padding atoms exist, every work unit touching the block that contains
    // them is flagged as well.
    if ((int)gpu->sim.paddedNumberOfAtoms > gpu->natoms)
    {
        int lastBlock = gpu->natoms/grid;
        for (int i = 0; i < (int)gpu->sim.workUnits; ++i)
        {
            int x = pWorkList[i]>>17;
            int y = (pWorkList[i]>>2)&0x7FFF;
            if (x == lastBlock || y == lastBlock)
                pWorkList[i] |= 1;
        }
    }

    // Build a list of indexes for the work units with exclusions. Entries of
    // work units without exclusions are not written here.
    CUDAStream<unsigned int>* psExclusionIndex = new CUDAStream<unsigned int>(gpu->sim.workUnits, 1u, "ExclusionIndex");
    gpu->psExclusionIndex = psExclusionIndex;
    unsigned int* pExclusionIndex = psExclusionIndex->_pSysData;
    gpu->sim.pExclusionIndex = psExclusionIndex->_pDevData;
    int numWithExclusions = 0;
    for (int i = 0; i < (int)psExclusionIndex->_length; ++i)
        if ((pWorkList[i]&1) == 1)
            pExclusionIndex[i] = (numWithExclusions++)*grid;

    // Record the exclusion data. Every bit starts out set (pair not excluded).
    CUDAStream<unsigned int>* psExclusion = new CUDAStream<unsigned int>(numWithExclusions*grid, 1u, "Exclusion");
    gpu->psExclusion = psExclusion;
    unsigned int* pExclusion = psExclusion->_pSysData;
    gpu->sim.pExclusion = psExclusion->_pDevData;
    for (int i = 0; i < (int)psExclusion->_length; ++i)
        pExclusion[i] = 0xFFFFFFFF;
    for (int atom1 = 0; atom1 < (int)gpu->exclusions.size(); ++atom1)
    {
        int x = atom1/grid;
        int offset1 = atom1-x*grid;
        for (int j = 0; j < (int)gpu->exclusions[atom1].size(); ++j)
        {
            int atom2 = gpu->exclusions[atom1][j];
            int y = atom2/grid;
            int offset2 = atom2-y*grid;
            if (x > y)
            {
                int cell = x+y*dim-y*(y+1)/2;
                pExclusion[pExclusionIndex[cell]+offset1] &= ~(1u << offset2);
            }
            else
            {
                int cell = y+x*dim-x*(x+1)/2;
                pExclusion[pExclusionIndex[cell]+offset2] &= ~(1u << offset1);
            }
        }
    }

    // Mark all interactions that involve a padding atom as being excluded.
    for (int atom1 = gpu->natoms; atom1 < (int)atoms; ++atom1)
    {
        int x = atom1/grid;
        int offset1 = atom1-x*grid;
        for (int atom2 = 0; atom2 < (int)atoms; ++atom2)
        {
            int y = atom2/grid;
            int offset2 = atom2-y*grid;
            if (x >= y)
            {
                int cell = x+y*dim-y*(y+1)/2;
                pExclusion[pExclusionIndex[cell]+offset1] &= ~(1u << offset2);
            }
            if (y >= x)
            {
                int cell = y+x*dim-x*(x+1)/2;
                pExclusion[pExclusionIndex[cell]+offset2] &= ~(1u << offset1);
            }
        }
    }

    // Push everything to the device and refresh the kernel constants.
    psExclusion->Upload();
    psExclusionIndex->Upload();
    gpu->psWorkUnit->Upload();
    gpuSetConstants(gpu);
}
// Calls every per-kernel Set*Sim(gpu) routine in a fixed order and returns 1.
// NOTE(review): each Set*Sim routine presumably copies gpu->sim into the
// constant memory of the corresponding kernel module; their definitions are
// not in this part of the file, so confirm there. Callers in this file invoke
// this after changing gpu->sim (see gpuBuildThreadBlockWorkList and
// gpuBuildExclusionList).
extern "C"
int gpuSetConstants(gpuContext gpu)
{
// Nonbonded and custom force kernels.
SetCalculateCDLJForcesSim(gpu);
SetCalculateCDLJObcGbsaForces1Sim(gpu);
SetCalculateCustomNonbondedForcesSim(gpu);
SetCalculateCustomBondForcesSim(gpu);
SetCalculateCustomAngleForcesSim(gpu);
SetCalculateCustomTorsionForcesSim(gpu);
SetCalculateCustomExternalForcesSim(gpu);
SetCalculateLocalForcesSim(gpu);
// Implicit solvent kernels.
SetCalculateObcGbsaBornSumSim(gpu);
SetCalculateGBVIBornSumSim(gpu);
SetCalculateObcGbsaForces2Sim(gpu);
SetCalculateGBVIForces2Sim(gpu);
// Thermostat, PME and force reduction kernels.
SetCalculateAndersenThermostatSim(gpu);
SetCalculatePMESim(gpu);
SetForcesSim(gpu);
// Constraint, integrator and random number kernels.
SetShakeHSim(gpu);
SetLangevinUpdateSim(gpu);
SetVerletUpdateSim(gpu);
SetBrownianUpdateSim(gpu);
SetSettleSim(gpu);
SetCCMASim(gpu);
SetRandomSim(gpu);
return 1;
}
static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds)
{
    // Tag `atom`, and every atom reachable from it through atomBonds that is
    // still untagged (-1), as belonging to `molecule`.
    //
    //   atom         - index of the starting atom; it is tagged unconditionally
    //   molecule     - the molecule index to store
    //   atomMolecule - per-atom molecule index, -1 meaning "not yet assigned"
    //   atomBonds    - per-atom list of connected atom indices
    //
    // The traversal uses an explicit work stack instead of recursion: the
    // recursion depth of the naive version grows with the size of the molecule,
    // which can exhaust the call stack for very large molecules.
    std::vector<int> pending;
    atomMolecule[atom] = molecule;
    pending.push_back(atom);
    while (!pending.empty())
    {
        const int current = pending.back();
        pending.pop_back();
        const std::vector<int>& neighbors = atomBonds[current];
        for (int i = 0; i < (int)neighbors.size(); i++)
        {
            const int neighbor = neighbors[i];
            if (atomMolecule[neighbor] == -1)
            {
                // Tag before pushing so that an atom is never queued twice.
                atomMolecule[neighbor] = molecule;
                pending.push_back(neighbor);
            }
        }
    }
}
// Partitions the atoms into molecules (connected components of the graph
// formed by force particle groups and constraints), sorts the molecules into
// groups of identical molecules, and stores the result in gpu->moleculeGroups.
// For each group, `atoms` holds the atom indices of the first molecule found
// relative to that molecule's first atom, and `instances` holds the first atom
// index of every molecule in the group.
static void findMoleculeGroups(gpuContext gpu)
{
// First make a list of constraints for future use.
vector<Constraint> constraints;
// SHAKE entries: atom x is paired with y and, when present (not -1), with z and w.
// All pairs of one entry share the value read from psShakeParameter .z.
for (int i = 0; i < (int)gpu->sim.ShakeConstraints; i++)
{
int atom1 = (*gpu->psShakeID)[i].x;
int atom2 = (*gpu->psShakeID)[i].y;
int atom3 = (*gpu->psShakeID)[i].z;
int atom4 = (*gpu->psShakeID)[i].w;
float distance2 = (*gpu->psShakeParameter)[i].z;
constraints.push_back(Constraint(atom1, atom2, distance2));
if (atom3 != -1)
constraints.push_back(Constraint(atom1, atom3, distance2));
if (atom4 != -1)
constraints.push_back(Constraint(atom1, atom4, distance2));
}
// SETTLE entries: three atoms; the 1-2 and 1-3 pairs use the squared .x
// parameter, the 2-3 pair uses the squared .y parameter.
for (int i = 0; i < (int)gpu->sim.settleConstraints; i++)
{
int atom1 = (*gpu->psSettleID)[i].x;
int atom2 = (*gpu->psSettleID)[i].y;
int atom3 = (*gpu->psSettleID)[i].z;
float distance12 = (*gpu->psSettleParameter)[i].x;
float distance23 = (*gpu->psSettleParameter)[i].y;
constraints.push_back(Constraint(atom1, atom2, distance12*distance12));
constraints.push_back(Constraint(atom1, atom3, distance12*distance12));
constraints.push_back(Constraint(atom2, atom3, distance23*distance23));
}
// CCMA entries: one atom pair each, with the value read from psCcmaDistance .w.
// NOTE(review): .w is stored unsquared here while SETTLE distances are squared —
// confirm which convention Constraint::distance2 expects.
for (int i = 0; i < (int)gpu->sim.ccmaConstraints; i++)
{
int atom1 = (*gpu->psCcmaAtoms)[i].x;
int atom2 = (*gpu->psCcmaAtoms)[i].y;
float distance2 = (*gpu->psCcmaDistance)[i].w;
constraints.push_back(Constraint(atom1, atom2, distance2));
}
// Next make a list of every other atom to which each atom is connected, either
// by sharing a particle group of some force or by a constraint.
int numAtoms = gpu->natoms;
vector<vector<int> > atomBonds(numAtoms);
for (int i = 0; i < (int) gpu->forces.size(); i++) {
for (int j = 0; j < gpu->forces[i]->getNumParticleGroups(); j++) {
vector<int> particles;
gpu->forces[i]->getParticlesInGroup(j, particles);
// Connect every particle of the group to every other particle of the group.
for (int k = 0; k < (int) particles.size(); k++)
for (int m = 0; m < (int) particles.size(); m++)
if (k != m)
atomBonds[particles[k]].push_back(particles[m]);
}
}
for (int i = 0; i < (int)constraints.size(); i++)
{
int atom1 = constraints[i].atom1;
int atom2 = constraints[i].atom2;
atomBonds[atom1].push_back(atom2);
atomBonds[atom2].push_back(atom1);
}
// Now tag atoms by which molecule they belong to.
vector<int> atomMolecule(numAtoms, -1);
int numMolecules = 0;
for (int i = 0; i < numAtoms; i++)
if (atomMolecule[i] == -1)
tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
// atomIndices[m] lists the atoms of molecule m in increasing index order.
vector<vector<int> > atomIndices(numMolecules);
for (int i = 0; i < numAtoms; i++)
atomIndices[atomMolecule[i]].push_back(i);
// Construct a description of each molecule.
vector<Molecule> molecules(numMolecules);
for (int i = 0; i < numMolecules; i++)
{
molecules[i].atoms = atomIndices[i];
molecules[i].groups.resize(gpu->forces.size());
}
// Assign each particle group to the molecule of its first particle.
// NOTE(review): particles[0] is read without checking that the group is non-empty.
for (int i = 0; i < (int) gpu->forces.size(); i++)
for (int j = 0; j < gpu->forces[i]->getNumParticleGroups(); j++)
{
vector<int> particles;
gpu->forces[i]->getParticlesInGroup(j, particles);
molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
}
// Assign each constraint to the molecule of its first atom.
for (int i = 0; i < (int)constraints.size(); i++)
{
molecules[atomMolecule[constraints[i].atom1]].constraints.push_back(i);
}
// Sort them into groups of identical molecules.
vector<Molecule> uniqueMolecules;
vector<vector<int> > moleculeInstances;
for (int molIndex = 0; molIndex < (int)molecules.size(); molIndex++)
{
Molecule& mol = molecules[molIndex];
// See if it is identical to another molecule.
bool isNew = true;
for (int j = 0; j < (int)uniqueMolecules.size() && isNew; j++)
{
Molecule& mol2 = uniqueMolecules[j];
// Cheap size checks first; the detailed loops below are skipped once
// `identical` becomes false.
bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
// See if the atoms are identical: same index layout relative to the first
// atom, same velm .w value, and identical according to every force.
int atomOffset = mol2.atoms[0]-mol.atoms[0];
float4* velm = gpu->psVelm4->_pSysData;
for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
if (mol.atoms[i] != mol2.atoms[i]-atomOffset || velm[mol.atoms[i]].w != velm[mol2.atoms[i]].w)
identical = false;
for (int k = 0; k < (int) gpu->forces.size(); k++)
if (!gpu->forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
identical = false;
}
// See if the constraints are identical.
for (int i = 0; i < (int) mol.constraints.size() && identical; i++)
if (constraints[mol.constraints[i]].atom1 != constraints[mol2.constraints[i]].atom1-atomOffset ||
constraints[mol.constraints[i]].atom2 != constraints[mol2.constraints[i]].atom2-atomOffset ||
constraints[mol.constraints[i]].distance2 != constraints[mol2.constraints[i]].distance2)
identical = false;
// See if the force groups are identical.
for (int i = 0; i < (int) gpu->forces.size() && identical; i++)
{
if (mol.groups[i].size() != mol2.groups[i].size())
identical = false;
for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
if (!gpu->forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
identical = false;
}
if (identical)
{
// Record this molecule as another instance of unique molecule j.
moleculeInstances[j].push_back(mol.atoms[0]);
isNew = false;
}
}
if (isNew)
{
// No match found: this molecule starts a new group.
uniqueMolecules.push_back(mol);
moleculeInstances.push_back(vector<int>());
moleculeInstances[moleculeInstances.size()-1].push_back(mol.atoms[0]);
}
}
// Publish the groups: instance start atoms plus atom offsets relative to the
// first atom of the representative molecule.
gpu->moleculeGroups.resize(moleculeInstances.size());
for (int i = 0; i < (int)moleculeInstances.size(); i++)
{
gpu->moleculeGroups[i].instances = moleculeInstances[i];
vector<int>& atoms = uniqueMolecules[i].atoms;
gpu->moleculeGroups[i].atoms.resize(atoms.size());
for (int j = 0; j < (int)atoms.size(); j++)
gpu->moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
}
}
// Reorders the atoms so that molecules that are close in space are also close
// in the atom arrays. Within each group of identical molecules, the molecules
// are binned by the position of their center, sorted by bin, and written back
// into the slots the group already occupies. Positions, velocities, the
// original atom index stream and the periodic cell offsets are permuted
// together. Does nothing when there are no atoms or no nonbonded cutoff.
extern "C"
void gpuReorderAtoms(gpuContext gpu)
{
if (gpu->natoms == 0 || gpu->sim.nonbondedCutoffSqr == 0.0)
return;
// The molecule groups are computed once, on first use.
if (gpu->moleculeGroups.size() == 0)
findMoleculeGroups(gpu);
// Find the range of positions and the number of bins along each axis.
int numAtoms = gpu->natoms;
gpu->psPosq4->Download();
gpu->psVelm4->Download();
float4* posq = gpu->psPosq4->_pSysData;
float4* velm = gpu->psVelm4->_pSysData;
float minx = posq[0].x, maxx = posq[0].x;
float miny = posq[0].y, maxy = posq[0].y;
float minz = posq[0].z, maxz = posq[0].z;
if (gpu->sim.nonbondedMethod == PERIODIC || gpu->sim.nonbondedMethod == EWALD || gpu->sim.nonbondedMethod == PARTICLE_MESH_EWALD)
{
// With periodic boundaries the range is the periodic box itself.
minx = miny = minz = 0.0;
maxx = gpu->sim.periodicBoxSizeX;
maxy = gpu->sim.periodicBoxSizeY;
maxz = gpu->sim.periodicBoxSizeZ;
}
else
{
// Otherwise use the bounding box of the atom positions.
for (int i = 1; i < numAtoms; i++)
{
minx = min(minx, posq[i].x);
maxx = max(maxx, posq[i].x);
miny = min(miny, posq[i].y);
maxy = max(maxy, posq[i].y);
minz = min(minz, posq[i].z);
maxz = max(maxz, posq[i].z);
}
}
// Loop over each group of identical molecules and reorder them.
// The new* arrays are indexed by the atom's new position.
vector<int> originalIndex(numAtoms);
vector<float4> newPosq(numAtoms);
vector<float4> newVelm(numAtoms);
vector<int3> newCellOffsets(numAtoms);
for (int group = 0; group < (int)gpu->moleculeGroups.size(); group++)
{
// Find the center of each molecule (unweighted mean of its atom positions).
gpuMoleculeGroup& mol = gpu->moleculeGroups[group];
int numMolecules = mol.instances.size();
vector<int>& atoms = mol.atoms;
vector<float3> molPos(numMolecules);
for (int i = 0; i < numMolecules; i++)
{
molPos[i].x = 0.0f;
molPos[i].y = 0.0f;
molPos[i].z = 0.0f;
for (int j = 0; j < (int)atoms.size(); j++)
{
int atom = atoms[j]+mol.instances[i];
molPos[i].x += posq[atom].x;
molPos[i].y += posq[atom].y;
molPos[i].z += posq[atom].z;
}
molPos[i].x /= atoms.size();
molPos[i].y /= atoms.size();
molPos[i].z /= atoms.size();
}
if (gpu->sim.nonbondedMethod == PERIODIC || gpu->sim.nonbondedMethod == EWALD || gpu->sim.nonbondedMethod == PARTICLE_MESH_EWALD)
{
// Move each molecule position into the same box. The whole molecule is
// translated by an integer number of box lengths, and the translation is
// recorded in posCellOffsets.
for (int i = 0; i < numMolecules; i++)
{
int xcell = (int) floor(molPos[i].x/gpu->sim.periodicBoxSizeX);
int ycell = (int) floor(molPos[i].y/gpu->sim.periodicBoxSizeY);
int zcell = (int) floor(molPos[i].z/gpu->sim.periodicBoxSizeZ);
float dx = xcell*gpu->sim.periodicBoxSizeX;
float dy = ycell*gpu->sim.periodicBoxSizeY;
float dz = zcell*gpu->sim.periodicBoxSizeZ;
if (dx != 0.0f || dy != 0.0f || dz != 0.0f)
{
molPos[i].x -= dx;
molPos[i].y -= dy;
molPos[i].z -= dz;
for (int j = 0; j < (int)atoms.size(); j++)
{
int atom = atoms[j]+mol.instances[i];
posq[atom].x -= dx;
posq[atom].y -= dy;
posq[atom].z -= dz;
gpu->posCellOffsets[atom].x -= xcell;
gpu->posCellOffsets[atom].y -= ycell;
gpu->posCellOffsets[atom].z -= zcell;
}
}
}
}
// Select a bin for each molecule, then sort them by bin.
bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
float binWidth;
// Hilbert ordering divides the largest extent by 255, matching the 8 bits
// per axis passed to hilbert_c2i below; the zigzag ordering uses bins that
// are 0.2 times the cutoff distance wide.
if (useHilbert)
binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
else
binWidth = (float)(0.2*sqrt(gpu->sim.nonbondedCutoffSqr));
int xbins = 1 + (int) ((maxx-minx)/binWidth);
int ybins = 1 + (int) ((maxy-miny)/binWidth);
// Each entry is (bin, molecule index within the group).
vector<pair<int, int> > molBins(numMolecules);
bitmask_t coords[3];
for (int i = 0; i < numMolecules; i++)
{
int x = (int) ((molPos[i].x-minx)/binWidth);
int y = (int) ((molPos[i].y-miny)/binWidth);
int z = (int) ((molPos[i].z-minz)/binWidth);
int bin;
if (useHilbert)
{
coords[0] = x;
coords[1] = y;
coords[2] = z;
bin = (int) hilbert_c2i(3, 8, coords);
}
else
{
// Zigzag ordering: the y direction is reversed on odd z layers and the
// x direction is reversed on odd y rows.
int yodd = y&1;
int zodd = z&1;
bin = z*xbins*ybins;
bin += (zodd ? ybins-y : y)*xbins;
bin += (yodd ? xbins-x : x);
}
molBins[i] = pair<int, int>(bin, i);
}
// Pairs sort by bin first, then by molecule index.
sort(molBins.begin(), molBins.end());
// Reorder the atoms: the i-th slot of the group receives the molecule with
// the i-th smallest bin.
for (int i = 0; i < numMolecules; i++)
{
for (int j = 0; j < (int)atoms.size(); j++)
{
int oldIndex = mol.instances[molBins[i].second]+atoms[j];
int newIndex = mol.instances[i]+atoms[j];
originalIndex[newIndex] = (*gpu->psAtomIndex)[oldIndex];
newPosq[newIndex] = posq[oldIndex];
newVelm[newIndex] = velm[oldIndex];
newCellOffsets[newIndex] = gpu->posCellOffsets[oldIndex];
}
}
}
// Update the streams.
for (int i = 0; i < numAtoms; i++) {
posq[i] = newPosq[i];
velm[i] = newVelm[i];
(*gpu->psAtomIndex)[i] = originalIndex[i];
gpu->posCellOffsets[i] = newCellOffsets[i];
}
// Push the permuted data back to the device.
gpu->psPosq4->Upload();
gpu->psVelm4->Upload();
gpu->psAtomIndex->Upload();
}
#ifndef __GPUTYPES_H__
#define __GPUTYPES_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "cudatypes.h"
#include "cudaCompact.h"
#include <vector>
#include "windowsExportCuda.h"
namespace OpenMM {
class CudaForceInfo;
}
// One entry of the atom type table (see _gpuContext::gpAtomTable).
// NOTE(review): `r` is presumably a radius — confirm where the table is filled.
struct gpuAtomType {
std::string name;
char symbol;
float r;
};
// A group of identical molecules. The atom with index atoms[j]+instances[i]
// is the j-th atom of the i-th molecule in the group.
struct gpuMoleculeGroup {
// Atom indices of one molecule, relative to that molecule's first atom.
std::vector<int> atoms;
// Index of the first atom of each molecule in the group.
std::vector<int> instances;
};
// Describes one tabulated function: its name, the range [min, max] it is
// defined over, and the stream holding its coefficients. A default-constructed
// entry is empty: coefficients is NULL and the range is zero-initialized so
// that no member is ever read uninitialized.
struct gpuTabulatedFunction {
    gpuTabulatedFunction() : min(0.0), max(0.0), coefficients(NULL) {
    }
    std::string name;
    double min, max;
    CUDAStream<float4>* coefficients;
};
// GPU architecture levels distinguished by the code (stored in
// _gpuContext::sm_version).
// NOTE(review): presumably CUDA compute capability 1.0, 1.1, 1.2 and 2.0 —
// confirm where sm_version is assigned.
enum SM_VERSION
{
SM_10,
SM_11,
SM_12,
SM_20
};
/* Host-side state of one GPU simulation. A pointer to this structure
 * (gpuContext) is the first argument of the gpu* functions declared below.
 * Members named psXxx are CUDAStream objects owned by the context; gpu->sim
 * holds the simulation constants together with raw device pointers into
 * those streams. */
struct _gpuContext {
//Cache this here so that it doesn't
//have to be repeatedly passed around
int natoms;
int device;
bool useBlockingSync;
gpuAtomType* gpAtomTable;
int gAtomTypes;
unsigned int blocksPerSM;
unsigned int sharedMemoryPerBlock;
// Simulation constants and device pointers (e.g. sim.pForce4 mirrors psForce4).
cudaGmxSimulation sim;
// Per-atom counters consulted when sizing the force output buffers.
unsigned int* pOutputBufferCounter;
// Force descriptions queried when identifying molecules.
std::vector<OpenMM::CudaForceInfo*> forces;
// For each atom, the atoms it is excluded from interacting with.
std::vector<std::vector<int> > exclusions;
unsigned char* pAtomSymbol;
// Groups of identical molecules; filled on first use by gpuReorderAtoms.
std::vector<gpuMoleculeGroup> moleculeGroups;
gpuTabulatedFunction tabulatedFunctions[MAX_TABULATED_FUNCTIONS];
// Per-atom periodic cell offsets, adjusted when molecules are wrapped into the box.
std::vector<int3> posCellOffsets;
int iterations;
float epsfac;
float solventDielectric;
float soluteDielectric;
// Number of atoms per atom block used when building work units.
int grid;
bool bCalculateCM;
bool bRemoveCM;
bool bRecalculateBornRadii;
// True when there is one nonbonded output buffer per warp, false when there is
// one per block of atoms (chosen in gpuBuildOutputBuffers).
bool bOutputBufferPerWarp;
bool bIncludeGBSA;
bool bIncludeGBVI;
bool tabulatedFunctionsChanged;
unsigned long seed;
SM_VERSION sm_version;
// Stream compaction plan; destroyed at shutdown when compactPlan.valid is set.
compactionPlan compactPlan;
// FFT plan; destroyed at shutdown only when psPmeGrid exists.
cufftHandle fftplan;
CUDAStream<float4>* psPosq4;
CUDAStream<float4>* psPosqP4;
CUDAStream<float4>* psOldPosq4;
CUDAStream<float4>* psVelm4;
CUDAStream<float4>* psForce4;
CUDAStream<float>* psEnergy; // Energy output buffer
CUDAStream<float2>* psSigEps2;
CUDAStream<float4>* psCustomParams; // Atom parameters for custom nonbonded force
CUDAStream<int4>* psCustomBondID; // Atom indices for custom bonds
CUDAStream<float4>* psCustomBondParams; // Parameters for custom bonds
CUDAStream<int4>* psCustomAngleID1; // Atom indices for custom angles
CUDAStream<int2>* psCustomAngleID2; // Atom indices for custom angles
CUDAStream<float4>* psCustomAngleParams; // Parameters for custom angles
CUDAStream<int4>* psCustomTorsionID1; // Atom indices for custom torsions
CUDAStream<int4>* psCustomTorsionID2; // Atom indices for custom torsions
CUDAStream<float4>* psCustomTorsionParams; // Parameters for custom torsions
CUDAStream<int>* psCustomExternalID; // Atom indices for custom external force
CUDAStream<float4>* psCustomExternalParams; // Parameters for custom external force
CUDAStream<float4>* psTabulatedFunctionParams; // The min, max, and spacing for each tabulated function
CUDAStream<float2>* psEwaldCosSinSum;
CUDAStream<float>* psTabulatedErfc; // Tabulated values for erfc()
CUDAStream<cufftComplex>* psPmeGrid; // Grid points for particle mesh Ewald
CUDAStream<float>* psPmeBsplineModuli[3];
CUDAStream<float4>* psPmeBsplineTheta;
CUDAStream<float4>* psPmeBsplineDtheta;
CUDAStream<int>* psPmeAtomRange; // The range of sorted atoms at each grid point
CUDAStream<int2>* psPmeAtomGridIndex; // The grid point each atom is at
CUDAStream<float2>* psObcData;
CUDAStream<float4>* psGBVIData;
CUDAStream<float>* psGBVISwitchDerivative;
CUDAStream<float>* psObcChain;
CUDAStream<float>* psBornForce;
CUDAStream<float>* psBornRadii;
CUDAStream<float>* psBornSum;
// Bonded interactions: atom/output-buffer index streams and their parameters.
CUDAStream<int4>* psBondID;
CUDAStream<float2>* psBondParameter;
CUDAStream<int4>* psBondAngleID1;
CUDAStream<int2>* psBondAngleID2;
CUDAStream<float2>* psBondAngleParameter;
CUDAStream<int4>* psDihedralID1;
CUDAStream<int4>* psDihedralID2;
CUDAStream<float4>* psDihedralParameter;
CUDAStream<int4>* psRbDihedralID1;
CUDAStream<int4>* psRbDihedralID2;
CUDAStream<float4>* psRbDihedralParameter1;
CUDAStream<float2>* psRbDihedralParameter2;
CUDAStream<int4>* psLJ14ID;
CUDAStream<float4>* psLJ14Parameter;
// SHAKE and SETTLE constraints.
CUDAStream<int4>* psShakeID;
CUDAStream<float4>* psShakeParameter;
CUDAStream<int4>* psSettleID;
CUDAStream<float2>* psSettleParameter;
// Exclusion masks and work units (built by gpuBuildThreadBlockWorkList and
// gpuBuildExclusionList).
CUDAStream<unsigned int>* psExclusion;
CUDAStream<unsigned int>* psExclusionIndex;
CUDAStream<unsigned int>* psWorkUnit;
CUDAStream<unsigned int>* psInteractingWorkUnit;
CUDAStream<unsigned int>* psInteractionFlag;
CUDAStream<size_t>* psInteractionCount;
CUDAStream<float2>* psStepSize; // The size of the previous and current time steps
CUDAStream<float>* psLangevinParameters;// Parameters used for Langevin integration
CUDAStream<float4>* psRandom4; // Pointer to sets of 4 random numbers for MD integration
CUDAStream<float2>* psRandom2; // Pointer to sets of 2 random numbers for MD integration
CUDAStream<uint4>* psRandomSeed; // Pointer to each random seed
CUDAStream<int>* psRandomPosition; // Pointer to random number positions
CUDAStream<float4>* psLinearMomentum; // Pointer to total linear momentum per CTA
CUDAStream<int>* psAtomIndex; // The original index of each atom
CUDAStream<float4>* psGridBoundingBox; // The size of each grid cell
CUDAStream<float4>* psGridCenter; // The center and radius for each grid cell
CUDAStream<int2>* psCcmaAtoms; // The atoms connected by each CCMA constraint
CUDAStream<float4>* psCcmaDistance; // The displacement vector (x, y, z) and constraint distance (w) for each CCMA constraint
CUDAStream<int>* psCcmaAtomConstraints; // The indices of constraints involving each atom
CUDAStream<int>* psCcmaNumAtomConstraints; // The number of constraints involving each atom
CUDAStream<float>* psCcmaDelta1; // Workspace for CCMA
CUDAStream<float>* psCcmaDelta2; // Workspace for CCMA
int* ccmaConvergedHostMarker; // Host memory used to communicate that CCMA has converged
cudaEvent_t ccmaEvent; // Used to optimize communication during CCMA
CUDAStream<float>* psCcmaReducedMass; // The reduced mass for each CCMA constraint
// NOTE(review): the three psRigidCluster* streams below are not released by
// gpuShutDown — confirm whether they are ever allocated.
CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
CUDAStream<unsigned int>* psConstraintMatrixColumn; // The column of each element in the constraint matrix.
CUDAStream<float>* psConstraintMatrixValue; // The value of each element in the constraint matrix.
};
// Handle type passed to the gpu* functions.
typedef struct _gpuContext *gpuContext;
// Function prototypes
extern "C"
bool OPENMMCUDA_EXPORT gpuIsAvailable();
extern "C"
void gpuSetBondParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<float>& length, const std::vector<float>& k);
extern "C"
void gpuSetBondAngleParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3,
const std::vector<float>& angle, const std::vector<float>& k);
extern "C"
void gpuSetDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
const std::vector<float>& k, const std::vector<float>& phase, const std::vector<int>& periodicity);
extern "C"
void gpuSetRbDihedralParameters(gpuContext gpu, const std::vector<int>& atom1, const std::vector<int>& atom2, const std::vector<int>& atom3, const std::vector<int>& atom4,
const std::vector<float>& c0, const std::vector<float>& c1, const std::vector<float>& c2, const std::vector<float>& c3, const std::vector<float>& c4, const std::vector<float>& c5);
extern "C"
void gpuSetLJ14Parameters(gpuContext gpu, float epsfac, float fudge, const std::vector<int>& atom1, const std::vector<int>& atom2,
const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& q1, const std::vector<float>& q2);
// Setters for the standard nonbonded interaction.

// Per-atom nonbonded data: atom indices, c6/c12/q values, a char "symbol" per
// atom, a list of excluded atom indices per atom, and the nonbonded method
// (CudaNonbondedMethod is declared elsewhere).
extern "C" void gpuSetCoulombParameters(
        gpuContext gpu,
        float epsfac,
        const std::vector<int>& atom,
        const std::vector<float>& c6,
        const std::vector<float>& c12,
        const std::vector<float>& q,
        const std::vector<char>& symbol,
        const std::vector<std::vector<int> >& exclusions,
        CudaNonbondedMethod method);

// Takes a cutoff distance and a solvent dielectric value.
extern "C" void gpuSetNonbondedCutoff(
        gpuContext gpu,
        float cutoffDistance,
        float solventDielectric);

// Registers a named table of double values for slot `index`, together with a
// min and max (presumably the range the table covers -- TODO confirm).
extern "C" void gpuSetTabulatedFunction(
        gpuContext gpu,
        int index,
        const std::string& name,
        const std::vector<double>& values,
        double min,
        double max);
extern "C"
void gpuSetCustomBondParameters(gpuContext gpu, const std::vector<int>& bondAtom1, const std::vector<int>& bondAtom2, const std::vector<std::vector<double> >& bondParams,
const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
extern "C"
void gpuSetCustomAngleParameters(gpuContext gpu, const std::vector<int>& angleAtom1, const std::vector<int>& angleAtom2, const std::vector<int>& angleAtom3, const std::vector<std::vector<double> >& angleParams,
const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
extern "C"
void gpuSetCustomTorsionParameters(gpuContext gpu, const std::vector<int>& torsionAtom1, const std::vector<int>& torsionAtom2, const std::vector<int>& torsionAtom3, const std::vector<int>& torsionAtom4, const std::vector<std::vector<double> >& torsionParams,
const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
extern "C"
void gpuSetCustomExternalParameters(gpuContext gpu, const std::vector<int>& atomIndex, const std::vector<std::vector<double> >& atomParams,
const std::string& energyExp, const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
extern "C"
void gpuSetCustomNonbondedParameters(gpuContext gpu, const std::vector<std::vector<double> >& parameters, const std::vector<std::vector<int> >& exclusions,
CudaNonbondedMethod method, float cutoffDistance, const std::string& energyExp,
const std::vector<std::string>& paramNames, const std::vector<std::string>& globalParamNames);
// Setters for periodic-system parameters.

// Ewald: an alpha value and one integer kmax per axis.
extern "C" void gpuSetEwaldParameters(
        gpuContext gpu,
        float alpha,
        int kmaxx,
        int kmaxy,
        int kmaxz);

// PME: an alpha value and one integer grid size per axis.
extern "C" void gpuSetPMEParameters(
        gpuContext gpu,
        float alpha,
        int gridSizeX,
        int gridSizeY,
        int gridSizeZ);

// Periodic box: one size per axis. Marked OPENMMCUDA_EXPORT.
extern "C" void OPENMMCUDA_EXPORT gpuSetPeriodicBoxSize(
        gpuContext gpu,
        float xsize,
        float ysize,
        float zsize);
// Setters for the OBC and GB/VI models (presumably implicit-solvent models;
// this section does not show how the values are used).

// OBC: inner and solvent dielectric values plus radius, scale and charge vectors.
extern "C" void gpuSetObcParameters(
        gpuContext gpu,
        float innerDielectric,
        float solventDielectric,
        const std::vector<float>& radius,
        const std::vector<float>& scale,
        const std::vector<float>& charge);

// GB/VI: inner and solvent dielectric values, atom indices, radius / gamma /
// scaled-radius vectors, an integer selecting the Born radius scaling method,
// and two quintic limit values.
extern "C" void gpuSetGBVIParameters(
        gpuContext gpu,
        float innerDielectric,
        float solventDielectric,
        const std::vector<int>& atom,
        const std::vector<float>& radius,
        const std::vector<float>& gammas,
        const std::vector<float>& scaledRadii,
        int bornRadiusScalingMethod,
        float quinticLowerLimitFactor,
        float quinticUpperBornRadiusLimit);
// Constraints: two atom-index vectors, the constraint distances, the inverse
// masses of the two atoms in each constraint, and a scalar tolerance.
extern "C" void gpuSetConstraintParameters(
        gpuContext gpu,
        const std::vector<int>& atom1,
        const std::vector<int>& atom2,
        const std::vector<float>& distance,
        const std::vector<float>& invMass1,
        const std::vector<float>& invMass2,
        float constraintTolerance);
// Buffer allocation, per-atom state upload and context creation.

// Returns an int (meaning of the value is not documented in this section).
extern "C" int gpuAllocateInitialBuffers(gpuContext gpu);

// Positions are passed as three separate coordinate vectors.
extern "C" void gpuSetPositions(
        gpuContext gpu,
        const std::vector<float>& x,
        const std::vector<float>& y,
        const std::vector<float>& z);

// Velocities are passed as three separate component vectors.
extern "C" void gpuSetVelocities(
        gpuContext gpu,
        const std::vector<float>& x,
        const std::vector<float>& y,
        const std::vector<float>& z);

// Takes a single vector of mass values.
extern "C" void gpuSetMass(
        gpuContext gpu,
        const std::vector<float>& mass);

// Marked OPENMMCUDA_EXPORT.
extern "C" void OPENMMCUDA_EXPORT gpuInitializeRandoms(gpuContext gpu);

// Takes the atom count, a device index (default 0) and a blocking-sync flag
// (default false). Note the return type is void*, not gpuContext; callers
// presumably cast the result -- TODO confirm.
extern "C" OPENMMCUDA_EXPORT void* gpuInit(
        int numAtoms,
        unsigned int device = 0,
        bool useBlockingSync = false);
// Setters for integrator and thermostat parameters. All arguments are scalar floats.

extern "C" void gpuSetLangevinIntegrationParameters(
        gpuContext gpu,
        float tau,
        float deltaT,
        float temperature,
        float errorTol);

extern "C" void gpuSetVerletIntegrationParameters(
        gpuContext gpu,
        float deltaT,
        float errorTol);

extern "C" void gpuSetBrownianIntegrationParameters(
        gpuContext gpu,
        float tau,
        float deltaT,
        float temperature);

extern "C" void gpuSetAndersenThermostatParameters(
        gpuContext gpu,
        float temperature,
        float collisionFrequency);
// Context teardown and build steps. Each takes only the context handle,
// except setExclusions. The int-returning functions do not document the
// meaning of their return value in this section.

extern "C" void gpuShutDown(gpuContext gpu);

extern "C" int gpuBuildOutputBuffers(gpuContext gpu);

extern "C" int gpuBuildThreadBlockWorkList(gpuContext gpu);

// Marked OPENMMCUDA_EXPORT.
extern "C" void OPENMMCUDA_EXPORT gpuBuildExclusionList(gpuContext gpu);

// Marked OPENMMCUDA_EXPORT.
extern "C" int OPENMMCUDA_EXPORT gpuSetConstants(gpuContext gpu);

extern "C" void gpuReorderAtoms(gpuContext gpu);

// Takes a list of excluded atom indices per entry (presumably one list per
// atom -- TODO confirm). Unlike the other functions here, the name has no
// "gpu" prefix. Marked OPENMMCUDA_EXPORT.
extern "C" void OPENMMCUDA_EXPORT setExclusions(
        gpuContext gpu,
        const std::vector<std::vector<int> >& exclusions);
#endif //__GPUTYPES_H__
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment