Commit abb8cb4b authored by Peter Eastman's avatar Peter Eastman
Browse files

Very early beginnings of new CUDA platform

parent 18501459
#---------------------------------------------------
# OpenMM CUDA Platform
#
# Creates OpenMM library, base name=OpenMMCUDA.
# Default libraries are shared & optimized. Variants
# are created for static (_static) and debug (_d).
#
# Windows:
# OpenMMCUDA[_d].dll
# OpenMMCUDA[_d].lib
# OpenMMCUDA_static[_d].lib
# Unix:
# libOpenMMCUDA[_d].so
# libOpenMMCUDA_static[_d].a
#----------------------------------------------------
IF (APPLE)
SET (CMAKE_OSX_DEPLOYMENT_TARGET "10.6")
SET (CMAKE_OSX_SYSROOT "/Developer/SDKs/MacOSX10.6.sdk")
ENDIF (APPLE)
set(OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases")
if(OPENMM_BUILD_CUDA_TESTS)
SUBDIRS (tests)
endif(OPENMM_BUILD_CUDA_TESTS)
# The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
SET(OPENMM_SOURCE_SUBDIRS .)
# Collect up information about the version of the OpenMM library we're building
# and make it available to the code so it can be built into the binaries.
SET(OPENMMCUDA_LIBRARY_NAME OpenMMCUDA)
SET(SHARED_TARGET ${OPENMMCUDA_LIBRARY_NAME})
SET(STATIC_TARGET ${OPENMMCUDA_LIBRARY_NAME}_static)
# Ensure that debug libraries have "_d" appended to their names.
# CMake gets this right on Windows automatically with this definition.
IF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
SET(CMAKE_DEBUG_POSTFIX "_d" CACHE INTERNAL "" FORCE)
ENDIF (${CMAKE_GENERATOR} MATCHES "Visual Studio")
# But on Unix or Cygwin we have to add the suffix manually
IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(SHARED_TARGET ${SHARED_TARGET}_d)
SET(STATIC_TARGET ${STATIC_TARGET}_d)
ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
# These are all the places to search for header files which are
# to be part of the API.
SET(API_INCLUDE_DIRS) # start empty
FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
# append
SET(API_INCLUDE_DIRS ${API_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include
${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include/internal)
ENDFOREACH(subdir)
# We'll need both *relative* path names, starting with their API_INCLUDE_DIRS,
# and absolute pathnames.
SET(API_REL_INCLUDE_FILES) # start these out empty
SET(API_ABS_INCLUDE_FILES)
FOREACH(dir ${API_INCLUDE_DIRS})
FILE(GLOB fullpaths ${dir}/*.h) # returns full pathnames
SET(API_ABS_INCLUDE_FILES ${API_ABS_INCLUDE_FILES} ${fullpaths})
FOREACH(pathname ${fullpaths})
GET_FILENAME_COMPONENT(filename ${pathname} NAME)
SET(API_REL_INCLUDE_FILES ${API_REL_INCLUDE_FILES} ${dir}/${filename})
ENDFOREACH(pathname)
ENDFOREACH(dir)
# collect up source files
SET(SOURCE_FILES) # empty
SET(SOURCE_INCLUDE_FILES)
FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
FILE(GLOB_RECURSE src_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.c)
FILE(GLOB incl_files ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/src/*.h)
SET(SOURCE_FILES ${SOURCE_FILES} ${src_files}) #append
SET(SOURCE_INCLUDE_FILES ${SOURCE_INCLUDE_FILES} ${incl_files})
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
ENDFOREACH(subdir)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
# Set variables needed for encoding kernel sources into a C++ class
SET(CUDA_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
SET(CUDA_SOURCE_CLASS CudaKernelSources)
SET(CUDA_KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/src/${CUDA_SOURCE_CLASS}.cpp)
SET(CUDA_KERNELS_H ${CMAKE_CURRENT_BINARY_DIR}/src/${CUDA_SOURCE_CLASS}.h)
SET(SOURCE_FILES ${SOURCE_FILES} ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H})
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/src)
SUBDIRS (sharedTarget)
FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
SET(CUDA_FILE_DECLARATIONS)
SET(CUDA_FILE_DEFINITIONS)
CONFIGURE_FILE(${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.cpp.in ${CUDA_KERNELS_CPP})
FOREACH(file ${CUDA_KERNELS})
# Load the file contents and process it.
FILE(STRINGS ${file} file_content NEWLINE_CONSUME)
# Replace all backslashes by double backslashes as they are being put in a C string.
# Be careful not to replace the backslash before a semicolon as that is the CMAKE
# internal escaping of a semicolon to prevent it from acting as a list seperator.
STRING(REGEX REPLACE "\\\\([^;])" "\\\\\\\\\\1" file_content "${file_content}")
# Escape double quotes as being put in a C string.
STRING(REPLACE "\"" "\\\"" file_content "${file_content}")
# Split in separate C strings for each line.
STRING(REPLACE "\n" "\\n\"\n\"" file_content "${file_content}")
# Determine a name for the variable that will contain this file's contents
FILE(RELATIVE_PATH filename ${CUDA_SOURCE_DIR}/kernels ${file})
STRING(LENGTH ${filename} filename_length)
MATH(EXPR filename_length ${filename_length}-3)
STRING(SUBSTRING ${filename} 0 ${filename_length} variable_name)
# Record the variable declaration and definition.
SET(CUDA_FILE_DECLARATIONS ${CUDA_FILE_DECLARATIONS}static\ const\ std::string\ ${variable_name};\n)
FILE(APPEND ${CUDA_KERNELS_CPP} const\ string\ ${CUDA_SOURCE_CLASS}::${variable_name}\ =\ \"${file_content}\"\;\n)
ENDFOREACH(file)
CONFIGURE_FILE(${CUDA_SOURCE_DIR}/${CUDA_SOURCE_CLASS}.h.in ${CUDA_KERNELS_H})
#ifndef OPENMM_CUDAKERNELFACTORY_H_
#define OPENMM_CUDAKERNELFACTORY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/KernelFactory.h"
namespace OpenMM {
/**
* This KernelFactory creates all kernels for CudaPlatform.
*/
class CudaKernelFactory : public KernelFactory {
public:
KernelImpl* createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const;
};
} // namespace OpenMM
#endif /*OPENMM_CUDAKERNELFACTORY_H_*/
#ifndef OPENMM_CUDAPLATFORM_H_
#define OPENMM_CUDAPLATFORM_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/Platform.h"
#include "openmm/System.h"
namespace OpenMM {
class CudaContext;
/**
* This Platform subclass uses CUDA implementations of the OpenMM kernels.
*/
class OPENMM_EXPORT CudaPlatform : public Platform {
public:
class PlatformData;
CudaPlatform();
const std::string& getName() const {
static const std::string name = "CUDA";
return name;
}
double getSpeed() const {
return 100;
}
bool supportsDoublePrecision() const;
const std::string& getPropertyValue(const Context& context, const std::string& property) const;
void setPropertyValue(Context& context, const std::string& property, const std::string& value) const;
void contextCreated(ContextImpl& context, const std::map<std::string, std::string>& properties) const;
void contextDestroyed(ContextImpl& context) const;
/**
* This is the name of the parameter for selecting which CUDA device or devices to use.
*/
static const std::string& CudaDeviceIndex() {
static const std::string key = "CudaDeviceIndex";
return key;
}
/**
* This is the name of the parameter for selecting whether CUDA should sync or spin loop while waiting for results.
*/
static const std::string& CudaUseBlockingSync() {
static const std::string key = "CudaUseBlockingSync";
return key;
}
/**
* This is the name of the parameter for selecting what numerical precision to use.
*/
static const std::string& CudaPrecision() {
static const std::string key = "CudaPrecision";
return key;
}
/**
* This is the name of the parameter for specifying the path to the CUDA compiler.
*/
static const std::string& CudaCompiler() {
static const std::string key = "CudaCompiler";
return key;
}
/**
* This is the name of the parameter for specifying the path to the directory for creating temporary files.
*/
static const std::string& CudaTempDirectory() {
static const std::string key = "CudaTempDirectory";
return key;
}
};
class OPENMM_EXPORT CudaPlatform::PlatformData {
public:
PlatformData(const System& system, const std::string& deviceIndexProperty, const std::string& blockingProperty, const std::string& precisionProperty,
const std::string& compilerProperty, const std::string& tempProperty);
~PlatformData();
void initializeContexts(const System& system);
void syncContexts();
std::vector<CudaContext*> contexts;
std::vector<double> contextEnergy;
bool removeCM;
int cmMotionFrequency;
int stepCount, computeForceCount;
double time;
std::map<std::string, std::string> propertyValues;
};
} // namespace OpenMM
#endif /*OPENMM_CUDAPLATFORM_H_*/
#
# Include CUDA related files.
#
INCLUDE(FindCUDA)
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
COMMAND ${CMAKE_COMMAND}
ARGS -D CUDA_SOURCE_DIR=${CUDA_SOURCE_DIR} -D CUDA_KERNELS_CPP=${CUDA_KERNELS_CPP} -D CUDA_KERNELS_H=${CUDA_KERNELS_H} -D CUDA_SOURCE_CLASS=${CUDA_SOURCE_CLASS} -P ${CMAKE_CURRENT_SOURCE_DIR}/../EncodeCUDAFiles.cmake
DEPENDS ${CUDA_KERNELS}
)
SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE)
ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME}_d)
ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${CUDA_LIBRARIES} ${PTHREADS_LIB})
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_BUILDING_SHARED_LIBRARY")
INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include <iostream>
#include <sstream>
#include <vector>
using namespace OpenMM;
CudaArray::CudaArray(int size, int elementSize, const std::string& name) :
size(size), elementSize(elementSize), name(name), ownsMemory(true) {
CUresult result = cuMemAlloc(&pointer, size*elementSize);
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error creating array "<<name<<": "<<result;
throw OpenMMException(str.str());
}
}
CudaArray::~CudaArray() {
if (ownsMemory) {
CUresult result = cuMemFree(pointer);
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error deleting array "<<name<<": "<<result;
throw OpenMMException(str.str());
}
}
}
void CudaArray::upload(void* data, bool blocking) {
CUresult result;
if (blocking)
result = cuMemcpyHtoD(pointer, data, size*elementSize);
else
result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, 0);
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error uploading array "<<name<<": "<<result;
throw OpenMMException(str.str());
}
}
void CudaArray::download(void* data, bool blocking) const {
CUresult result;
if (blocking)
result = cuMemcpyDtoH(data, pointer, size*elementSize);
else
result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, 0);
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error downloading array "<<name<<": "<<result;
throw OpenMMException(str.str());
}
}
#ifndef OPENMM_CUDAARRAY_H_
#define OPENMM_CUDAARRAY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/OpenMMException.h"
#include <cuda.h>
#include <iostream>
#include <sstream>
#include <vector>
namespace OpenMM {
/**
* This class encapsulates a block of CUDA device memory. It provides a simplified API
* for working with it and for copying data to and from device memory.
*/
class CudaArray {
public:
/**
* Create a CudaArray object. The object is allocated on the heap with the "new" operator.
* The template argument is the data type of each array element.
*
* @param size the number of elements in the array
* @param name the name of the array
*/
template <class T>
static CudaArray* create(int size, const std::string& name) {
return new CudaArray(size, sizeof(T), name);
}
/**
* Create a CudaArray object.
*
* @param size the number of elements in the array
* @param elementSize the size of each element in bytes
* @param name the name of the array
*/
CudaArray(int size, int elementSize, const std::string& name);
~CudaArray();
/**
* Get the number of elements in the array.
*/
int getSize() const {
return size;
}
/**
* Get the size of each element in bytes.
*/
int getElementSize() const {
return elementSize;
}
/**
* Get the name of the array.
*/
const std::string& getName() const {
return name;
}
/**
* Get a pointer to the device memory.
*/
CUdeviceptr getDevicePointer() {
return pointer;
}
/**
* Copy the values in a vector to the device memory.
*/
template <class T>
void upload(std::vector<T>& data) {
if (sizeof(T) != elementSize || data.size() != size)
throw OpenMMException("Error uploading array "+name+": The specified vector does not match the size of the array");
upload(&data[0], true);
}
/**
* Copy the values in the Buffer to a vector.
*/
template <class T>
void download(std::vector<T>& data) const {
if (sizeof(T) != elementSize)
throw OpenMMException("Error downloading array "+name+": The specified vector has the wrong element size");
if (data.size() != size)
data.resize(size);
download(&data[0], true);
}
/**
* Copy the values in an array to the device memory.
*
* @param data the data to copy
* @param blocking if true, this call will block until the transfer is complete. If false,
* the source array must be in page-locked memory.
*/
void upload(void* data, bool blocking = true);
/**
* Copy the values in the device memory to an array.
*
* @param data the array to copy the memory to
* @param blocking if true, this call will block until the transfer is complete. If false,
* the destination array must be in page-locked memory.
*/
void download(void* data, bool blocking = true) const;
private:
CUdeviceptr pointer;
int size, elementSize;
bool ownsMemory;
std::string name;
};
} // namespace OpenMM
#endif /*OPENMM_CUDAARRAY_H_*/
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#ifdef WIN32
#define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include <cmath>
#include "CudaContext.h"
#include "CudaArray.h"
//#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaForceInfo.h"
//#include "CudaIntegrationUtilities.h"
#include "CudaKernelSources.h"
//#include "CudaNonbondedUtilities.h"
#include "hilbert.h"
#include "openmm/OpenMMException.h"
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <typeinfo>
#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
#define CHECK_RESULT2(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<prefix<<": "<<result<<" ("<<__FILE__<<": "<<__LINE__<<")"; \
throw OpenMMException(m.str());\
}
using namespace OpenMM;
using namespace std;
const int CudaContext::ThreadBlockSize = 64;
const int CudaContext::TileSize = 32;
bool CudaContext::hasInitializedCuda = false;
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), posq(NULL),
velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
if (!hasInitializedCuda) {
CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
hasInitializedCuda = true;
}
if (precision == "single") {
useDoublePrecision = false;
accumulateInDouble = false;
}
else if (precision == "mixed") {
useDoublePrecision = false;
accumulateInDouble = true;
}
else if (precision == "double") {
useDoublePrecision = true;
accumulateInDouble = true;
}
else
throw OpenMMException("Illegal value for CudaPrecision: "+precision);
#ifdef WIN32
this->tempDir = tempDir+"\";
#else
this->tempDir = tempDir+"/";
#endif
contextIndex = platformData.contexts.size();
int numDevices;
string errorMessage = "Error initializing Context";
CHECK_RESULT(cuDeviceGetCount(&numDevices));
if (deviceIndex < 0 || deviceIndex >= numDevices) {
// Try to figure out which device is the fastest.
int bestSpeed = -1;
int bestCompute = -1;
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&device, i));
int major, minor, clock, multiprocessors;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
if (major == 1 && minor < 2)
continue; // 1.0 and 1.1 are not supported
CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
int speed = clock*multiprocessors;
if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) {
deviceIndex = i;
bestSpeed = speed;
bestCompute = major;
}
}
}
if (deviceIndex == -1)
throw OpenMMException("No compatible CUDA device is available");
CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
this->deviceIndex = deviceIndex;
int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
gpuArchitecture = CudaExpressionUtilities::intToString(major)+CudaExpressionUtilities::intToString(minor);
compilationDefines["WORK_GROUP_SIZE"] = CudaExpressionUtilities::intToString(ThreadBlockSize);
defaultOptimizationOptions = "--use_fast_math";
int numThreadBlocksPerComputeUnit = 6;
CHECK_RESULT(cuCtxCreate(&context, 0, device));
contextIsValid = true;
numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
// bonded = new CudaBondedUtilities(*this);
// nonbonded = new CudaNonbondedUtilities(*this);
posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
// Create utility kernels that are used in multiple places.
CUmodule utilities = createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
cuModuleGetFunction(&clearBufferKernel, utilities, "clearBuffer");
cuModuleGetFunction(&clearTwoBuffersKernel, utilities, "clearTwoBuffers");
cuModuleGetFunction(&clearThreeBuffersKernel, utilities, "clearThreeBuffers");
cuModuleGetFunction(&clearFourBuffersKernel, utilities, "clearFourBuffers");
cuModuleGetFunction(&clearFiveBuffersKernel, utilities, "clearFiveBuffers");
cuModuleGetFunction(&clearSixBuffersKernel, utilities, "clearSixBuffers");
cuModuleGetFunction(&reduceFloat4Kernel, utilities, "reduceFloat4Buffer");
cuModuleGetFunction(&reduceForcesKernel, utilities, "reduceForces");
// Set defines based on the requested precision.
compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "sqrtf";
compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "rsqrtf";
compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
// Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread();
//
// // Create the integration utilities object.
//
// integration = new CudaIntegrationUtilities(*this, system);
}
CudaContext::~CudaContext() {
for (int i = 0; i < (int) forces.size(); i++)
delete forces[i];
for (int i = 0; i < (int) reorderListeners.size(); i++)
delete reorderListeners[i];
if (posq != NULL)
delete posq;
if (velm != NULL)
delete velm;
// if (force != NULL)
// delete force;
// if (forceBuffers != NULL)
// delete forceBuffers;
// if (longForceBuffer != NULL)
// delete longForceBuffer;
// if (energyBuffer != NULL)
// delete energyBuffer;
// if (atomIndex != NULL)
// delete atomIndex;
// if (integration != NULL)
// delete integration;
// if (bonded != NULL)
// delete bonded;
// if (nonbonded != NULL)
// delete nonbonded;
if (thread != NULL)
delete thread;
string errorMessage = "Error deleting Context";
if (contextIsValid)
CHECK_RESULT(cuCtxDestroy(context));
}
//void CudaContext::initialize() {
// for (int i = 0; i < numAtoms; i++) {
// double mass = system.getParticleMass(i);
// (*velm)[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
// }
// velm->upload();
// bonded->initialize(system);
// numForceBuffers = platformData.contexts.size();
// numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
// for (int i = 0; i < (int) forces.size(); i++)
// numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
// forceBuffers = new CudaArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);
// if (supports64BitGlobalAtomics) {
// longForceBuffer = new CudaArray<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer", false);
// reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
// reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
// reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
// reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
// addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
// }
// addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
// force = new CudaArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true);
// energyBuffer = new CudaArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer", true);
// addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
// atomIndex = new CudaArray<cl_int>(*this, paddedNumAtoms, "atomIndex", true);
// for (int i = 0; i < paddedNumAtoms; ++i)
// (*atomIndex)[i] = i;
// atomIndex->upload();
// findMoleculeGroups();
// moleculesInvalid = false;
// nonbonded->initialize(system);
//}
void CudaContext::addForce(CudaForceInfo* force) {
forces.push_back(force);
}
string CudaContext::replaceStrings(const string& input, const std::map<std::string, std::string>& replacements) const {
string result = input;
for (map<string, string>::const_iterator iter = replacements.begin(); iter != replacements.end(); iter++) {
int index = -1;
do {
index = result.find(iter->first);
if (index != result.npos)
result.replace(index, iter->first.size(), iter->second);
} while (index != result.npos);
}
return result;
}
CUmodule CudaContext::createModule(const string source, const char* optimizationFlags) {
return createModule(source, map<string, string>(), optimizationFlags);
}
CUmodule CudaContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
stringstream src;
if (!options.empty())
src << "// Compilation Options: " << options << endl << endl;
for (map<string, string>::const_iterator iter = compilationDefines.begin(); iter != compilationDefines.end(); ++iter) {
src << "#define " << iter->first;
if (!iter->second.empty())
src << " " << iter->second;
src << endl;
}
if (!compilationDefines.empty())
src << endl;
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
src << "#define " << iter->first;
if (!iter->second.empty())
src << " " << iter->second;
src << endl;
}
if (!defines.empty())
src << endl;
src << source << endl;
// Write out the source to a temporary file.
stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
string inputFile = (tempDir+tempFileName.str()+".cu");
string outputFile = (tempDir+tempFileName.str()+".ptx");
string logFile = (tempDir+tempFileName.str()+".log");
ofstream out(inputFile.c_str());
out << src.str();
out.close();
#ifdef WIN32
#else
string command = "\""+compiler+"\" --ptx -arch=compute_"+gpuArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
int res = std::system(command.c_str());
#endif
try {
if (res != 0) {
// Load the error log.
stringstream error;
error << "Error launching CUDA compiler: " << res;
ifstream log(logFile.c_str());
if (log.is_open()) {
string line;
while (!log.eof()) {
getline(log, line);
error << '\n' << line;
}
log.close();
}
throw OpenMMException(error.str());
}
CUmodule module;
CUresult result = cuModuleLoad(&module, outputFile.c_str());
if (result != CUDA_SUCCESS) {
std::stringstream m;
m<<"Error loading CUDA module: "<<result;
throw OpenMMException(m.str());
}
remove(inputFile.c_str());
remove(outputFile.c_str());
remove(logFile.c_str());
return module;
}
catch (...) {
remove(inputFile.c_str());
remove(outputFile.c_str());
remove(logFile.c_str());
throw;
}
//
// // Get length before using c_str() to avoid length() call invalidating the c_str() value.
// string src_string = src.str();
// ::size_t src_length = src_string.length();
// cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
// cl::Program program(context, sources);
// try {
// program.build(vector<cl::Device>(1, device), options.c_str());
// } catch (cl::Error err) {
// throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
// }
}
//
//void CudaContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
// if (blockSize == -1)
// blockSize = ThreadBlockSize;
// int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
// try {
// queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
// }
// catch (cl::Error err) {
// stringstream str;
// str<<"Error invoking kernel "<<kernel.getInfo<CL_KERNEL_FUNCTION_NAME>()<<": "<<err.what()<<" ("<<err.err()<<")";
// throw OpenMMException(str.str());
// }
//}
//
//void CudaContext::clearBuffer(CudaArray<float>& array) {
// clearBuffer(array.getDeviceBuffer(), array.getSize());
//}
//
//void CudaContext::clearBuffer(CudaArray<mm_float4>& array) {
// clearBuffer(array.getDeviceBuffer(), array.getSize()*4);
//}
//
//void CudaContext::clearBuffer(cl::Memory& memory, int size) {
// clearBufferKernel.setArg<cl::Memory>(0, memory);
// clearBufferKernel.setArg<cl_int>(1, size);
// executeKernel(clearBufferKernel, size, 128);
//}
//
//void CudaContext::addAutoclearBuffer(cl::Memory& memory, int size) {
// autoclearBuffers.push_back(&memory);
// autoclearBufferSizes.push_back(size);
//}
//
//void CudaContext::clearAutoclearBuffers() {
// int base = 0;
// int total = autoclearBufferSizes.size();
// while (total-base >= 6) {
// clearSixBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
// clearSixBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
// clearSixBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
// clearSixBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
// clearSixBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
// clearSixBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
// clearSixBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
// clearSixBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
// clearSixBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
// clearSixBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
// clearSixBuffersKernel.setArg<cl::Memory>(10, *autoclearBuffers[base+5]);
// clearSixBuffersKernel.setArg<cl_int>(11, autoclearBufferSizes[base+5]);
// executeKernel(clearSixBuffersKernel, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
// base += 6;
// }
// if (total-base == 5) {
// clearFiveBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
// clearFiveBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
// clearFiveBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
// clearFiveBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
// clearFiveBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
// clearFiveBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
// clearFiveBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
// clearFiveBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
// clearFiveBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
// clearFiveBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
// executeKernel(clearFiveBuffersKernel, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
// }
// else if (total-base == 4) {
// clearFourBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
// clearFourBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
// clearFourBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
// clearFourBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
// clearFourBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
// clearFourBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
// clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
// clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
// executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
// }
// else if (total-base == 3) {
// clearThreeBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
// clearThreeBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
// clearThreeBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
// clearThreeBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
// clearThreeBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
// clearThreeBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
// executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
// }
// else if (total-base == 2) {
// clearTwoBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
// clearTwoBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
// clearTwoBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
// clearTwoBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
// executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
// }
// else if (total-base == 1) {
// clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
// }
//}
//
//void CudaContext::reduceForces() {
// if (supports64BitGlobalAtomics)
// executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
// else
// reduceBuffer(*forceBuffers, numForceBuffers);
//}
//
//void CudaContext::reduceBuffer(CudaArray<mm_float4>& array, int numBuffers) {
// int bufferSize = array.getSize()/numBuffers;
// reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
// reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
// reduceFloat4Kernel.setArg<cl_int>(2, numBuffers);
// executeKernel(reduceFloat4Kernel, bufferSize, 128);
//}
//
//void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
// // Recursively tag atoms as belonging to a particular molecule.
//
// atomMolecule[atom] = molecule;
// for (int i = 0; i < (int) atomBonds[atom].size(); i++)
// if (atomMolecule[atomBonds[atom][i]] == -1)
// tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
//}
//
///**
// * This class ensures that atom reordering doesn't break virtual sites.
// */
//class CudaContext::VirtualSiteInfo : public CudaForceInfo {
//public:
// VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
// for (int i = 0; i < system.getNumParticles(); i++) {
// if (system.isVirtualSite(i)) {
// siteTypes.push_back(&typeid(system.getVirtualSite(i)));
// vector<int> particles;
// particles.push_back(i);
// for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
// particles.push_back(system.getVirtualSite(i).getParticle(j));
// siteParticles.push_back(particles);
// vector<double> weights;
// if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A two particle average.
//
// const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1));
// }
// else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A three particle average.
//
// const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1));
// weights.push_back(site.getWeight(2));
// }
// else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
// // An out of plane site.
//
// const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight12());
// weights.push_back(site.getWeight13());
// weights.push_back(site.getWeightCross());
// }
// siteWeights.push_back(weights);
// }
// }
// }
// int getNumParticleGroups() {
// return siteTypes.size();
// }
// void getParticlesInGroup(int index, std::vector<int>& particles) {
// particles = siteParticles[index];
// }
// bool areGroupsIdentical(int group1, int group2) {
// if (siteTypes[group1] != siteTypes[group2])
// return false;
// int numParticles = siteWeights[group1].size();
// if (siteWeights[group2].size() != numParticles)
// return false;
// for (int i = 0; i < numParticles; i++)
// if (siteWeights[group1][i] != siteWeights[group2][i])
// return false;
// return true;
// }
//private:
// vector<const type_info*> siteTypes;
// vector<vector<int> > siteParticles;
// vector<vector<double> > siteWeights;
//};
//
//
//void CudaContext::findMoleculeGroups() {
// // The first time this is called, we need to identify all the molecules in the system.
//
// if (moleculeGroups.size() == 0) {
// // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
//
// addForce(new VirtualSiteInfo(system));
//
// // First make a list of every other atom to which each atom is connect by a constraint or force group.
//
// vector<vector<int> > atomBonds(system.getNumParticles());
// for (int i = 0; i < system.getNumConstraints(); i++) {
// int particle1, particle2;
// double distance;
// system.getConstraintParameters(i, particle1, particle2, distance);
// atomBonds[particle1].push_back(particle2);
// atomBonds[particle2].push_back(particle1);
// }
// for (int i = 0; i < (int) forces.size(); i++) {
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// vector<int> particles;
// forces[i]->getParticlesInGroup(j, particles);
// for (int k = 0; k < (int) particles.size(); k++)
// for (int m = 0; m < (int) particles.size(); m++)
// if (k != m)
// atomBonds[particles[k]].push_back(particles[m]);
// }
// }
//
// // Now tag atoms by which molecule they belong to.
//
// vector<int> atomMolecule(numAtoms, -1);
// int numMolecules = 0;
// for (int i = 0; i < numAtoms; i++)
// if (atomMolecule[i] == -1)
// tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
// vector<vector<int> > atomIndices(numMolecules);
// for (int i = 0; i < numAtoms; i++)
// atomIndices[atomMolecule[i]].push_back(i);
//
// // Construct a description of each molecule.
//
// molecules.resize(numMolecules);
// for (int i = 0; i < numMolecules; i++) {
// molecules[i].atoms = atomIndices[i];
// molecules[i].groups.resize(forces.size());
// }
// for (int i = 0; i < system.getNumConstraints(); i++) {
// int particle1, particle2;
// double distance;
// system.getConstraintParameters(i, particle1, particle2, distance);
// molecules[atomMolecule[particle1]].constraints.push_back(i);
// }
// for (int i = 0; i < (int) forces.size(); i++)
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// vector<int> particles;
// forces[i]->getParticlesInGroup(j, particles);
// molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
// }
// }
//
// // Sort them into groups of identical molecules.
//
// vector<Molecule> uniqueMolecules;
// vector<vector<int> > moleculeInstances;
// vector<vector<int> > moleculeOffsets;
// for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
// Molecule& mol = molecules[molIndex];
//
// // See if it is identical to another molecule.
//
// bool isNew = true;
// for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
// Molecule& mol2 = uniqueMolecules[j];
// bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
//
// // See if the atoms are identical.
//
// int atomOffset = mol2.atoms[0]-mol.atoms[0];
// for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
// if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
// identical = false;
// for (int k = 0; k < (int) forces.size(); k++)
// if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
// identical = false;
// }
//
// // See if the constraints are identical.
//
// for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
// int c1particle1, c1particle2, c2particle1, c2particle2;
// double distance1, distance2;
// system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
// system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
// if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
// identical = false;
// }
//
// // See if the force groups are identical.
//
// for (int i = 0; i < (int) forces.size() && identical; i++) {
// if (mol.groups[i].size() != mol2.groups[i].size())
// identical = false;
// for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
// if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
// identical = false;
// }
// if (identical) {
// moleculeInstances[j].push_back(molIndex);
// moleculeOffsets[j].push_back(mol.atoms[0]);
// isNew = false;
// }
// }
// if (isNew) {
// uniqueMolecules.push_back(mol);
// moleculeInstances.push_back(vector<int>());
// moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
// moleculeOffsets.push_back(vector<int>());
// moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
// }
// }
// moleculeGroups.resize(moleculeInstances.size());
// for (int i = 0; i < (int) moleculeInstances.size(); i++)
// {
// moleculeGroups[i].instances = moleculeInstances[i];
// moleculeGroups[i].offsets = moleculeOffsets[i];
// vector<int>& atoms = uniqueMolecules[i].atoms;
// moleculeGroups[i].atoms.resize(atoms.size());
// for (int j = 0; j < (int) atoms.size(); j++)
// moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
// }
//}
//
//void CudaContext::invalidateMolecules() {
// moleculesInvalid = true;
//}
//
//
//void OpenCLContext::validateMolecules() {
// moleculesInvalid = false;
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
// return;
// bool valid = true;
// for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
// MoleculeGroup& mol = moleculeGroups[group];
// vector<int>& instances = mol.instances;
// vector<int>& offsets = mol.offsets;
// vector<int>& atoms = mol.atoms;
// int numMolecules = instances.size();
// Molecule& m1 = molecules[instances[0]];
// int offset1 = offsets[0];
// for (int j = 1; valid && j < numMolecules; j++) {
// // See if the atoms are identical.
//
// Molecule& m2 = molecules[instances[j]];
// int offset2 = offsets[j];
// for (int i = 0; i < (int) atoms.size() && valid; i++) {
// for (int k = 0; k < (int) forces.size(); k++)
// if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
// valid = false;
// }
//
// // See if the force groups are identical.
//
// for (int i = 0; i < (int) forces.size() && valid; i++) {
// for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
// if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
// valid = false;
// }
// }
// }
// if (valid)
// return;
//
// // The list of which molecules are identical is no longer valid. We need to restore the
// // atoms to their original order, rebuild the list of identical molecules, and sort them
// // again.
//
// vector<mm_float4> newPosq(numAtoms);
// vector<mm_float4> newVelm(numAtoms);
// vector<mm_int4> newCellOffsets(numAtoms);
// posq->download();
// velm->download();
// for (int i = 0; i < numAtoms; i++) {
// int index = atomIndex->get(i);
// newPosq[index] = posq->get(i);
// newVelm[index] = velm->get(i);
// newCellOffsets[index] = posCellOffsets[i];
// }
// for (int i = 0; i < numAtoms; i++) {
// posq->set(i, newPosq[i]);
// velm->set(i, newVelm[i]);
// atomIndex->set(i, i);
// posCellOffsets[i] = newCellOffsets[i];
// }
// posq->upload();
// velm->upload();
// atomIndex->upload();
// findMoleculeGroups();
// for (int i = 0; i < (int) reorderListeners.size(); i++)
// reorderListeners[i]->execute();
//}
//
//void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
// return;
// if (moleculesInvalid)
// validateMolecules();
// atomsWereReordered = true;
//
// // Find the range of positions and the number of bins along each axis.
//
// posq->download();
// velm->download();
// float minx = posq->get(0).x, maxx = posq->get(0).x;
// float miny = posq->get(0).y, maxy = posq->get(0).y;
// float minz = posq->get(0).z, maxz = posq->get(0).z;
// if (nonbonded->getUsePeriodic()) {
// minx = miny = minz = 0.0;
// maxx = periodicBoxSize.x;
// maxy = periodicBoxSize.y;
// maxz = periodicBoxSize.z;
// }
// else {
// for (int i = 1; i < numAtoms; i++) {
// const mm_float4& pos = posq->get(i);
// minx = min(minx, pos.x);
// maxx = max(maxx, pos.x);
// miny = min(miny, pos.y);
// maxy = max(maxy, pos.y);
// minz = min(minz, pos.z);
// maxz = max(maxz, pos.z);
// }
// }
//
// // Loop over each group of identical molecules and reorder them.
//
// vector<int> originalIndex(numAtoms);
// vector<mm_float4> newPosq(numAtoms);
// vector<mm_float4> newVelm(numAtoms);
// vector<mm_int4> newCellOffsets(numAtoms);
// for (int group = 0; group < (int) moleculeGroups.size(); group++) {
// // Find the center of each molecule.
//
// MoleculeGroup& mol = moleculeGroups[group];
// int numMolecules = mol.offsets.size();
// vector<int>& atoms = mol.atoms;
// vector<mm_float4> molPos(numMolecules);
// float invNumAtoms = 1.0f/atoms.size();
// for (int i = 0; i < numMolecules; i++) {
// molPos[i].x = 0.0f;
// molPos[i].y = 0.0f;
// molPos[i].z = 0.0f;
// for (int j = 0; j < (int)atoms.size(); j++) {
// int atom = atoms[j]+mol.offsets[i];
// const mm_float4& pos = posq->get(atom);
// molPos[i].x += pos.x;
// molPos[i].y += pos.y;
// molPos[i].z += pos.z;
// }
// molPos[i].x *= invNumAtoms;
// molPos[i].y *= invNumAtoms;
// molPos[i].z *= invNumAtoms;
// }
// if (nonbonded->getUsePeriodic()) {
// // Move each molecule position into the same box.
//
// for (int i = 0; i < numMolecules; i++) {
// int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
// int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y);
// int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z);
// float dx = xcell*periodicBoxSize.x;
// float dy = ycell*periodicBoxSize.y;
// float dz = zcell*periodicBoxSize.z;
// if (dx != 0.0f || dy != 0.0f || dz != 0.0f) {
// molPos[i].x -= dx;
// molPos[i].y -= dy;
// molPos[i].z -= dz;
// if (enforcePeriodic) {
// for (int j = 0; j < (int) atoms.size(); j++) {
// int atom = atoms[j]+mol.offsets[i];
// mm_float4 p = posq->get(atom);
// p.x -= dx;
// p.y -= dy;
// p.z -= dz;
// posq->set(atom, p);
// posCellOffsets[atom].x -= xcell;
// posCellOffsets[atom].y -= ycell;
// posCellOffsets[atom].z -= zcell;
// }
// }
// }
// }
// }
//
// // Select a bin for each molecule, then sort them by bin.
//
// bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
// float binWidth;
// if (useHilbert)
// binWidth = (float)(max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
// else
// binWidth = (float)(0.2*nonbonded->getCutoffDistance());
// float invBinWidth = 1.0f/binWidth;
// int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
// int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
// vector<pair<int, int> > molBins(numMolecules);
// bitmask_t coords[3];
// for (int i = 0; i < numMolecules; i++) {
// int x = (int) ((molPos[i].x-minx)*invBinWidth);
// int y = (int) ((molPos[i].y-miny)*invBinWidth);
// int z = (int) ((molPos[i].z-minz)*invBinWidth);
// int bin;
// if (useHilbert) {
// coords[0] = x;
// coords[1] = y;
// coords[2] = z;
// bin = (int) hilbert_c2i(3, 8, coords);
// }
// else {
// int yodd = y&1;
// int zodd = z&1;
// bin = z*xbins*ybins;
// bin += (zodd ? ybins-y : y)*xbins;
// bin += (yodd ? xbins-x : x);
// }
// molBins[i] = pair<int, int>(bin, i);
// }
// sort(molBins.begin(), molBins.end());
//
// // Reorder the atoms.
//
// for (int i = 0; i < numMolecules; i++) {
// for (int j = 0; j < (int)atoms.size(); j++) {
// int oldIndex = mol.offsets[molBins[i].second]+atoms[j];
// int newIndex = mol.offsets[i]+atoms[j];
// originalIndex[newIndex] = atomIndex->get(oldIndex);
// newPosq[newIndex] = posq->get(oldIndex);
// newVelm[newIndex] = velm->get(oldIndex);
// newCellOffsets[newIndex] = posCellOffsets[oldIndex];
// }
// }
// }
//
// // Update the streams.
//
// for (int i = 0; i < numAtoms; i++) {
// posq->set(i, newPosq[i]);
// velm->set(i, newVelm[i]);
// atomIndex->set(i, originalIndex[i]);
// posCellOffsets[i] = newCellOffsets[i];
// }
// posq->upload();
// velm->upload();
// atomIndex->upload();
// for (int i = 0; i < (int) reorderListeners.size(); i++)
// reorderListeners[i]->execute();
//}
struct CudaContext::WorkThread::ThreadData {
ThreadData(std::queue<CudaContext::WorkTask*>& tasks, bool& waiting, bool& finished,
pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :
tasks(tasks), waiting(waiting), finished(finished), queueLock(queueLock),
waitForTaskCondition(waitForTaskCondition), queueEmptyCondition(queueEmptyCondition) {
}
std::queue<CudaContext::WorkTask*>& tasks;
bool& waiting;
bool& finished;
pthread_mutex_t& queueLock;
pthread_cond_t& waitForTaskCondition;
pthread_cond_t& queueEmptyCondition;
};
static void* threadBody(void* args) {
CudaContext::WorkThread::ThreadData& data = *reinterpret_cast<CudaContext::WorkThread::ThreadData*>(args);
while (!data.finished || data.tasks.size() > 0) {
pthread_mutex_lock(&data.queueLock);
while (data.tasks.empty() && !data.finished) {
data.waiting = true;
pthread_cond_signal(&data.queueEmptyCondition);
pthread_cond_wait(&data.waitForTaskCondition, &data.queueLock);
}
CudaContext::WorkTask* task = NULL;
if (!data.tasks.empty()) {
data.waiting = false;
task = data.tasks.front();
data.tasks.pop();
}
pthread_mutex_unlock(&data.queueLock);
if (task != NULL) {
task->execute();
delete task;
}
}
data.waiting = true;
pthread_cond_signal(&data.queueEmptyCondition);
delete &data;
return 0;
}
CudaContext::WorkThread::WorkThread() : waiting(true), finished(false) {
pthread_mutex_init(&queueLock, NULL);
pthread_cond_init(&waitForTaskCondition, NULL);
pthread_cond_init(&queueEmptyCondition, NULL);
ThreadData* data = new ThreadData(tasks, waiting, finished, queueLock, waitForTaskCondition, queueEmptyCondition);
pthread_create(&thread, NULL, threadBody, data);
}
CudaContext::WorkThread::~WorkThread() {
pthread_mutex_lock(&queueLock);
finished = true;
pthread_cond_broadcast(&waitForTaskCondition);
pthread_mutex_unlock(&queueLock);
pthread_join(thread, NULL);
pthread_mutex_destroy(&queueLock);
pthread_cond_destroy(&waitForTaskCondition);
pthread_cond_destroy(&queueEmptyCondition);
}
void CudaContext::WorkThread::addTask(CudaContext::WorkTask* task) {
pthread_mutex_lock(&queueLock);
tasks.push(task);
waiting = false;
pthread_cond_signal(&waitForTaskCondition);
pthread_mutex_unlock(&queueLock);
}
bool CudaContext::WorkThread::isWaiting() {
return waiting;
}
bool CudaContext::WorkThread::isFinished() {
return finished;
}
void CudaContext::WorkThread::flush() {
pthread_mutex_lock(&queueLock);
while (!waiting)
pthread_cond_wait(&queueEmptyCondition, &queueLock);
pthread_mutex_unlock(&queueLock);
}
#ifndef OPENMM_CUDACONTEXT_H_
#define OPENMM_CUDACONTEXT_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <map>
#include <queue>
#include <string>
#include <pthread.h>
#define __CL_ENABLE_EXCEPTIONS
#ifdef _MSC_VER
// Prevent Windows from defining macros that interfere with other code.
#define NOMINMAX
#endif
#include <cuda.h>
#include <builtin_types.h>
#include <vector_functions.h>
#include "openmm/internal/windowsExport.h"
#include "CudaPlatform.h"
namespace OpenMM {
class CudaArray;
class CudaForceInfo;
class CudaIntegrationUtilities;
class CudaBondedUtilities;
class CudaNonbondedUtilities;
class System;
/**
* This class contains the information associated with a Context by the CUDA Platform. Each CudaContext is
* specific to a particular device, and manages data structures and kernels for that device. When running a simulation
* in parallel on multiple devices, there is a separate CudaContext for each one. The list of all contexts is
* stored in the CudaPlatform::PlatformData.
* <p>
* In addition, a worker thread is created for each CudaContext. This is used for parallel computations, so that
* blocking calls to one device will not block other devices. When only a single device is being used, the worker
* thread is not used and calculations are performed on the main application thread.
*/
class OPENMM_EXPORT CudaContext {
public:
class WorkTask;
class WorkThread;
class ReorderListener;
static const int ThreadBlockSize;
static const int TileSize;
CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData);
~CudaContext();
// /**
// * This is called to initialize internal data structures after all Forces in the system
// * have been initialized.
// */
// void initialize();
/**
* Add a CudaForce to this context.
*/
void addForce(CudaForceInfo* force);
/**
* Get the CUcontext associated with this object.
*/
CUcontext getContext() {
return context;
}
/**
* Get the CUdevice associated with this object.
*/
CUdevice getDevice() {
return device;
}
/**
* Get the index of the CUdevice associated with this object.
*/
int getDeviceIndex() {
return deviceIndex;
}
/**
* Get the PlatformData object this context is part of.
*/
CudaPlatform::PlatformData& getPlatformData() {
return platformData;
}
/**
* Get the index of this context in the list stored in the PlatformData.
*/
int getContextIndex() const {
return contextIndex;
}
/**
* Get the array which contains the position (the xyz components) and charge (the w component) of each atom.
*/
CudaArray& getPosq() {
return *posq;
}
/**
* Get the array which contains the velocity (the xyz components) and inverse mass (the w component) of each atom.
*/
CudaArray& getVelm() {
return *velm;
}
// /**
// * Get the array which contains the force on each atom.
// */
// CudaArray<mm_float4>& getForce() {
// return *force;
// }
// /**
// * Get the array which contains the buffers in which forces are computed.
// */
// CudaArray<mm_float4>& getForceBuffers() {
// return *forceBuffers;
// }
// /**
// * Get the array which contains a contribution to each force represented as 64 bit fixed point.
// */
// CudaArray<cl_long>& getLongForceBuffer() {
// return *longForceBuffer;
// }
// /**
// * Get the array which contains the buffer in which energy is computed.
// */
// CudaArray<cl_float>& getEnergyBuffer() {
// return *energyBuffer;
// }
// /**
// * Get the array which contains the index of each atom.
// */
// CudaArray<cl_int>& getAtomIndex() {
// return *atomIndex;
// }
// /**
// * Get the number of cells by which the positions are offset.
// */
// std::vector<mm_int4>& getPosCellOffsets() {
// return posCellOffsets;
// }
/**
* Replace all occurrences of a list of substrings.
*
* @param input a string to process
* @param replacements a set of strings that should be replaced with new strings wherever they appear in the input string
* @return a new string produced by performing the replacements
*/
std::string replaceStrings(const std::string& input, const std::map<std::string, std::string>& replacements) const;
/**
* Create a CUDA module from source code.
*
* @param source the source code of the module
* @param optimizationFlags the optimization flags to pass to the CUDA compiler. If this is
* omitted, a default set of options will be used
*/
CUmodule createModule(const std::string source, const char* optimizationFlags = NULL);
/**
* Create a CUDA module from source code.
*
* @param source the source code of the module
* @param defines a set of preprocessor definitions (name, value) to define when compiling the program
* @param optimizationFlags the optimization flags to pass to the CUDA compiler. If this is
* omitted, a default set of options will be used
*/
CUmodule createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL);
// /**
// * Execute a kernel.
// *
// * @param kernel the kernel to execute
// * @param workUnits the maximum number of work units that should be used
// * @param blockSize the size of each thread block to use
// */
// void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<float>& array);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<mm_float4>& array);
// /**
// * Set all elements of an array to 0.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void clearBuffer(cl::Memory& memory, int size);
// /**
// * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void addAutoclearBuffer(cl::Memory& memory, int size);
// /**
// * Clear all buffers that have been registered with addAutoclearBuffer().
// */
// void clearAutoclearBuffers();
// /**
// * Given a collection of buffers packed into an array, sum them and store
// * the sum in the first buffer.
// *
// * @param array the array containing the buffers to reduce
// * @param numBuffers the number of buffers packed into the array
// */
// void reduceBuffer(CudaArray<mm_float4>& array, int numBuffers);
// /**
// * Sum the buffesr containing forces.
// */
// void reduceForces();
// /**
// * Get the current simulation time.
// */
// double getTime() {
// return time;
// }
// /**
// * Set the current simulation time.
// */
// void setTime(double t) {
// time = t;
// }
// /**
// * Get the number of integration steps that have been taken.
// */
// int getStepCount() {
// return stepCount;
// }
// /**
// * Set the number of integration steps that have been taken.
// */
// void setStepCount(int steps) {
// stepCount = steps;
// }
// /**
// * Get the number of times forces or energy has been computed.
// */
// int getComputeForceCount() {
// return computeForceCount;
// }
// /**
// * Set the number of times forces or energy has been computed.
// */
// void setComputeForceCount(int count) {
// computeForceCount = count;
// }
// /**
// * Get the number of atoms.
// */
// int getNumAtoms() const {
// return numAtoms;
// }
// /**
// * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
// * most arrays with one element per atom.
// */
// int getPaddedNumAtoms() const {
// return paddedNumAtoms;
// }
// /**
// * Get the number of blocks of TileSize atoms.
// */
// int getNumAtomBlocks() const {
// return numAtomBlocks;
// }
// /**
// * Get the standard number of thread blocks to use when executing kernels.
// */
// int getNumThreadBlocks() const {
// return numThreadBlocks;
// }
// /**
// * Get the number of force buffers.
// */
// int getNumForceBuffers() const {
// return numForceBuffers;
// }
// /**
// * Get the SIMD width of the device being used.
// */
// int getSIMDWidth() const {
// return simdWidth;
// }
// /**
// * Get whether the device being used supports 64 bit atomic operations on global memory.
// */
// bool getSupports64BitGlobalAtomics() {
// return supports64BitGlobalAtomics;
// }
// /**
// * Get whether the device being used supports double precision math.
// */
// bool getSupportsDoublePrecision() {
// return supportsDoublePrecision;
// }
// /**
// * Get the size of the periodic box.
// */
// mm_float4 getPeriodicBoxSize() const {
// return periodicBoxSize;
// }
// /**
// * Set the size of the periodic box.
// */
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// }
// /**
// * Get the inverse of the size of the periodic box.
// */
// mm_float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize;
// }
// /**
// * Get the CudaIntegrationUtilities for this context.
// */
// CudaIntegrationUtilities& getIntegrationUtilities() {
// return *integration;
// }
// /**
// * Get the CudaBondedUtilities for this context.
// */
// CudaBondedUtilities& getBondedUtilities() {
// return *bonded;
// }
// /**
// * Get the CudaNonbondedUtilities for this context.
// */
// CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded;
// }
// /**
// * Get the thread used by this context for executing parallel computations.
// */
// WorkThread& getWorkThread() {
// return *thread;
// }
// /**
// * Get whether atoms were reordered during the most recent force/energy computation.
// */
// bool getAtomsWereReordered() const {
// return atomsWereReordered;
// }
// /**
// * Set whether atoms were reordered during the most recent force/energy computation.
// */
// void setAtomsWereReordered(bool wereReordered) {
// atomsWereReordered = wereReordered;
// }
// /**
// * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
// * together in the arrays.
// *
// * @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
// */
// void reorderAtoms(bool enforcePeriodic);
// /**
// * Add a listener that should be called whenever atoms get reordered. The CudaContext
// * assumes ownership of the object, and deletes it when the context itself is deleted.
// */
// void addReorderListener(ReorderListener* listener);
// /**
// * Get the list of ReorderListeners.
// */
// std::vector<ReorderListener*>& getReorderListeners() {
// return reorderListeners;
// }
// /**
// * Mark that the current molecule definitions (and hence the atom order) may be invalid.
// * This should be called whenever force field parameters change. It will cause the definitions
// * and order to be revalidated the next to reorderAtoms() is called.
// */
// void invalidateMolecules();
// /**
// * Get whether the current molecule definitions are valid.
// */
// bool getMoleculesAreInvalid() {
// return moleculesInvalid;
// }
private:
struct Molecule;
struct MoleculeGroup;
class VirtualSiteInfo;
// void findMoleculeGroups();
// static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
// /**
// * Ensure that all molecules marked as "identical" really are identical. This should be
// * called whenever force field parameters change. If necessary, it will rebuild the list
// * of molecules and resort the atoms.
// */
// void validateMolecules();
static bool hasInitializedCuda;
const System& system;
double time;
CudaPlatform::PlatformData& platformData;
int deviceIndex;
int contextIndex;
int stepCount;
int computeForceCount;
int numAtoms;
int paddedNumAtoms;
int numAtomBlocks;
int numThreadBlocks;
// int numForceBuffers;
// int simdWidth;
bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
std::string compiler, tempDir, gpuArchitecture;
float4 periodicBoxSize;
float4 invPeriodicBoxSize;
std::string defaultOptimizationOptions;
std::map<std::string, std::string> compilationDefines;
CUcontext context;
CUdevice device;
CUfunction clearBufferKernel;
CUfunction clearTwoBuffersKernel;
CUfunction clearThreeBuffersKernel;
CUfunction clearFourBuffersKernel;
CUfunction clearFiveBuffersKernel;
CUfunction clearSixBuffersKernel;
CUfunction reduceFloat4Kernel;
CUfunction reduceForcesKernel;
std::vector<CudaForceInfo*> forces;
std::vector<Molecule> molecules;
std::vector<MoleculeGroup> moleculeGroups;
std::vector<int4> posCellOffsets;
CudaArray* posq;
CudaArray* velm;
// CudaArray<mm_float4>* force;
// CudaArray<mm_float4>* forceBuffers;
// CudaArray<cl_long>* longForceBuffer;
// CudaArray<cl_float>* energyBuffer;
// CudaArray<cl_int>* atomIndex;
// std::vector<cl::Memory*> autoclearBuffers;
// std::vector<int> autoclearBufferSizes;
std::vector<ReorderListener*> reorderListeners;
// CudaIntegrationUtilities* integration;
// CudaBondedUtilities* bonded;
// CudaNonbondedUtilities* nonbonded;
WorkThread* thread;
};
struct CudaContext::Molecule {
std::vector<int> atoms;
std::vector<int> constraints;
std::vector<std::vector<int> > groups;
};
struct CudaContext::MoleculeGroup {
std::vector<int> atoms;
std::vector<int> instances;
std::vector<int> offsets;
};
/**
* This abstract class defines a task to be executed on the worker thread.
*/
class CudaContext::WorkTask {
public:
virtual void execute() = 0;
virtual ~WorkTask() {
}
};
class CudaContext::WorkThread {
public:
struct ThreadData;
WorkThread();
~WorkThread();
/**
* Request that a task be executed on the worker thread. The argument should have been allocated on the
* heap with the "new" operator. After its execute() method finishes, the object will be deleted automatically.
*/
void addTask(CudaContext::WorkTask* task);
/**
* Get whether the worker thread is idle, waiting for a task to be added.
*/
bool isWaiting();
/**
* Get whether the worker thread has exited.
*/
bool isFinished();
/**
* Block until all tasks have finished executing and the worker thread is idle.
*/
void flush();
private:
std::queue<CudaContext::WorkTask*> tasks;
bool waiting, finished;
pthread_mutex_t queueLock;
pthread_cond_t waitForTaskCondition, queueEmptyCondition;
pthread_t thread;
};
/**
* This abstract class defines a function to be executed whenever atoms get reordered.
* Objects that need to know when reordering happens should create a reorderListener
* and register it by calling addReorderListener().
*/
class CudaContext::ReorderListener {
public:
virtual void execute() = 0;
virtual ~ReorderListener() {
}
};
} // namespace OpenMM
#endif /*OPENMM_CUDACONTEXT_H_*/
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaExpressionUtilities.h"
#include "openmm/OpenMMException.h"
#include "openmm/internal/SplineFitter.h"
#include "lepton/Operation.h"
using namespace OpenMM;
using namespace Lepton;
using namespace std;
string CudaExpressionUtilities::doubleToString(double value) {
stringstream s;
s.precision(8);
s << scientific << value << "f";
return s.str();
}
string CudaExpressionUtilities::intToString(int value) {
stringstream s;
s << value;
return s.str();
}
string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const map<string, string>& variables,
const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
vector<pair<ExpressionTreeNode, string> > variableNodes;
for (map<string, string>::const_iterator iter = variables.begin(); iter != variables.end(); ++iter)
variableNodes.push_back(make_pair(ExpressionTreeNode(new Operation::Variable(iter->first)), iter->second));
return createExpressions(expressions, variableNodes, functions, prefix, functionParams, tempType);
}
string CudaExpressionUtilities::createExpressions(const map<string, ParsedExpression>& expressions, const vector<pair<ExpressionTreeNode, string> >& variables,
const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const string& tempType) {
stringstream out;
vector<ParsedExpression> allExpressions;
for (map<string, ParsedExpression>::const_iterator iter = expressions.begin(); iter != expressions.end(); ++iter)
allExpressions.push_back(iter->second);
vector<pair<ExpressionTreeNode, string> > temps = variables;
for (map<string, ParsedExpression>::const_iterator iter = expressions.begin(); iter != expressions.end(); ++iter) {
processExpression(out, iter->second.getRootNode(), temps, functions, prefix, functionParams, allExpressions, tempType);
out << iter->first << getTempName(iter->second.getRootNode(), temps) << ";\n";
}
return out.str();
}
void CudaExpressionUtilities::processExpression(stringstream& out, const ExpressionTreeNode& node, vector<pair<ExpressionTreeNode, string> >& temps,
const vector<pair<string, string> >& functions, const string& prefix, const string& functionParams, const vector<ParsedExpression>& allExpressions, const string& tempType) {
for (int i = 0; i < (int) temps.size(); i++)
if (temps[i].first == node)
return;
for (int i = 0; i < (int) node.getChildren().size(); i++)
processExpression(out, node.getChildren()[i], temps, functions, prefix, functionParams, allExpressions, tempType);
string name = prefix+intToString(temps.size());
bool hasRecordedNode = false;
out << tempType << " " << name << " = ";
switch (node.getOperation().getId()) {
case Operation::CONSTANT:
out << doubleToString(dynamic_cast<const Operation::Constant*>(&node.getOperation())->getValue());
break;
case Operation::VARIABLE:
throw OpenMMException("Unknown variable in expression: "+node.getOperation().getName());
case Operation::CUSTOM:
{
int i;
for (i = 0; i < (int) functions.size() && functions[i].first != node.getOperation().getName(); i++)
;
if (i == functions.size())
throw OpenMMException("Unknown function in expression: "+node.getOperation().getName());
bool isDeriv = (dynamic_cast<const Operation::Custom*>(&node.getOperation())->getDerivOrder()[0] == 1);
out << "0.0f;\n";
temps.push_back(make_pair(node, name));
hasRecordedNode = true;
// If both the value and derivative of the function are needed, it's faster to calculate them both
// at once, so check to see if both are needed.
const ExpressionTreeNode* valueNode = NULL;
const ExpressionTreeNode* derivNode = NULL;
for (int j = 0; j < (int) allExpressions.size(); j++)
findRelatedTabulatedFunctions(node, allExpressions[j].getRootNode(), valueNode, derivNode);
string valueName = name;
string derivName = name;
if (valueNode != NULL && derivNode != NULL) {
string name2 = prefix+intToString(temps.size());
out << tempType << " " << name2 << " = 0.0f;\n";
if (isDeriv) {
valueName = name2;
temps.push_back(make_pair(*valueNode, name2));
}
else {
derivName = name2;
temps.push_back(make_pair(*derivNode, name2));
}
}
out << "{\n";
out << "float4 params = " << functionParams << "[" << i << "];\n";
out << "float x = " << getTempName(node.getChildren()[0], temps) << ";\n";
out << "if (x >= params.x && x <= params.y) {\n";
out << "x = (x-params.x)*params.z;\n";
out << "int index = (int) (floor(x));\n";
out << "index = min(index, (int) params.w);\n";
out << "float4 coeff = " << functions[i].second << "[index];\n";
out << "float b = x-index;\n";
out << "float a = 1.0f-b;\n";
if (valueNode != NULL)
out << valueName << " = a*coeff.x+b*coeff.y+((a*a*a-a)*coeff.z+(b*b*b-b)*coeff.w)/(params.z*params.z);\n";
if (derivNode != NULL)
out << derivName << " = (coeff.y-coeff.x)*params.z+((1.0f-3.0f*a*a)*coeff.z+(3.0f*b*b-1.0f)*coeff.w)/params.z;\n";
out << "}\n";
out << "}";
break;
}
case Operation::ADD:
out << getTempName(node.getChildren()[0], temps) << "+" << getTempName(node.getChildren()[1], temps);
break;
case Operation::SUBTRACT:
out << getTempName(node.getChildren()[0], temps) << "-" << getTempName(node.getChildren()[1], temps);
break;
case Operation::MULTIPLY:
out << getTempName(node.getChildren()[0], temps) << "*" << getTempName(node.getChildren()[1], temps);
break;
case Operation::DIVIDE:
{
bool haveReciprocal = false;
for (int i = 0; i < (int) temps.size(); i++)
if (temps[i].first.getOperation().getId() == Operation::RECIPROCAL && temps[i].first.getChildren()[0] == node.getChildren()[1]) {
haveReciprocal = true;
out << getTempName(node.getChildren()[0], temps) << "*" << temps[i].second;
}
if (!haveReciprocal)
out << getTempName(node.getChildren()[0], temps) << "/" << getTempName(node.getChildren()[1], temps);
break;
}
case Operation::POWER:
out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
break;
case Operation::NEGATE:
out << "-" << getTempName(node.getChildren()[0], temps);
break;
case Operation::SQRT:
out << "sqrt(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::EXP:
out << "EXP(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::LOG:
out << "LOG(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::SIN:
out << "sin(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::COS:
out << "cos(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::SEC:
out << "1.0f/cos(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::CSC:
out << "1.0f/sin(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::TAN:
out << "tan(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::COT:
out << "1.0f/tan(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::ASIN:
out << "asin(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::ACOS:
out << "acos(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::ATAN:
out << "atan(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::SINH:
out << "sinh(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::COSH:
out << "cosh(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::TANH:
out << "tanh(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::ERF:
out << "erf(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::ERFC:
out << "erfc(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::STEP:
out << getTempName(node.getChildren()[0], temps) << " >= 0.0f ? 1.0f : 0.0f";
break;
case Operation::DELTA:
out << getTempName(node.getChildren()[0], temps) << " == 0.0f ? 1.0f : 0.0f";
break;
case Operation::SQUARE:
{
string arg = getTempName(node.getChildren()[0], temps);
out << arg << "*" << arg;
break;
}
case Operation::CUBE:
{
string arg = getTempName(node.getChildren()[0], temps);
out << arg << "*" << arg << "*" << arg;
break;
}
case Operation::RECIPROCAL:
out << "RECIP(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::ADD_CONSTANT:
out << doubleToString(dynamic_cast<const Operation::AddConstant*>(&node.getOperation())->getValue()) << "+" << getTempName(node.getChildren()[0], temps);
break;
case Operation::MULTIPLY_CONSTANT:
out << doubleToString(dynamic_cast<const Operation::MultiplyConstant*>(&node.getOperation())->getValue()) << "*" << getTempName(node.getChildren()[0], temps);
break;
case Operation::POWER_CONSTANT:
{
double exponent = dynamic_cast<const Operation::PowerConstant*>(&node.getOperation())->getValue();
if (exponent == 0.0)
out << "1.0f";
else if (exponent == (int) exponent) {
out << "0.0f;\n";
temps.push_back(make_pair(node, name));
hasRecordedNode = true;
// If multiple integral powers of the same base are needed, it's faster to calculate all of them
// at once, so check to see if others are also needed.
map<int, const ExpressionTreeNode*> powers;
powers[(int) exponent] = &node;
for (int j = 0; j < (int) allExpressions.size(); j++)
findRelatedPowers(node, allExpressions[j].getRootNode(), powers);
vector<int> exponents;
vector<string> names;
vector<bool> hasAssigned(powers.size(), false);
exponents.push_back((int) fabs(exponent));
names.push_back(name);
for (map<int, const ExpressionTreeNode*>::const_iterator iter = powers.begin(); iter != powers.end(); ++iter) {
if (iter->first != exponent) {
exponents.push_back(iter->first >= 0 ? iter->first : -iter->first);
string name2 = prefix+intToString(temps.size());
names.push_back(name2);
temps.push_back(make_pair(*iter->second, name2));
out << tempType << " " << name2 << " = 0.0f;\n";
}
}
out << "{\n";
out << "float multiplier = " << (exponent < 0.0 ? "1.0f/" : "") << getTempName(node.getChildren()[0], temps) << ";\n";
bool done = false;
while (!done) {
done = true;
for (int i = 0; i < (int) exponents.size(); i++) {
if (exponents[i]%2 == 1) {
if (!hasAssigned[i])
out << names[i] << " = multiplier;\n";
else
out << names[i] << " *= multiplier;\n";
hasAssigned[i] = true;
}
exponents[i] >>= 1;
if (exponents[i] != 0)
done = false;
}
if (!done)
out << "multiplier *= multiplier;\n";
}
out << "}";
}
else
out << "pow(" << getTempName(node.getChildren()[0], temps) << ", " << doubleToString(exponent) << ")";
break;
}
case Operation::MIN:
out << "min(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
break;
case Operation::MAX:
out << "max(" << getTempName(node.getChildren()[0], temps) << ", " << getTempName(node.getChildren()[1], temps) << ")";
break;
case Operation::ABS:
out << "fabs(" << getTempName(node.getChildren()[0], temps) << ")";
break;
default:
throw OpenMMException("Internal error: Unknown operation in user-defined expression: "+node.getOperation().getName());
}
out << ";\n";
if (!hasRecordedNode)
temps.push_back(make_pair(node, name));
}
string CudaExpressionUtilities::getTempName(const ExpressionTreeNode& node, const vector<pair<ExpressionTreeNode, string> >& temps) {
for (int i = 0; i < (int) temps.size(); i++)
if (temps[i].first == node)
return temps[i].second;
stringstream out;
out << "Internal error: No temporary variable for expression node: " << node;
throw OpenMMException(out.str());
}
void CudaExpressionUtilities::findRelatedTabulatedFunctions(const ExpressionTreeNode& node, const ExpressionTreeNode& searchNode,
const ExpressionTreeNode*& valueNode, const ExpressionTreeNode*& derivNode) {
if (searchNode.getOperation().getId() == Operation::CUSTOM && node.getChildren()[0] == searchNode.getChildren()[0]) {
if (dynamic_cast<const Operation::Custom*>(&searchNode.getOperation())->getDerivOrder()[0] == 0)
valueNode = &searchNode;
else
derivNode = &searchNode;
}
else
for (int i = 0; i < (int) searchNode.getChildren().size(); i++)
findRelatedTabulatedFunctions(node, searchNode.getChildren()[i], valueNode, derivNode);
}
void CudaExpressionUtilities::findRelatedPowers(const ExpressionTreeNode& node, const ExpressionTreeNode& searchNode, map<int, const ExpressionTreeNode*>& powers) {
if (searchNode.getOperation().getId() == Operation::POWER_CONSTANT && node.getChildren()[0] == searchNode.getChildren()[0]) {
double realPower = dynamic_cast<const Operation::PowerConstant*>(&searchNode.getOperation())->getValue();
int power = (int) realPower;
if (power != realPower)
return; // We are only interested in integer powers.
if (powers.find(power) != powers.end())
return; // This power is already in the map.
if (powers.begin()->first*power < 0)
return; // All powers must have the same sign.
powers[power] = &searchNode;
}
else
for (int i = 0; i < (int) searchNode.getChildren().size(); i++)
findRelatedPowers(node, searchNode.getChildren()[i], powers);
}
vector<float4> CudaExpressionUtilities::computeFunctionCoefficients(const vector<double>& values, double min, double max) {
// Compute the spline coefficients.
int numValues = values.size();
vector<double> x(numValues), derivs;
for (int i = 0; i < numValues; i++)
x[i] = min+i*(max-min)/(numValues-1);
SplineFitter::createNaturalSpline(x, values, derivs);
vector<float4> f(numValues-1);
for (int i = 0; i < (int) values.size()-1; i++)
f[i] = make_float4((float) values[i], (float) values[i+1], (float) (derivs[i]/6.0), (float) (derivs[i+1]/6.0));
return f;
}
#ifndef OPENMM_CUDAEXPRESSIONUTILITIES_H_
#define OPENMM_CUDAEXPRESSIONUTILITIES_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaContext.h"
#include "lepton/CustomFunction.h"
#include "lepton/ExpressionTreeNode.h"
#include "lepton/ParsedExpression.h"
#include <map>
#include <sstream>
#include <string>
#include <utility>
namespace OpenMM {
/**
* This class is used by various classes to generate CUDA source code implementing
* user defined mathematical expressions.
*/
class OPENMM_EXPORT CudaExpressionUtilities {
public:
/**
* Generate the source code for calculating a set of expressions.
*
* @param expressions the expressions to generate code for (keys are the variables to store the output values in)
* @param variables defines the source code to generate for each variable that may appear in the expressions. Keys are
* variable names, and the values are the code to generate for them.
* @param functions defines the variable name for each tabulated function that may appear in the expressions
* @param prefix a prefix to put in front of temporary variables
* @param functionParams the variable name containing the parameters for each tabulated function
* @param tempType the type of value to use for temporary variables (defaults to "float")
*/
static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::map<std::string, std::string>& variables,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
/**
* Generate the source code for calculating a set of expressions.
*
* @param expressions the expressions to generate code for (keys are the variables to store the output values in)
* @param variables defines the source code to generate for each variable or precomputed sub-expression that may appear in the expressions.
* Each entry is an ExpressionTreeNode, and the code to generate wherever an identical node appears.
* @param functions defines the variable name for each tabulated function that may appear in the expressions
* @param prefix a prefix to put in front of temporary variables
* @param functionParams the variable name containing the parameters for each tabulated function
* @param tempType the type of value to use for temporary variables (defaults to "float")
*/
static std::string createExpressions(const std::map<std::string, Lepton::ParsedExpression>& expressions, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variables,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams, const std::string& tempType="float");
/**
* Calculate the spline coefficients for a tabulated function that appears in expressions.
*
* @param values the tabulated values of the function
* @param min the value of the independent variable corresponding to the first element of values
* @param max the value of the independent variable corresponding to the last element of values
* @return the spline coefficients
*/
static std::vector<float4> computeFunctionCoefficients(const std::vector<double>& values, double min, double max);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
static std::string doubleToString(double value);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
static std::string intToString(int value);
class FunctionPlaceholder;
private:
static void processExpression(std::stringstream& out, const Lepton::ExpressionTreeNode& node,
std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps,
const std::vector<std::pair<std::string, std::string> >& functions, const std::string& prefix, const std::string& functionParams,
const std::vector<Lepton::ParsedExpression>& allExpressions, const std::string& tempType);
static std::string getTempName(const Lepton::ExpressionTreeNode& node, const std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& temps);
static void findRelatedTabulatedFunctions(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
const Lepton::ExpressionTreeNode*& valueNode, const Lepton::ExpressionTreeNode*& derivNode);
static void findRelatedPowers(const Lepton::ExpressionTreeNode& node, const Lepton::ExpressionTreeNode& searchNode,
std::map<int, const Lepton::ExpressionTreeNode*>& powers);
};
/**
* This class serves as a placeholder for custom functions in expressions.
*/
class CudaExpressionUtilities::FunctionPlaceholder : public Lepton::CustomFunction {
public:
int getNumArguments() const {
return 1;
}
double evaluate(const double* arguments) const {
return 0.0;
}
double evaluateDerivative(const double* arguments, const int* derivOrder) const {
return 0.0;
}
CustomFunction* clone() const {
return new FunctionPlaceholder();
}
};
} // namespace OpenMM
#endif /*OPENMM_CUDAEXPRESSIONUTILITIES_H_*/
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaForceInfo.h"
using namespace OpenMM;
using namespace std;
bool CudaForceInfo::areParticlesIdentical(int particle1, int particle2) {
return true;
}
int CudaForceInfo::getNumParticleGroups() {
return 0;
}
void CudaForceInfo::getParticlesInGroup(int index, vector<int>& particles) {
return;
}
bool CudaForceInfo::areGroupsIdentical(int group1, int group2) {
return true;
}
#ifndef OPENMM_CUDAFORCEINFO_H_
#define OPENMM_CUDAFORCEINFO_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/internal/windowsExport.h"
#include <vector>
namespace OpenMM {
/**
* This class is used by the Cuda implementation of a Force class to convey information
* about the behavior and requirements of that force.
*/
class OPENMM_EXPORT CudaForceInfo {
public:
CudaForceInfo(int requiredForceBuffers) : requiredForceBuffers(requiredForceBuffers) {
}
/**
* Get the number of force buffers this force requires.
*/
int getRequiredForceBuffers() {
return requiredForceBuffers;
}
/**
* Get whether or not two particles have identical force field parameters.
*/
virtual bool areParticlesIdentical(int particle1, int particle2);
/**
* Get the number of particle groups defined by this force.
*/
virtual int getNumParticleGroups();
/**
* Get the list of particles in a particular group.
*/
virtual void getParticlesInGroup(int index, std::vector<int>& particles);
/**
* Get whether two particle groups are identical.
*/
virtual bool areGroupsIdentical(int group1, int group2);
private:
int requiredForceBuffers;
};
} // namespace OpenMM
#endif /*OPENMM_CUDAFORCEINFO_H_*/
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaKernelFactory.h"
//#include "CudaParallelKernels.h"
#include "CudaPlatform.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
using namespace OpenMM;
KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
CudaPlatform::PlatformData& data = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
// if (data.contexts.size() > 1) {
// // We are running in parallel on multiple devices, so we may want to create a parallel kernel.
//
// if (name == CalcForcesAndEnergyKernel::Name())
// return new CudaParallelCalcForcesAndEnergyKernel(name, platform, data);
// if (name == CalcHarmonicBondForceKernel::Name())
// return new CudaParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomBondForceKernel::Name())
// return new CudaParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
// if (name == CalcHarmonicAngleForceKernel::Name())
// return new CudaParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomAngleForceKernel::Name())
// return new CudaParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
// if (name == CalcPeriodicTorsionForceKernel::Name())
// return new CudaParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
// if (name == CalcRBTorsionForceKernel::Name())
// return new CudaParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCMAPTorsionForceKernel::Name())
// return new CudaParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomTorsionForceKernel::Name())
// return new CudaParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
// if (name == CalcNonbondedForceKernel::Name())
// return new CudaParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomNonbondedForceKernel::Name())
// return new CudaParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomExternalForceKernel::Name())
// return new CudaParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomHbondForceKernel::Name())
// return new CudaParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
// if (name == CalcCustomCompoundBondForceKernel::Name())
// return new CudaParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
// }
// CudaContext& cl = *data.contexts[0];
// if (name == CalcForcesAndEnergyKernel::Name())
// return new CudaCalcForcesAndEnergyKernel(name, platform, cl);
// if (name == UpdateStateDataKernel::Name())
// return new CudaUpdateStateDataKernel(name, platform, cl);
// if (name == ApplyConstraintsKernel::Name())
// return new CudaApplyConstraintsKernel(name, platform, cl);
// if (name == VirtualSitesKernel::Name())
// return new CudaVirtualSitesKernel(name, platform, cl);
// if (name == CalcHarmonicBondForceKernel::Name())
// return new CudaCalcHarmonicBondForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomBondForceKernel::Name())
// return new CudaCalcCustomBondForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcHarmonicAngleForceKernel::Name())
// return new CudaCalcHarmonicAngleForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomAngleForceKernel::Name())
// return new CudaCalcCustomAngleForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcPeriodicTorsionForceKernel::Name())
// return new CudaCalcPeriodicTorsionForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcRBTorsionForceKernel::Name())
// return new CudaCalcRBTorsionForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCMAPTorsionForceKernel::Name())
// return new CudaCalcCMAPTorsionForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomTorsionForceKernel::Name())
// return new CudaCalcCustomTorsionForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcNonbondedForceKernel::Name())
// return new CudaCalcNonbondedForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomNonbondedForceKernel::Name())
// return new CudaCalcCustomNonbondedForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcGBSAOBCForceKernel::Name())
// return new CudaCalcGBSAOBCForceKernel(name, platform, cl);
// if (name == CalcCustomGBForceKernel::Name())
// return new CudaCalcCustomGBForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomExternalForceKernel::Name())
// return new CudaCalcCustomExternalForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomHbondForceKernel::Name())
// return new CudaCalcCustomHbondForceKernel(name, platform, cl, context.getSystem());
// if (name == CalcCustomCompoundBondForceKernel::Name())
// return new CudaCalcCustomCompoundBondForceKernel(name, platform, cl, context.getSystem());
// if (name == IntegrateVerletStepKernel::Name())
// return new CudaIntegrateVerletStepKernel(name, platform, cl);
// if (name == IntegrateLangevinStepKernel::Name())
// return new CudaIntegrateLangevinStepKernel(name, platform, cl);
// if (name == IntegrateBrownianStepKernel::Name())
// return new CudaIntegrateBrownianStepKernel(name, platform, cl);
// if (name == IntegrateVariableVerletStepKernel::Name())
// return new CudaIntegrateVariableVerletStepKernel(name, platform, cl);
// if (name == IntegrateVariableLangevinStepKernel::Name())
// return new CudaIntegrateVariableLangevinStepKernel(name, platform, cl);
// if (name == IntegrateCustomStepKernel::Name())
// return new CudaIntegrateCustomStepKernel(name, platform, cl);
// if (name == ApplyAndersenThermostatKernel::Name())
// return new CudaApplyAndersenThermostatKernel(name, platform, cl);
// if (name == ApplyMonteCarloBarostatKernel::Name())
// return new CudaApplyMonteCarloBarostatKernel(name, platform, cl);
// if (name == CalcKineticEnergyKernel::Name())
// return new CudaCalcKineticEnergyKernel(name, platform, cl);
// if (name == RemoveCMMotionKernel::Name())
// return new CudaRemoveCMMotionKernel(name, platform, cl);
throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaKernelSources.h"
using namespace OpenMM;
using namespace std;
#ifndef OPENMM_CUDAKERNELSOURCES_H_
#define OPENMM_CUDAKERNELSOURCES_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/internal/windowsExport.h"
#include <string>
namespace OpenMM {
/**
* This class is a central holding place for the source code of CUDA kernels.
* The CMake build script inserts declarations into it based on the .cu files in the
* kernels subfolder.
*/
class OPENMM_EXPORT CudaKernelSources {
public:
@CUDA_FILE_DECLARATIONS@
};
} // namespace OpenMM
#endif /*OPENMM_CUDAKERNELSOURCES_H_*/
#ifndef OPENMM_CUDAKERNELS_H_
#define OPENMM_CUDAKERNELS_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaPlatform.h"
//#include "CudaArray.h"
#include "CudaContext.h"
//#include "CudaFFT3D.h"
//#include "CudaParameterSet.h"
//#include "CudaSort.h"
#include "openmm/kernels.h"
#include "openmm/System.h"
namespace OpenMM {
///**
// * This kernel is invoked at the beginning and end of force and energy computations. It gives the
// * Platform a chance to clear buffers and do other initialization at the beginning, and to do any
// * necessary work at the end to determine the final results.
// */
//class CudaCalcForcesAndEnergyKernel : public CalcForcesAndEnergyKernel {
//public:
// CudaCalcForcesAndEnergyKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcForcesAndEnergyKernel(name, platform), cl(cl) {
// }
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// */
// void initialize(const System& system);
// /**
// * This is called at the beginning of each force/energy computation, before calcForcesAndEnergy() has been called on
// * any ForceImpl.
// *
// * @param context the context in which to execute this kernel
// * @param includeForce true if forces should be computed
// * @param includeEnergy true if potential energy should be computed
// * @param groups a set of bit flags for which force groups to include
// */
// void beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
// /**
// * This is called at the end of each force/energy computation, after calcForcesAndEnergy() has been called on
// * every ForceImpl.
// *
// * @param context the context in which to execute this kernel
// * @param includeForce true if forces should be computed
// * @param includeEnergy true if potential energy should be computed
// * @param groups a set of bit flags for which force groups to include
// * @return the potential energy of the system. This value is added to all values returned by ForceImpls'
// * calcForcesAndEnergy() methods. That is, each force kernel may <i>either</i> return its contribution to the
// * energy directly, <i>or</i> add it to an internal buffer so that it will be included here.
// */
// double finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups);
//private:
// CudaContext& cl;
//};
//
///**
// * This kernel provides methods for setting and retrieving various state data: time, positions,
// * velocities, and forces.
// */
//class CudaUpdateStateDataKernel : public UpdateStateDataKernel {
//public:
// CudaUpdateStateDataKernel(std::string name, const Platform& platform, CudaContext& cl) : UpdateStateDataKernel(name, platform), cl(cl) {
// }
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// */
// void initialize(const System& system);
// /**
// * Get the current time (in picoseconds).
// *
// * @param context the context in which to execute this kernel
// */
// double getTime(const ContextImpl& context) const;
// /**
// * Set the current time (in picoseconds).
// *
// * @param context the context in which to execute this kernel
// */
// void setTime(ContextImpl& context, double time);
// /**
// * Get the positions of all particles.
// *
// * @param positions on exit, this contains the particle positions
// */
// void getPositions(ContextImpl& context, std::vector<Vec3>& positions);
// /**
// * Set the positions of all particles.
// *
// * @param positions a vector containg the particle positions
// */
// void setPositions(ContextImpl& context, const std::vector<Vec3>& positions);
// /**
// * Get the velocities of all particles.
// *
// * @param velocities on exit, this contains the particle velocities
// */
// void getVelocities(ContextImpl& context, std::vector<Vec3>& velocities);
// /**
// * Set the velocities of all particles.
// *
// * @param velocities a vector containg the particle velocities
// */
// void setVelocities(ContextImpl& context, const std::vector<Vec3>& velocities);
// /**
// * Get the current forces on all particles.
// *
// * @param forces on exit, this contains the forces
// */
// void getForces(ContextImpl& context, std::vector<Vec3>& forces);
// /**
// * Get the current periodic box vectors.
// *
// * @param a on exit, this contains the vector defining the first edge of the periodic box
// * @param b on exit, this contains the vector defining the second edge of the periodic box
// * @param c on exit, this contains the vector defining the third edge of the periodic box
// */
// void getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const;
// /**
// * Set the current periodic box vectors.
// *
// * @param a the vector defining the first edge of the periodic box
// * @param b the vector defining the second edge of the periodic box
// * @param c the vector defining the third edge of the periodic box
// */
// void setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const;
// /**
// * Create a checkpoint recording the current state of the Context.
// *
// * @param stream an output stream the checkpoint data should be written to
// */
// void createCheckpoint(ContextImpl& context, std::ostream& stream);
// /**
// * Load a checkpoint that was written by createCheckpoint().
// *
// * @param stream an input stream the checkpoint data should be read from
// */
// void loadCheckpoint(ContextImpl& context, std::istream& stream);
//private:
// CudaContext& cl;
//};
//
///**
// * This kernel modifies the positions of particles to enforce distance constraints.
// */
//class CudaApplyConstraintsKernel : public ApplyConstraintsKernel {
//public:
// CudaApplyConstraintsKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyConstraintsKernel(name, platform),
// cl(cl), hasInitializedKernel(false) {
// }
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// */
// void initialize(const System& system);
// /**
// * Update particle positions to enforce constraints.
// *
// * @param context the context in which to execute this kernel
// * @param tol the distance tolerance within which constraints must be satisfied.
// */
// void apply(ContextImpl& context, double tol);
//private:
// CudaContext& cl;
// bool hasInitializedKernel;
// cl::Kernel applyDeltasKernel;
//};
//
///**
// * This kernel recomputes the positions of virtual sites.
// */
//class CudaVirtualSitesKernel : public VirtualSitesKernel {
//public:
// CudaVirtualSitesKernel(std::string name, const Platform& platform, CudaContext& cl) : VirtualSitesKernel(name, platform), cl(cl) {
// }
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// */
// void initialize(const System& system);
// /**
// * Compute the virtual site locations.
// *
// * @param context the context in which to execute this kernel
// */
// void computePositions(ContextImpl& context);
//private:
// CudaContext& cl;
//};
//
///**
// * This kernel is invoked by HarmonicBondForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcHarmonicBondForceKernel : public CalcHarmonicBondForceKernel {
//public:
// CudaCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcHarmonicBondForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
// }
// ~CudaCalcHarmonicBondForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the HarmonicBondForce this kernel will be used for
// */
// void initialize(const System& system, const HarmonicBondForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numBonds;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaArray<mm_float2>* params;
//};
//
///**
// * This kernel is invoked by CustomBondForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcCustomBondForceKernel : public CalcCustomBondForceKernel {
//public:
// CudaCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomBondForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
// }
// ~CudaCalcCustomBondForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomBondForce this kernel will be used for
// */
// void initialize(const System& system, const CustomBondForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numBonds;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
//};
//
///**
// * This kernel is invoked by HarmonicAngleForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcHarmonicAngleForceKernel : public CalcHarmonicAngleForceKernel {
//public:
// CudaCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcHarmonicAngleForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
// }
// ~CudaCalcHarmonicAngleForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the HarmonicAngleForce this kernel will be used for
// */
// void initialize(const System& system, const HarmonicAngleForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numAngles;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaArray<mm_float2>* params;
//};
//
///**
// * This kernel is invoked by CustomAngleForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcCustomAngleForceKernel : public CalcCustomAngleForceKernel {
//public:
// CudaCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomAngleForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
// }
// ~CudaCalcCustomAngleForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomAngleForce this kernel will be used for
// */
// void initialize(const System& system, const CustomAngleForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numAngles;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
//};
//
///**
// * This kernel is invoked by PeriodicTorsionForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcPeriodicTorsionForceKernel : public CalcPeriodicTorsionForceKernel {
//public:
// CudaCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcPeriodicTorsionForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
// }
// ~CudaCalcPeriodicTorsionForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the PeriodicTorsionForce this kernel will be used for
// */
// void initialize(const System& system, const PeriodicTorsionForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numTorsions;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaArray<mm_float4>* params;
//};
//
///**
// * This kernel is invoked by RBTorsionForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcRBTorsionForceKernel : public CalcRBTorsionForceKernel {
//public:
// CudaCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcRBTorsionForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL) {
// }
// ~CudaCalcRBTorsionForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the RBTorsionForce this kernel will be used for
// */
// void initialize(const System& system, const RBTorsionForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numTorsions;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaArray<mm_float8>* params;
//};
//
///**
// * This kernel is invoked by CMAPTorsionForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcCMAPTorsionForceKernel : public CalcCMAPTorsionForceKernel {
//public:
// CudaCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCMAPTorsionForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), coefficients(NULL), mapPositions(NULL), torsionMaps(NULL) {
// }
// ~CudaCalcCMAPTorsionForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CMAPTorsionForce this kernel will be used for
// */
// void initialize(const System& system, const CMAPTorsionForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numTorsions;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaArray<mm_float4>* coefficients;
// CudaArray<mm_int2>* mapPositions;
// CudaArray<cl_int>* torsionMaps;
//};
//
///**
// * This kernel is invoked by CustomTorsionForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcCustomTorsionForceKernel : public CalcCustomTorsionForceKernel {
//public:
// CudaCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomTorsionForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
// }
// ~CudaCalcCustomTorsionForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomTorsionForce this kernel will be used for
// */
// void initialize(const System& system, const CustomTorsionForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numTorsions;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
//};
//
///**
// * This kernel is invoked by NonbondedForce to calculate the forces acting on the system.
// */
//class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
//public:
// CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcNonbondedForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
// pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
// pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
// }
// ~CudaCalcNonbondedForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the NonbondedForce this kernel will be used for
// */
// void initialize(const System& system, const NonbondedForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @param includeDirect true if direct space interactions should be included
// * @param includeReciprocal true if reciprocal space interactions should be included
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal);
// /**
// * Copy changed parameters over to a context.
// *
// * @param context the context to copy parameters to
// * @param force the NonbondedForce to copy the parameters from
// */
// void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
//private:
// struct SortTrait {
// typedef mm_int2 DataType;
// typedef cl_int KeyType;
// static const char* clDataType() {return "int2";}
// static const char* clKeyType() {return "int";}
// static const char* clMinKey() {return "INT_MIN";}
// static const char* clMaxKey() {return "INT_MAX";}
// static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
// static const char* clSortKey() {return "value.y";}
// };
// CudaContext& cl;
// bool hasInitializedKernel;
// CudaArray<mm_float2>* sigmaEpsilon;
// CudaArray<mm_float4>* exceptionParams;
// CudaArray<mm_float2>* cosSinSums;
// CudaArray<mm_float2>* pmeGrid;
// CudaArray<mm_float2>* pmeGrid2;
// CudaArray<cl_float>* pmeBsplineModuliX;
// CudaArray<cl_float>* pmeBsplineModuliY;
// CudaArray<cl_float>* pmeBsplineModuliZ;
// CudaArray<mm_float4>* pmeBsplineTheta;
// CudaArray<mm_float4>* pmeBsplineDTheta;
// CudaArray<cl_int>* pmeAtomRange;
// CudaArray<mm_int2>* pmeAtomGridIndex;
// CudaSort<SortTrait>* sort;
// CudaFFT3D* fft;
// cl::Kernel ewaldSumsKernel;
// cl::Kernel ewaldForcesKernel;
// cl::Kernel pmeGridIndexKernel;
// cl::Kernel pmeAtomRangeKernel;
// cl::Kernel pmeZIndexKernel;
// cl::Kernel pmeUpdateBsplinesKernel;
// cl::Kernel pmeSpreadChargeKernel;
// cl::Kernel pmeFinishSpreadChargeKernel;
// cl::Kernel pmeConvolutionKernel;
// cl::Kernel pmeInterpolateForceKernel;
// std::map<std::string, std::string> pmeDefines;
// double ewaldSelfEnergy, dispersionCoefficient, alpha;
// int interpolateForceThreads;
// bool hasCoulomb, hasLJ;
// static const int PmeOrder = 5;
//};
//
///**
// * This kernel is invoked by CustomNonbondedForce to calculate the forces acting on the system.
// */
//class CudaCalcCustomNonbondedForceKernel : public CalcCustomNonbondedForceKernel {
//public:
// CudaCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomNonbondedForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
// }
// ~CudaCalcCustomNonbondedForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomNonbondedForce this kernel will be used for
// */
// void initialize(const System& system, const CustomNonbondedForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// bool hasInitializedKernel;
// CudaContext& cl;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// CudaArray<mm_float4>* tabulatedFunctionParams;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
// std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
// System& system;
//};
//
///**
// * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
// */
//class CudaCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
//public:
// CudaCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcGBSAOBCForceKernel(name, platform), cl(cl),
// hasCreatedKernels(false), params(NULL), bornSum(NULL), longBornSum(NULL), bornRadii(NULL), bornForce(NULL),
// longBornForce(NULL), obcChain(NULL) {
// }
// ~CudaCalcGBSAOBCForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the GBSAOBCForce this kernel will be used for
// */
// void initialize(const System& system, const GBSAOBCForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// double prefactor;
// bool hasCreatedKernels;
// int maxTiles;
// CudaContext& cl;
// CudaArray<mm_float2>* params;
// CudaArray<cl_float>* bornSum;
// CudaArray<cl_long>* longBornSum;
// CudaArray<cl_float>* bornRadii;
// CudaArray<cl_float>* bornForce;
// CudaArray<cl_long>* longBornForce;
// CudaArray<cl_float>* obcChain;
// cl::Kernel computeBornSumKernel;
// cl::Kernel reduceBornSumKernel;
// cl::Kernel force1Kernel;
// cl::Kernel reduceBornForceKernel;
//};
//
///**
// * This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
// */
//class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
//public:
// CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomGBForceKernel(name, platform),
// hasInitializedKernels(false), cl(cl), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
// valueBuffers(NULL), longValueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
// }
// ~CudaCalcCustomGBForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomGBForce this kernel will be used for
// */
// void initialize(const System& system, const CustomGBForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// bool hasInitializedKernels, needParameterGradient;
// int maxTiles, numComputedValues;
// CudaContext& cl;
// CudaParameterSet* params;
// CudaParameterSet* computedValues;
// CudaParameterSet* energyDerivs;
// CudaArray<cl_long>* longEnergyDerivs;
// CudaArray<cl_float>* globals;
// CudaArray<cl_float>* valueBuffers;
// CudaArray<cl_long>* longValueBuffers;
// CudaArray<mm_float4>* tabulatedFunctionParams;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
// std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
// std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
// System& system;
// cl::Kernel pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
//};
//
///**
// * This kernel is invoked by CustomExternalForce to calculate the forces acting on the system and the energy of the system.
// */
//class CudaCalcCustomExternalForceKernel : public CalcCustomExternalForceKernel {
//public:
// CudaCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomExternalForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), system(system), params(NULL), globals(NULL) {
// }
// ~CudaCalcCustomExternalForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomExternalForce this kernel will be used for
// */
// void initialize(const System& system, const CustomExternalForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numParticles;
// bool hasInitializedKernel;
// CudaContext& cl;
// System& system;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
//};
//
///**
// * This kernel is invoked by CustomHbondForce to calculate the forces acting on the system.
// */
//class CudaCalcCustomHbondForceKernel : public CalcCustomHbondForceKernel {
//public:
// CudaCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomHbondForceKernel(name, platform),
// hasInitializedKernel(false), cl(cl), donorParams(NULL), acceptorParams(NULL), donors(NULL), acceptors(NULL),
// donorBufferIndices(NULL), acceptorBufferIndices(NULL), globals(NULL), donorExclusions(NULL), acceptorExclusions(NULL),
// tabulatedFunctionParams(NULL), system(system) {
// }
// ~CudaCalcCustomHbondForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomHbondForce this kernel will be used for
// */
// void initialize(const System& system, const CustomHbondForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numDonors, numAcceptors;
// bool hasInitializedKernel;
// CudaContext& cl;
// CudaParameterSet* donorParams;
// CudaParameterSet* acceptorParams;
// CudaArray<cl_float>* globals;
// CudaArray<mm_int4>* donors;
// CudaArray<mm_int4>* acceptors;
// CudaArray<mm_int4>* donorBufferIndices;
// CudaArray<mm_int4>* acceptorBufferIndices;
// CudaArray<mm_int4>* donorExclusions;
// CudaArray<mm_int4>* acceptorExclusions;
// CudaArray<mm_float4>* tabulatedFunctionParams;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
// std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
// System& system;
// cl::Kernel donorKernel, acceptorKernel;
//};
//
///**
// * This kernel is invoked by CustomCompoundBondForce to calculate the forces acting on the system.
// */
//class CudaCalcCustomCompoundBondForceKernel : public CalcCustomCompoundBondForceKernel {
//public:
// CudaCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaContext& cl, System& system) : CalcCustomCompoundBondForceKernel(name, platform),
// cl(cl), params(NULL), globals(NULL), tabulatedFunctionParams(NULL), system(system) {
// }
// ~CudaCalcCustomCompoundBondForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the CustomCompoundBondForce this kernel will be used for
// */
// void initialize(const System& system, const CustomCompoundBondForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
//private:
// int numBonds;
// CudaContext& cl;
// CudaParameterSet* params;
// CudaArray<cl_float>* globals;
// CudaArray<mm_float4>* tabulatedFunctionParams;
// std::vector<std::string> globalParamNames;
// std::vector<cl_float> globalParamValues;
// std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
// System& system;
//};
//
///**
// * This kernel is invoked by VerletIntegrator to take one time step.
// */
//class CudaIntegrateVerletStepKernel : public IntegrateVerletStepKernel {
//public:
// CudaIntegrateVerletStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVerletStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false) {
// }
// ~CudaIntegrateVerletStepKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the VerletIntegrator this kernel will be used for
// */
// void initialize(const System& system, const VerletIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the VerletIntegrator this kernel is being used for
// */
// void execute(ContextImpl& context, const VerletIntegrator& integrator);
//private:
// CudaContext& cl;
// double prevStepSize;
// bool hasInitializedKernels;
// cl::Kernel kernel1, kernel2;
//};
//
///**
// * This kernel is invoked by LangevinIntegrator to take one time step.
// */
//class CudaIntegrateLangevinStepKernel : public IntegrateLangevinStepKernel {
//public:
// CudaIntegrateLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateLangevinStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false), params(NULL) {
// }
// ~CudaIntegrateLangevinStepKernel();
// /**
// * Initialize the kernel, setting up the particle masses.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the LangevinIntegrator this kernel will be used for
// */
// void initialize(const System& system, const LangevinIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the LangevinIntegrator this kernel is being used for
// */
// void execute(ContextImpl& context, const LangevinIntegrator& integrator);
//private:
// CudaContext& cl;
// double prevTemp, prevFriction, prevStepSize;
// bool hasInitializedKernels;
// CudaArray<cl_float>* params;
// cl::Kernel kernel1, kernel2;
//};
//
///**
// * This kernel is invoked by BrownianIntegrator to take one time step.
// */
//class CudaIntegrateBrownianStepKernel : public IntegrateBrownianStepKernel {
//public:
// CudaIntegrateBrownianStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateBrownianStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false), prevTemp(-1), prevFriction(-1), prevStepSize(-1) {
// }
// ~CudaIntegrateBrownianStepKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the BrownianIntegrator this kernel will be used for
// */
// void initialize(const System& system, const BrownianIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the BrownianIntegrator this kernel is being used for
// */
// void execute(ContextImpl& context, const BrownianIntegrator& integrator);
//private:
// CudaContext& cl;
// double prevTemp, prevFriction, prevStepSize;
// bool hasInitializedKernels;
// cl::Kernel kernel1, kernel2;
//};
//
///**
// * This kernel is invoked by VariableVerletIntegrator to take one time step.
// */
//class CudaIntegrateVariableVerletStepKernel : public IntegrateVariableVerletStepKernel {
//public:
// CudaIntegrateVariableVerletStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVariableVerletStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false) {
// }
// ~CudaIntegrateVariableVerletStepKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the VerletIntegrator this kernel will be used for
// */
// void initialize(const System& system, const VariableVerletIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the VerletIntegrator this kernel is being used for
// * @param maxTime the maximum time beyond which the simulation should not be advanced
// * @return the size of the step that was taken
// */
// double execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime);
//private:
// CudaContext& cl;
// bool hasInitializedKernels;
// int blockSize;
// cl::Kernel kernel1, kernel2, selectSizeKernel;
//};
//
///**
// * This kernel is invoked by VariableLangevinIntegrator to take one time step.
// */
//class CudaIntegrateVariableLangevinStepKernel : public IntegrateVariableLangevinStepKernel {
//public:
// CudaIntegrateVariableLangevinStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateVariableLangevinStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false), params(NULL) {
// }
// ~CudaIntegrateVariableLangevinStepKernel();
// /**
// * Initialize the kernel, setting up the particle masses.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the VariableLangevinIntegrator this kernel will be used for
// */
// void initialize(const System& system, const VariableLangevinIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the VariableLangevinIntegrator this kernel is being used for
// * @param maxTime the maximum time beyond which the simulation should not be advanced
// * @return the size of the step that was taken
// */
// double execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime);
//private:
// CudaContext& cl;
// bool hasInitializedKernels;
// int blockSize;
// CudaArray<cl_float>* params;
// cl::Kernel kernel1, kernel2, selectSizeKernel;
// double prevTemp, prevFriction, prevErrorTol;
//};
//
///**
// * This kernel is invoked by CustomIntegrator to take one time step.
// */
//class CudaIntegrateCustomStepKernel : public IntegrateCustomStepKernel {
//public:
// CudaIntegrateCustomStepKernel(std::string name, const Platform& platform, CudaContext& cl) : IntegrateCustomStepKernel(name, platform), cl(cl),
// hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), contextParameterValues(NULL), sumBuffer(NULL), energy(NULL),
// uniformRandoms(NULL), randomSeed(NULL), perDofValues(NULL) {
// }
// ~CudaIntegrateCustomStepKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param integrator the CustomIntegrator this kernel will be used for
// */
// void initialize(const System& system, const CustomIntegrator& integrator);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// * @param integrator the CustomIntegrator this kernel is being used for
// * @param forcesAreValid if the context has been modified since the last time step, this will be
// * false to show that cached forces are invalid and must be recalculated.
// * On exit, this should specify whether the cached forces are valid at the
// * end of the step.
// */
// void execute(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
// /**
// * Get the values of all global variables.
// *
// * @param context the context in which to execute this kernel
// * @param values on exit, this contains the values
// */
// void getGlobalVariables(ContextImpl& context, std::vector<double>& values) const;
// /**
// * Set the values of all global variables.
// *
// * @param context the context in which to execute this kernel
// * @param values a vector containing the values
// */
// void setGlobalVariables(ContextImpl& context, const std::vector<double>& values);
// /**
// * Get the values of a per-DOF variable.
// *
// * @param context the context in which to execute this kernel
// * @param variable the index of the variable to get
// * @param values on exit, this contains the values
// */
// void getPerDofVariable(ContextImpl& context, int variable, std::vector<Vec3>& values) const;
// /**
// * Set the values of a per-DOF variable.
// *
// * @param context the context in which to execute this kernel
// * @param variable the index of the variable to get
// * @param values a vector containing the values
// */
// void setPerDofVariable(ContextImpl& context, int variable, const std::vector<Vec3>& values);
//private:
// class ReorderListener;
// std::string createGlobalComputation(const std::string& variable, const Lepton::ParsedExpression& expr, CustomIntegrator& integrator, const std::string& energyName);
// std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
// void recordChangedParameters(ContextImpl& context);
// CudaContext& cl;
// double prevStepSize;
// int numGlobalVariables;
// bool hasInitializedKernels, deviceValuesAreCurrent, modifiesParameters;
// mutable bool localValuesAreCurrent;
// CudaArray<cl_float>* globalValues;
// CudaArray<cl_float>* contextParameterValues;
// CudaArray<cl_float>* sumBuffer;
// CudaArray<cl_float>* energy;
// CudaArray<mm_float4>* uniformRandoms;
// CudaArray<mm_int4>* randomSeed;
// CudaParameterSet* perDofValues;
// mutable std::vector<std::vector<cl_float> > localPerDofValues;
// std::vector<std::vector<cl::Kernel> > kernels;
// cl::Kernel sumEnergyKernel, randomKernel;
// std::vector<CustomIntegrator::ComputationType> stepType;
// std::vector<bool> needsForces;
// std::vector<bool> needsEnergy;
// std::vector<bool> invalidatesForces;
// std::vector<bool> merged;
// std::vector<int> forceGroup;
// std::vector<int> requiredGaussian;
// std::vector<int> requiredUniform;
// std::vector<std::string> parameterNames;
//};
//
///**
// * This kernel is invoked by AndersenThermostat at the start of each time step to adjust the particle velocities.
// */
//class CudaApplyAndersenThermostatKernel : public ApplyAndersenThermostatKernel {
//public:
// CudaApplyAndersenThermostatKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyAndersenThermostatKernel(name, platform), cl(cl),
// hasInitializedKernels(false), atomGroups(NULL) {
// }
// ~CudaApplyAndersenThermostatKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param thermostat the AndersenThermostat this kernel will be used for
// */
// void initialize(const System& system, const AndersenThermostat& thermostat);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// */
// void execute(ContextImpl& context);
//private:
// CudaContext& cl;
// bool hasInitializedKernels;
// int randomSeed;
// CudaArray<cl_int>* atomGroups;
// cl::Kernel kernel;
//};
//
///**
// * This kernel is invoked by MonteCarloBarostat to adjust the periodic box volume
// */
//class CudaApplyMonteCarloBarostatKernel : public ApplyMonteCarloBarostatKernel {
//public:
// CudaApplyMonteCarloBarostatKernel(std::string name, const Platform& platform, CudaContext& cl) : ApplyMonteCarloBarostatKernel(name, platform), cl(cl),
// hasInitializedKernels(false), savedPositions(NULL), moleculeAtoms(NULL), moleculeStartIndex(NULL) {
// }
// ~CudaApplyMonteCarloBarostatKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param barostat the MonteCarloBarostat this kernel will be used for
// */
// void initialize(const System& system, const MonteCarloBarostat& barostat);
// /**
// * Attempt a Monte Carlo step, scaling particle positions (or cluster centers) by a specified value.
// * This is called BEFORE the periodic box size is modified. It should begin by translating each particle
// * or cluster into the first periodic box, so that coordinates will still be correct after the box size
// * is changed.
// *
// * @param context the context in which to execute this kernel
// * @param scale the scale factor by which to multiply particle positions
// */
// void scaleCoordinates(ContextImpl& context, double scale);
// /**
// * Reject the most recent Monte Carlo step, restoring the particle positions to where they were before
// * scaleCoordinates() was last called.
// *
// * @param context the context in which to execute this kernel
// */
// void restoreCoordinates(ContextImpl& context);
//private:
// CudaContext& cl;
// bool hasInitializedKernels;
// int numMolecules;
// CudaArray<mm_float4>* savedPositions;
// CudaArray<cl_int>* moleculeAtoms;
// CudaArray<cl_int>* moleculeStartIndex;
// cl::Kernel kernel;
//};
//
///**
// * This kernel is invoked to calculate the kinetic energy of the system.
// */
//class CudaCalcKineticEnergyKernel : public CalcKineticEnergyKernel {
//public:
// CudaCalcKineticEnergyKernel(std::string name, const Platform& platform, CudaContext& cl) : CalcKineticEnergyKernel(name, platform), cl(cl) {
// }
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// */
// void initialize(const System& system);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// */
// double execute(ContextImpl& context);
//private:
// CudaContext& cl;
// std::vector<double> masses;
//};
//
///**
// * This kernel is invoked to remove center of mass motion from the system.
// */
//class CudaRemoveCMMotionKernel : public RemoveCMMotionKernel {
//public:
// CudaRemoveCMMotionKernel(std::string name, const Platform& platform, CudaContext& cl) : RemoveCMMotionKernel(name, platform), cl(cl), cmMomentum(NULL) {
// }
// ~CudaRemoveCMMotionKernel();
// /**
// * Initialize the kernel, setting up the particle masses.
// *
// * @param system the System this kernel will be applied to
// * @param force the CMMotionRemover this kernel will be used for
// */
// void initialize(const System& system, const CMMotionRemover& force);
// /**
// * Execute the kernel.
// *
// * @param context the context in which to execute this kernel
// */
// void execute(ContextImpl& context);
//private:
// CudaContext& cl;
// int frequency;
// CudaArray<mm_float4>* cmMomentum;
// cl::Kernel kernel1, kernel2;
//};
} // namespace OpenMM
#endif /*OPENMM_CUDAKERNELS_H_*/
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaContext.h"
#include "CudaExpressionUtilities.h"
#include "CudaPlatform.h"
#include "CudaKernelFactory.h"
#include "CudaKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/Context.h"
#include "openmm/System.h"
#include <algorithm>
#include <cctype>
#include <sstream>
using namespace OpenMM;
using namespace std;
extern "C" OPENMM_EXPORT void registerPlatforms() {
Platform::registerPlatform(new CudaPlatform());
}
CudaPlatform::CudaPlatform() {
CudaKernelFactory* factory = new CudaKernelFactory();
registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
registerKernelFactory(UpdateStateDataKernel::Name(), factory);
registerKernelFactory(ApplyConstraintsKernel::Name(), factory);
registerKernelFactory(VirtualSitesKernel::Name(), factory);
registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
registerKernelFactory(CalcCustomAngleForceKernel::Name(), factory);
registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcCMAPTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcCustomTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
registerKernelFactory(CalcCustomGBForceKernel::Name(), factory);
registerKernelFactory(CalcCustomExternalForceKernel::Name(), factory);
registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
registerKernelFactory(CalcKineticEnergyKernel::Name(), factory);
registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
platformProperties.push_back(CudaDeviceIndex());
platformProperties.push_back(CudaUseBlockingSync());
platformProperties.push_back(CudaPrecision());
platformProperties.push_back(CudaCompiler());
platformProperties.push_back(CudaTempDirectory());
setPropertyDefaultValue(CudaDeviceIndex(), "");
setPropertyDefaultValue(CudaUseBlockingSync(), "true");
setPropertyDefaultValue(CudaPrecision(), "single");
#ifdef _MSC_VER
setPropertyDefaultValue(CudaCompiler(), "nvcc");
setPropertyDefaultValue(CudaTempDirectory(), string(getenv("TEMP")));
#else
setPropertyDefaultValue(CudaCompiler(), "/usr/local/cuda/bin/nvcc");
setPropertyDefaultValue(CudaTempDirectory(), string(getenv("TMPDIR")));
#endif
}
bool CudaPlatform::supportsDoublePrecision() const {
return false;
}
const string& CudaPlatform::getPropertyValue(const Context& context, const string& property) const {
const ContextImpl& impl = getContextImpl(context);
const PlatformData* data = reinterpret_cast<const PlatformData*>(impl.getPlatformData());
map<string, string>::const_iterator value = data->propertyValues.find(property);
if (value != data->propertyValues.end())
return value->second;
return Platform::getPropertyValue(context, property);
}
void CudaPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
}
void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
const string& devicePropValue = (properties.find(CudaDeviceIndex()) == properties.end() ?
getPropertyDefaultValue(CudaDeviceIndex()) : properties.find(CudaDeviceIndex())->second);
string blockingPropValue = (properties.find(CudaUseBlockingSync()) == properties.end() ?
getPropertyDefaultValue(CudaUseBlockingSync()) : properties.find(CudaUseBlockingSync())->second);
string precisionPropValue = (properties.find(CudaPrecision()) == properties.end() ?
getPropertyDefaultValue(CudaPrecision()) : properties.find(CudaPrecision())->second);
const string& compilerPropValue = (properties.find(CudaCompiler()) == properties.end() ?
getPropertyDefaultValue(CudaCompiler()) : properties.find(CudaCompiler())->second);
const string& tempPropValue = (properties.find(CudaTempDirectory()) == properties.end() ?
getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second);
transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
context.setPlatformData(new PlatformData(context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, compilerPropValue, tempPropValue));
}
void CudaPlatform::contextDestroyed(ContextImpl& context) const {
PlatformData* data = reinterpret_cast<PlatformData*>(context.getPlatformData());
delete data;
}
CudaPlatform::PlatformData::PlatformData(const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
const string& compilerProperty, const string& tempProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
bool blocking = (blockingProperty == "true");
vector<string> devices;
size_t searchPos = 0, nextPos;
while ((nextPos = deviceIndexProperty.find_first_of(", ", searchPos)) != string::npos) {
devices.push_back(deviceIndexProperty.substr(searchPos, nextPos-searchPos));
searchPos = nextPos+1;
}
devices.push_back(deviceIndexProperty.substr(searchPos));
for (int i = 0; i < (int) devices.size(); i++) {
if (devices[i].length() > 0) {
unsigned int deviceIndex;
stringstream(devices[i]) >> deviceIndex;
contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, *this));
}
}
if (contexts.size() == 0)
contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, *this));
stringstream device;
for (int i = 0; i < (int) contexts.size(); i++) {
if (i > 0)
device << ',';
device << contexts[i]->getDeviceIndex();
}
propertyValues[CudaPlatform::CudaDeviceIndex()] = device.str();
propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
contextEnergy.resize(contexts.size());
}
CudaPlatform::PlatformData::~PlatformData() {
for (int i = 0; i < (int) contexts.size(); i++)
delete contexts[i];
}
void CudaPlatform::PlatformData::initializeContexts(const System& system) {
// for (int i = 0; i < (int) contexts.size(); i++)
// contexts[i]->initialize();
}
void CudaPlatform::PlatformData::syncContexts() {
// for (int i = 0; i < (int) contexts.size(); i++)
// contexts[i]->getWorkThread().flush();
}
/**
* This is called by the various functions below to clear a buffer.
*/
__device__ void clearSingleBuffer(int* __restrict__ buffer, int size) {
int index = blockDim.x*blockIdx.x+threadIdx.x;
int4* buffer4 = (int4*) buffer;
int sizeDiv4 = size/4;
while (index < sizeDiv4) {
buffer4[index] = make_int4(0);
index += blockDim.x*gridDim.x;
}
if (blockDim.x*blockIdx.x+threadIdx.x == 0)
for (int i = sizeDiv4*4; i < size; i++)
buffer[i] = 0;
}
/**
* Fill a buffer with 0.
*/
__global__ void clearBuffer(int* __restrict__ buffer, int size) {
clearSingleBuffer(buffer, size);
}
/**
* Fill two buffers with 0.
*/
__global__ void clearTwoBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2) {
clearSingleBuffer(buffer1, size1);
clearSingleBuffer(buffer2, size2);
}
/**
* Fill three buffers with 0.
*/
__global__ void clearThreeBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3) {
clearSingleBuffer(buffer1, size1);
clearSingleBuffer(buffer2, size2);
clearSingleBuffer(buffer3, size3);
}
/**
* Fill four buffers with 0.
*/
__global__ void clearFourBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4) {
clearSingleBuffer(buffer1, size1);
clearSingleBuffer(buffer2, size2);
clearSingleBuffer(buffer3, size3);
clearSingleBuffer(buffer4, size4);
}
/**
* Fill five buffers with 0.
*/
__global__ void clearFiveBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5) {
clearSingleBuffer(buffer1, size1);
clearSingleBuffer(buffer2, size2);
clearSingleBuffer(buffer3, size3);
clearSingleBuffer(buffer4, size4);
clearSingleBuffer(buffer5, size5);
}
/**
* Fill six buffers with 0.
*/
__global__ void clearSixBuffers(int* __restrict__ buffer1, int size1, int* __restrict__ buffer2, int size2, int* __restrict__ buffer3, int size3, int* __restrict__ buffer4, int size4, int* __restrict__ buffer5, int size5, int* __restrict__ buffer6, int size6) {
clearSingleBuffer(buffer1, size1);
clearSingleBuffer(buffer2, size2);
clearSingleBuffer(buffer3, size3);
clearSingleBuffer(buffer4, size4);
clearSingleBuffer(buffer5, size5);
clearSingleBuffer(buffer6, size6);
}
/**
* Sum a collection of buffers into the first one.
*/
__global__ void reduceFloat4Buffer(float4* __restrict__ buffer, int bufferSize, int numBuffers) {
int index = blockDim.x*blockIdx.x+threadIdx.x;
int totalSize = bufferSize*numBuffers;
while (index < bufferSize) {
float4 sum = buffer[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += buffer[i];
buffer[index] = sum;
index += blockDim.x*gridDim.x;
}
}
/**
* Sum the various buffers containing forces.
*/
__global__ void reduceForces(const long* __restrict__ longBuffer, float4* __restrict__ buffer, int bufferSize, int numBuffers) {
int totalSize = bufferSize*numBuffers;
float scale = 1.0f/(float) 0xFFFFFFFF;
for (int index = blockDim.x*blockIdx.x+threadIdx.x; index < bufferSize; index += blockDim.x*gridDim.x) {
float4 sum = make_float4(scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0.0f);
for (int i = index; i < totalSize; i += bufferSize)
sum += buffer[i];
buffer[index] = sum;
}
}
/**
* This file defines vector operations to simplify code elsewhere.
*/
// Versions of make_x() that take a single value and set all components to that.
inline __device__ int2 make_int2(int a) {
return make_int2(a, a);
}
inline __device__ int3 make_int3(int a) {
return make_int3(a, a, a);
}
inline __device__ int4 make_int4(int a) {
return make_int4(a, a, a, a);
}
inline __device__ float2 make_float2(float a) {
return make_float2(a, a);
}
inline __device__ float3 make_float3(float a) {
return make_float3(a, a, a);
}
inline __device__ float4 make_float4(float a) {
return make_float4(a, a, a, a);
}
inline __device__ double2 make_double2(double a) {
return make_double2(a, a);
}
inline __device__ double3 make_double3(double a) {
return make_double3(a, a, a);
}
inline __device__ double4 make_double4(double a) {
return make_double4(a, a, a, a);
}
// Negate a vector.
inline __device__ int2 operator*(int2 a) {
return make_int2(-a.x, -a.y);
}
inline __device__ int3 operator-(int3 a) {
return make_int3(-a.x, -a.y, -a.z);
}
inline __device__ int4 operator-(int4 a) {
return make_int4(-a.x, -a.y, -a.z, -a.w);
}
inline __device__ float2 operator-(float2 a) {
return make_float2(-a.x, -a.y);
}
inline __device__ float3 operator-(float3 a) {
return make_float3(-a.x, -a.y, -a.z);
}
inline __device__ float4 operator-(float4 a) {
return make_float4(-a.x, -a.y, -a.z, -a.w);
}
inline __device__ double2 operator-(double2 a) {
return make_double2(-a.x, -a.y);
}
inline __device__ double3 operator-(double3 a) {
return make_double3(-a.x, -a.y, -a.z);
}
inline __device__ double4 operator-(double4 a) {
return make_double4(-a.x, -a.y, -a.z, -a.w);
}
// Add two vectors.
inline __device__ int2 operator+(int2 a, int2 b) {
return make_int2(a.x+b.x, a.y+b.y);
}
inline __device__ int3 operator+(int3 a, int3 b) {
return make_int3(a.x+b.x, a.y+b.y, a.z+b.z);
}
inline __device__ int4 operator+(int4 a, int4 b) {
return make_int4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
}
inline __device__ float2 operator+(float2 a, float2 b) {
return make_float2(a.x+b.x, a.y+b.y);
}
inline __device__ float3 operator+(float3 a, float3 b) {
return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
}
inline __device__ float4 operator+(float4 a, float4 b) {
return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
}
inline __device__ double2 operator+(double2 a, double2 b) {
return make_double2(a.x+b.x, a.y+b.y);
}
inline __device__ double3 operator+(double3 a, double3 b) {
return make_double3(a.x+b.x, a.y+b.y, a.z+b.z);
}
inline __device__ double4 operator+(double4 a, double4 b) {
return make_double4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
}
// Subtract two vectors.
inline __device__ int2 operator-(int2 a, int2 b) {
return make_int2(a.x-b.x, a.y-b.y);
}
inline __device__ int3 operator-(int3 a, int3 b) {
return make_int3(a.x-b.x, a.y-b.y, a.z-b.z);
}
inline __device__ int4 operator-(int4 a, int4 b) {
return make_int4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
}
inline __device__ float2 operator-(float2 a, float2 b) {
return make_float2(a.x-b.x, a.y-b.y);
}
inline __device__ float3 operator-(float3 a, float3 b) {
return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
}
inline __device__ float4 operator-(float4 a, float4 b) {
return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
}
inline __device__ double2 operator-(double2 a, double2 b) {
return make_double2(a.x-b.x, a.y-b.y);
}
inline __device__ double3 operator-(double3 a, double3 b) {
return make_double3(a.x-b.x, a.y-b.y, a.z-b.z);
}
inline __device__ double4 operator-(double4 a, double4 b) {
return make_double4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
}
// Multiply two vectors.
inline __device__ int2 operator*(int2 a, int2 b) {
return make_int2(a.x*b.x, a.y*b.y);
}
inline __device__ int3 operator*(int3 a, int3 b) {
return make_int3(a.x*b.x, a.y*b.y, a.z*b.z);
}
inline __device__ int4 operator*(int4 a, int4 b) {
return make_int4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
}
inline __device__ float2 operator*(float2 a, float2 b) {
return make_float2(a.x*b.x, a.y*b.y);
}
inline __device__ float3 operator*(float3 a, float3 b) {
return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
}
inline __device__ float4 operator*(float4 a, float4 b) {
return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
}
inline __device__ double2 operator*(double2 a, double2 b) {
return make_double2(a.x*b.x, a.y*b.y);
}
inline __device__ double3 operator*(double3 a, double3 b) {
return make_double3(a.x*b.x, a.y*b.y, a.z*b.z);
}
inline __device__ double4 operator*(double4 a, double4 b) {
return make_double4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
}
// Divide two vectors.
inline __device__ int2 operator/(int2 a, int2 b) {
return make_int2(a.x/b.x, a.y/b.y);
}
inline __device__ int3 operator/(int3 a, int3 b) {
return make_int3(a.x/b.x, a.y/b.y, a.z/b.z);
}
inline __device__ int4 operator/(int4 a, int4 b) {
return make_int4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
}
inline __device__ float2 operator/(float2 a, float2 b) {
return make_float2(a.x/b.x, a.y/b.y);
}
inline __device__ float3 operator/(float3 a, float3 b) {
return make_float3(a.x/b.x, a.y/b.y, a.z/b.z);
}
inline __device__ float4 operator/(float4 a, float4 b) {
return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
}
inline __device__ double2 operator/(double2 a, double2 b) {
return make_double2(a.x/b.x, a.y/b.y);
}
inline __device__ double3 operator/(double3 a, double3 b) {
return make_double3(a.x/b.x, a.y/b.y, a.z/b.z);
}
inline __device__ double4 operator/(double4 a, double4 b) {
return make_double4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
}
// += operator
inline __device__ void operator+=(int2& a, int2 b) {
a.x += b.x; a.y += b.y;
}
inline __device__ void operator+=(int3& a, int3 b) {
a.x += b.x; a.y += b.y; a.z += b.z;
}
inline __device__ void operator+=(int4& a, int4 b) {
a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
}
inline __device__ void operator+=(float2& a, float2 b) {
a.x += b.x; a.y += b.y;
}
inline __device__ void operator+=(float3& a, float3 b) {
a.x += b.x; a.y += b.y; a.z += b.z;
}
inline __device__ void operator+=(float4& a, float4 b) {
a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
}
inline __device__ void operator+=(double2& a, double2 b) {
a.x += b.x; a.y += b.y;
}
inline __device__ void operator+=(double3& a, double3 b) {
a.x += b.x; a.y += b.y; a.z += b.z;
}
inline __device__ void operator+=(double4& a, double4 b) {
a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
}
// -= operator
inline __device__ void operator-=(int2& a, int2 b) {
a.x -= b.x; a.y -= b.y;
}
inline __device__ void operator-=(int3& a, int3 b) {
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
inline __device__ void operator-=(int4& a, int4 b) {
a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
}
inline __device__ void operator-=(float2& a, float2 b) {
a.x -= b.x; a.y -= b.y;
}
inline __device__ void operator-=(float3& a, float3 b) {
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
inline __device__ void operator-=(float4& a, float4 b) {
a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
}
inline __device__ void operator-=(double2& a, double2 b) {
a.x -= b.x; a.y -= b.y;
}
inline __device__ void operator-=(double3& a, double3 b) {
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
inline __device__ void operator-=(double4& a, double4 b) {
a.x -= b.x; a.y -= b.y; a.z -= b.z; a.w -= b.w;
}
// *= operator
inline __device__ void operator*=(int2& a, int2 b) {
a.x *= b.x; a.y *= b.y;
}
inline __device__ void operator*=(int3& a, int3 b) {
a.x *= b.x; a.y *= b.y; a.z *= b.z;
}
inline __device__ void operator*=(int4& a, int4 b) {
a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
}
inline __device__ void operator*=(float2& a, float2 b) {
a.x *= b.x; a.y *= b.y;
}
inline __device__ void operator*=(float3& a, float3 b) {
a.x *= b.x; a.y *= b.y; a.z *= b.z;
}
inline __device__ void operator*=(float4& a, float4 b) {
a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
}
inline __device__ void operator*=(double2& a, double2 b) {
a.x *= b.x; a.y *= b.y;
}
inline __device__ void operator*=(double3& a, double3 b) {
a.x *= b.x; a.y *= b.y; a.z *= b.z;
}
inline __device__ void operator*=(double4& a, double4 b) {
a.x *= b.x; a.y *= b.y; a.z *= b.z; a.w *= b.w;
}
// /= operator
inline __device__ void operator/=(int2& a, int2 b) {
a.x /= b.x; a.y /= b.y;
}
inline __device__ void operator/=(int3& a, int3 b) {
a.x /= b.x; a.y /= b.y; a.z /= b.z;
}
inline __device__ void operator/=(int4& a, int4 b) {
a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
}
inline __device__ void operator/=(float2& a, float2 b) {
a.x /= b.x; a.y /= b.y;
}
inline __device__ void operator/=(float3& a, float3 b) {
a.x /= b.x; a.y /= b.y; a.z /= b.z;
}
inline __device__ void operator/=(float4& a, float4 b) {
a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
}
inline __device__ void operator/=(double2& a, double2 b) {
a.x /= b.x; a.y /= b.y;
}
inline __device__ void operator/=(double3& a, double3 b) {
a.x /= b.x; a.y /= b.y; a.z /= b.z;
}
inline __device__ void operator/=(double4& a, double4 b) {
a.x /= b.x; a.y /= b.y; a.z /= b.z; a.w /= b.w;
}
// Multiply a vector by a constant.
inline __device__ int2 operator*(int2 a, int b) {
return make_int2(a.x*b, a.y*b);
}
inline __device__ int3 operator*(int3 a, int b) {
return make_int3(a.x*b, a.y*b, a.z*b);
}
inline __device__ int4 operator*(int4 a, int b) {
return make_int4(a.x*b, a.y*b, a.z*b, a.w*b);
}
inline __device__ int2 operator*(int a, int2 b) {
return make_int2(a*b.x, a*b.y);
}
inline __device__ int3 operator*(int a, int3 b) {
return make_int3(a*b.x, a*b.y, a*b.z);
}
inline __device__ int4 operator*(int a, int4 b) {
return make_int4(a*b.x, a*b.y, a*b.z, a*b.w);
}
inline __device__ float2 operator*(float2 a, float b) {
return make_float2(a.x*b, a.y*b);
}
inline __device__ float3 operator*(float3 a, float b) {
return make_float3(a.x*b, a.y*b, a.z*b);
}
inline __device__ float4 operator*(float4 a, float b) {
return make_float4(a.x*b, a.y*b, a.z*b, a.w*b);
}
inline __device__ float2 operator*(float a, float2 b) {
return make_float2(a*b.x, a*b.y);
}
inline __device__ float3 operator*(float a, float3 b) {
return make_float3(a*b.x, a*b.y, a*b.z);
}
inline __device__ float4 operator*(float a, float4 b) {
return make_float4(a*b.x, a*b.y, a*b.z, a*b.w);
}
inline __device__ double2 operator*(double2 a, double b) {
return make_double2(a.x*b, a.y*b);
}
inline __device__ double3 operator*(double3 a, double b) {
return make_double3(a.x*b, a.y*b, a.z*b);
}
inline __device__ double4 operator*(double4 a, double b) {
return make_double4(a.x*b, a.y*b, a.z*b, a.w*b);
}
inline __device__ double2 operator*(double a, double2 b) {
return make_double2(a*b.x, a*b.y);
}
inline __device__ double3 operator*(double a, double3 b) {
return make_double3(a*b.x, a*b.y, a*b.z);
}
inline __device__ double4 operator*(double a, double4 b) {
return make_double4(a*b.x, a*b.y, a*b.z, a*b.w);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment