"wrappers/python/tests/systems/crds_box.rst7" did not exist on "5cf7f74cc81fcc1ce9f887e198026c46fe437340"
Unverified Commit 8e8923a7 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Converted AMOEBA to common platform (#3120)

* Began converting AMOEBA to common platform

* Beginning of OpenCL platform for AMOEBA

* Converted AmoebaVdwForce to common platform

* Cleaned up reference AMOEBA tests

* Began converting AmoebaMultipoleForce to common platform

* Continue converting AmoebaMultipoleForce to common platform

* Bug fixes

* Bug fix

* Continue converting AmoebaMultipoleForce to common platform

* Converting AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce to common platform

* Converting AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce to common platform

* Creating OpenCL version of AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce

* Creating OpenCL version of AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce

* Creating OpenCL version of AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce

* Converted arrays from real3 to real

* Bug fix to OpenCL AmoebaGeneralizedKirkwoodForce

* Fixes for AMD GPUs

* Began converting HippoNonbondedForce to common platform

* Continuing to convert HippoNonbondedForce to common platform

* Continuing to convert HippoNonbondedForce to common platform

* Working on unifying PME kernels

* Fixed error on devices without 64 bit atomics

* Unified PME kernels

* Converted HippoNonbondedForce to common platform

* Creating OpenCL implementation of HippoNonbondedForce

* Continuing OpenCL implementation of HippoNonbondedForce

* Mostly finished OpenCL implementation of HippoNonbondedForce

* Eliminated three component vector types in host code

* Fix errors on CPU OpenCL

* Skip double precision tests for AMOEBA on OpenCL

* Bug fixes

* Bug fixes

* Fixed compilation error
parent 393a4dbd
......@@ -681,6 +681,19 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
}
}
// Choose a thread block size for a kernel that needs `memory` units of local memory per
// thread (in the same units the device reports for CL_DEVICE_LOCAL_MEM_SIZE).
// Returns 32 if fewer than 64 threads fit; otherwise starts at 64 and grows in steps of 64
// for as long as the next step is still strictly below the computed limit.
int OpenCLContext::computeThreadBlockSize(double memory) const {
    const int localMemorySize = device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
    // Budget only half of the reported local memory: some implementations consume more
    // than the sum of the field sizes we account for.
    const int threadLimit = (int) (0.5*localMemorySize/memory);
    if (threadLimit < 64)
        return 32;
    int blockSize = 64;
    for (int candidate = blockSize+64; candidate < threadLimit; candidate += 64)
        blockSize = candidate;
    return blockSize;
}
// Clear an entire array: unwrap it to reach its device buffer and forward to the
// buffer/size overload of clearBuffer() with the array's total size
// (element count times element size).
void OpenCLContext::clearBuffer(ArrayInterface& array) {
    const auto totalBytes = array.getSize()*array.getElementSize();
    clearBuffer(unwrap(array).getDeviceBuffer(), totalBytes);
}
......
......@@ -26,6 +26,7 @@
#include "OpenCLKernel.h"
#include "openmm/common/ComputeArray.h"
#include "openmm/internal/AssertionUtilities.h"
using namespace OpenMM;
using namespace std;
......@@ -37,6 +38,10 @@ string OpenCLKernel::getName() const {
return kernel.getInfo<CL_KERNEL_FUNCTION_NAME>();
}
// Report the work group size limit that the OpenCL runtime gives for this kernel
// (CL_KERNEL_WORK_GROUP_SIZE) on the context's device.
int OpenCLKernel::getMaxBlockSize() const {
    const auto maxWorkGroupSize = kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
    return (int) maxWorkGroupSize;
}
void OpenCLKernel::execute(int threads, int blockSize) {
// Set args that are specified by OpenCLArrays. We can't do this earlier, because it's
// possible resize() will get called on an array, causing its internal storage to be
......@@ -65,10 +70,12 @@ void OpenCLKernel::addEmptyArg() {
}
// Record the array to use for argument `index`.  Only a pointer to the unwrapped array is
// stored here; as the comment in execute() notes, array arguments are set at execution
// time because an array may be resized in the meantime.
void OpenCLKernel::setArrayArg(int index, ArrayInterface& value) {
    ASSERT_VALID_INDEX(index, arrayArgs);
    auto& unwrapped = context.unwrap(value);
    arrayArgs[index] = &unwrapped;
}
void OpenCLKernel::setPrimitiveArg(int index, const void* value, int size) {
ASSERT_VALID_INDEX(index, arrayArgs);
// The const_cast is needed because of a bug in the OpenCL C++ wrappers. clSetKernelArg()
// declares the value to be const, but the C++ wrapper doesn't.
kernel.setArg(index, size, const_cast<void*>(value));
......
This diff is collapsed.
......@@ -91,6 +91,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
}
pinnedCountBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, sizeof(int));
pinnedCountMemory = (int*) context.getQueue().enqueueMapBuffer(*pinnedCountBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(int));
setKernelSource(deviceIsCpu ? OpenCLKernelSources::nonbonded_cpu : OpenCLKernelSources::nonbonded);
}
OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
......@@ -127,7 +128,7 @@ void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic
// Register a parameter described by a ComputeParameterInfo.  It is converted into this
// class's ParameterInfo (name, component type, component count, size, unwrapped device
// buffer, and the isConstant() flag) and appended to `parameters`.
// NOTE(review): the kernel generator emits `const` for a parameter only when isConstant()
// is true, so the flag must be forwarded here.
void OpenCLNonbondedUtilities::addParameter(ComputeParameterInfo parameter) {
    parameters.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
            parameter.getSize(), context.unwrap(parameter.getArray()).getDeviceBuffer(), parameter.isConstant()));
}
void OpenCLNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
......@@ -136,7 +137,7 @@ void OpenCLNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
// Register an extra kernel argument described by a ComputeParameterInfo.  It is converted
// into this class's ParameterInfo (name, component type, component count, size, unwrapped
// device buffer, and the isConstant() flag) and appended to `arguments`.
// NOTE(review): the kernel generator consults isConstant() when deciding whether to emit
// `const` for a __global argument, so the flag must be forwarded here.
void OpenCLNonbondedUtilities::addArgument(ComputeParameterInfo parameter) {
    arguments.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
            parameter.getSize(), context.unwrap(parameter.getArray()).getDeviceBuffer(), parameter.isConstant()));
}
void OpenCLNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
......@@ -556,97 +557,108 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
const string suffixes[] = {"x", "y", "z", "w"};
stringstream localData;
int localDataSize = 0;
for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1)
localData<<params[i].getType()<<" "<<params[i].getName()<<";\n";
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
localData<<param.getType()<<" "<<param.getName()<<";\n";
else {
for (int j = 0; j < params[i].getNumComponents(); ++j)
localData<<params[i].getComponentType()<<" "<<params[i].getName()<<"_"<<suffixes[j]<<";\n";
for (int j = 0; j < param.getNumComponents(); ++j)
localData<<param.getComponentType()<<" "<<param.getName()<<"_"<<suffixes[j]<<";\n";
}
localDataSize += params[i].getSize();
localDataSize += param.getSize();
}
replacements["ATOM_PARAMETER_DATA"] = localData.str();
stringstream args;
for (int i = 0; i < (int) params.size(); i++) {
args << ", __global const ";
args << params[i].getType();
for (const ParameterInfo& param : params) {
args << ", __global ";
if (param.isConstant())
args << "const ";
if (param.getNumComponents() == 3)
args << param.getComponentType();
else
args << param.getType();
args << "* restrict global_";
args << params[i].getName();
args << param.getName();
}
for (int i = 0; i < (int) arguments.size(); i++) {
if (arguments[i].getMemory().getInfo<CL_MEM_TYPE>() == CL_MEM_OBJECT_IMAGE2D) {
for (const ParameterInfo& arg : arguments) {
if (arg.getMemory().getInfo<CL_MEM_TYPE>() == CL_MEM_OBJECT_IMAGE2D) {
args << ", __read_only image2d_t ";
args << arguments[i].getName();
args << arg.getName();
}
else {
if ((arguments[i].getMemory().getInfo<CL_MEM_FLAGS>() & CL_MEM_READ_ONLY) == 0)
args << ", __global const ";
if ((arg.getMemory().getInfo<CL_MEM_FLAGS>() & CL_MEM_READ_ONLY) == 0) {
args << ", __global ";
if (arg.isConstant())
args << "const ";
}
else
args << ", __constant ";
args << arguments[i].getType();
args << arg.getType();
args << "* restrict ";
args << arguments[i].getName();
args << arg.getName();
}
}
if (energyParameterDerivatives.size() > 0)
args << ", __global mixed* restrict energyParamDerivs";
replacements["PARAMETER_ARGUMENTS"] = args.str();
stringstream loadLocal1;
for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) {
loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<" = "<<params[i].getName()<<"1;\n";
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1) {
loadLocal1<<"localData[localAtomIndex]."<<param.getName()<<" = "<<param.getName()<<"1;\n";
}
else {
for (int j = 0; j < params[i].getNumComponents(); ++j)
loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = "<<params[i].getName()<<"1."<<suffixes[j]<<";\n";
for (int j = 0; j < param.getNumComponents(); ++j)
loadLocal1<<"localData[localAtomIndex]."<<param.getName()<<"_"<<suffixes[j]<<" = "<<param.getName()<<"1."<<suffixes[j]<<";\n";
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
replacements["DECLARE_LOCAL_PARAMETERS"] = "";
stringstream loadLocal2;
for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) {
loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1) {
loadLocal2<<"localData[localAtomIndex]."<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
}
else {
loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
for (int j = 0; j < params[i].getNumComponents(); ++j)
loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
if (param.getNumComponents() == 3)
loadLocal2<<param.getType()<<" temp_"<<param.getName()<<" = make_"<<param.getType()<<"(global_"<<param.getName()<<"[3*j], global_"<<param.getName()<<"[3*j+1], global_"<<param.getName()<<"[3*j+2]);\n";
else
loadLocal2<<param.getType()<<" temp_"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
for (int j = 0; j < param.getNumComponents(); ++j)
loadLocal2<<"localData[localAtomIndex]."<<param.getName()<<"_"<<suffixes[j]<<" = temp_"<<param.getName()<<"."<<suffixes[j]<<";\n";
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
stringstream load1;
for (int i = 0; i < (int) params.size(); i++) {
load1 << params[i].getType();
load1 << " ";
load1 << params[i].getName();
load1 << "1 = global_";
load1 << params[i].getName();
load1 << "[atom1];\n";
for (const ParameterInfo& param : params) {
load1<<param.getType()<<" "<<param.getName()<<"1 = ";
if (param.getNumComponents() == 3)
load1<<"make_"<<param.getType()<<"(global_"<<param.getName()<<"[3*atom1], global_"<<param.getName()<<"[3*atom1+1], global_"<<param.getName()<<"[3*atom1+2]);\n";
else
load1<<"global_"<<param.getName()<<"[atom1];\n";
}
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
stringstream load2j;
for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1) {
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = localData[atom2]."<<params[i].getName()<<";\n";
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1) {
load2j<<param.getType()<<" "<<param.getName()<<"2 = localData[atom2]."<<param.getName()<<";\n";
}
else {
load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = ("<<params[i].getType()<<") (";
for (int j = 0; j < params[i].getNumComponents(); ++j) {
load2j<<param.getType()<<" "<<param.getName()<<"2 = ("<<param.getType()<<") (";
for (int j = 0; j < param.getNumComponents(); ++j) {
if (j > 0)
load2j<<", ";
load2j<<"localData[atom2]."<<params[i].getName()<<"_"<<suffixes[j];
load2j<<"localData[atom2]."<<param.getName()<<"_"<<suffixes[j];
}
load2j<<");\n";
}
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
stringstream clearLocal;
for (int i = 0; i < (int) params.size(); i++) {
if (params[i].getNumComponents() == 1)
clearLocal<<"localData[localAtomIndex]."<<params[i].getName()<<" = 0;\n";
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
clearLocal<<"localData[localAtomIndex]."<<param.getName()<<" = 0;\n";
else
for (int j = 0; j < params[i].getNumComponents(); ++j)
clearLocal<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = 0;\n";
for (int j = 0; j < param.getNumComponents(); ++j)
clearLocal<<"localData[localAtomIndex]."<<param.getName()<<"_"<<suffixes[j]<<" = 0;\n";
}
replacements["CLEAR_LOCAL_PARAMETERS"] = clearLocal.str();
stringstream initDerivs;
......@@ -659,7 +671,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
for (int i = 0; i < energyParameterDerivatives.size(); i++)
for (int index = 0; index < numDerivs; index++)
if (allParamDerivNames[index] == energyParameterDerivatives[i])
saveDerivs<<"energyParamDerivs[get_global_id(0)*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
saveDerivs<<"energyParamDerivs[GLOBAL_ID*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
map<string, string> defines;
if (useCutoff)
......@@ -676,6 +688,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["INCLUDE_FORCES"] = "1";
if (includeEnergy)
defines["INCLUDE_ENERGY"] = "1";
defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
defines["FORCE_WORK_GROUP_SIZE"] = context.intToString(forceThreadBlockSize);
double maxCutoff = 0.0;
for (int i = 0; i < 32; i++) {
......@@ -700,12 +713,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
if ((localDataSize/4)%2 == 0)
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
string file;
if (deviceIsCpu)
file = OpenCLKernelSources::nonbonded_cpu;
else
file = OpenCLKernelSources::nonbonded;
cl::Program program = context.createProgram(context.replaceStrings(file, replacements), defines);
cl::Program program = context.createProgram(context.replaceStrings(kernelSource, replacements), defines);
cl::Kernel kernel(program, "computeNonbonded");
// Set arguments to the Kernel.
......@@ -730,13 +738,15 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, blockBoundingBox.getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactingAtoms.getDeviceBuffer());
}
for (int i = 0; i < (int) params.size(); i++) {
kernel.setArg<cl::Memory>(index++, params[i].getMemory());
}
for (int i = 0; i < (int) arguments.size(); i++) {
kernel.setArg<cl::Memory>(index++, arguments[i].getMemory());
}
for (const ParameterInfo& param : params)
kernel.setArg<cl::Memory>(index++, param.getMemory());
for (const ParameterInfo& arg : arguments)
kernel.setArg<cl::Memory>(index++, arg.getMemory());
if (energyParameterDerivatives.size() > 0)
kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
return kernel;
}
void OpenCLNonbondedUtilities::setKernelSource(const string& source) {
kernelSource = source;
}
......@@ -34,6 +34,7 @@ __kernel void computeNonbonded(
const unsigned int warp = get_global_id(0)/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localAtomIndex = get_local_id(0);
mixed energy = 0;
INIT_DERIVATIVES
__local AtomData localData[FORCE_WORK_GROUP_SIZE];
......@@ -57,7 +58,6 @@ __kernel void computeNonbonded(
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
......@@ -105,7 +105,6 @@ __kernel void computeNonbonded(
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0);
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
......@@ -266,7 +265,6 @@ __kernel void computeNonbonded(
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = get_local_id(0);
#ifdef USE_CUTOFF
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else
......
......@@ -124,6 +124,7 @@ ENDIF(OPENMM_BUILD_STATIC_LIB)
# Which hardware platforms to build
ADD_SUBDIRECTORY(platforms/reference)
ADD_SUBDIRECTORY(platforms/common)
IF(OPENMM_BUILD_CUDA_LIB)
SET(OPENMM_BUILD_AMOEBA_CUDA_LIB ON CACHE BOOL "Build OpenMMAmoebaCuda library for Nvidia GPUs")
......@@ -131,7 +132,12 @@ ELSE(OPENMM_BUILD_CUDA_LIB)
SET(OPENMM_BUILD_AMOEBA_CUDA_LIB OFF CACHE BOOL "Build OpenMMAmoebaCuda library for Nvidia GPUs")
ENDIF(OPENMM_BUILD_CUDA_LIB)
SET(OPENMM_BUILD_AMOEBA_PATH)
# Default for building the AMOEBA OpenCL plugin: ON when the core OpenCL platform is being
# built, OFF otherwise.  This is a cache entry set without FORCE, so (per CMake's SET
# semantics) a value already stored in the cache is not overwritten by this default.
IF(OPENMM_BUILD_OPENCL_LIB)
SET(OPENMM_BUILD_AMOEBA_OPENCL_LIB ON CACHE BOOL "Build OpenMMAmoebaOpenCL library")
ELSE(OPENMM_BUILD_OPENCL_LIB)
SET(OPENMM_BUILD_AMOEBA_OPENCL_LIB OFF CACHE BOOL "Build OpenMMAmoebaOpenCL library")
ENDIF(OPENMM_BUILD_OPENCL_LIB)
SET(OPENMM_BUILD_AMOEBA_CUDA_PATH)
IF(OPENMM_BUILD_AMOEBA_CUDA_LIB)
ADD_SUBDIRECTORY(platforms/cuda)
......@@ -139,6 +145,13 @@ IF(OPENMM_BUILD_AMOEBA_CUDA_LIB)
SET(OPENMM_AMOEBA_CUDA_SOURCE_SUBDIRS . openmmapi olla platforms/cuda)
ENDIF(OPENMM_BUILD_AMOEBA_CUDA_LIB)
# Start with no OpenCL plugin path (SET with no value leaves the variable unset); it is
# only filled in below when the AMOEBA OpenCL plugin is enabled.
SET(OPENMM_BUILD_AMOEBA_OPENCL_PATH)
IF(OPENMM_BUILD_AMOEBA_OPENCL_LIB)
# Build the plugin from platforms/opencl and record where it lives.
ADD_SUBDIRECTORY(platforms/opencl)
SET(OPENMM_BUILD_AMOEBA_OPENCL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/platforms/opencl)
# Source subdirectories associated with the OpenCL plugin (mirrors the CUDA list above).
SET(OPENMM_AMOEBA_OPENCL_SOURCE_SUBDIRS . openmmapi olla platforms/opencl)
ENDIF(OPENMM_BUILD_AMOEBA_OPENCL_LIB)
INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${SHARED_AMOEBA_TARGET})
IF(OPENMM_BUILD_STATIC_LIB)
INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${STATIC_AMOEBA_TARGET})
......
# Encode the kernel sources into a C++ class.
# The kernel sources live under src/ in this directory; the generated
# CommonAmoebaKernelSources.cpp/.h are written under src/ in the build directory.
SET(KERNEL_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
SET(KERNEL_SOURCE_CLASS CommonAmoebaKernelSources)
SET(KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.cpp)
SET(KERNELS_H ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.h)
# Put the build-tree src/ directory first on the include path so the generated header is found.
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/src)
# Collect the kernel files (*.cc) so the generation step below can depend on them.
# NOTE(review): a GLOB is evaluated when CMake is run, so newly added kernel files
# presumably require re-running CMake to be picked up -- confirm.
FILE(GLOB COMMON_KERNELS ${KERNEL_SOURCE_DIR}/kernels/*.cc)
# Generate the .cpp/.h pair by running EncodeKernelFiles.cmake in script mode (-P),
# passing the directories, output paths, class name and kernel file extension via -D.
# The outputs are regenerated whenever one of the globbed kernel files changes.
ADD_CUSTOM_COMMAND(OUTPUT ${KERNELS_CPP} ${KERNELS_H}
COMMAND ${CMAKE_COMMAND}
ARGS -D KERNEL_SOURCE_DIR=${KERNEL_SOURCE_DIR} -D KERNELS_CPP=${KERNELS_CPP} -D KERNELS_H=${KERNELS_H} -D KERNEL_SOURCE_CLASS=${KERNEL_SOURCE_CLASS} -D KERNEL_FILE_EXTENSION=cc -P ${CMAKE_SOURCE_DIR}/cmake_modules/EncodeKernelFiles.cmake
DEPENDS ${COMMON_KERNELS}
)
# Mark the outputs as generated so CMake does not expect them to exist at configure time.
SET_SOURCE_FILES_PROPERTIES(${KERNELS_CPP} ${KERNELS_H} PROPERTIES GENERATED TRUE)
# Target that other targets can depend on to make sure the kernel sources have been encoded.
ADD_CUSTOM_TARGET(AmoebaCommonKernels DEPENDS ${KERNELS_CPP} ${KERNELS_H})
This diff is collapsed.
This diff is collapsed.
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CommonAmoebaKernelSources.h"
using namespace OpenMM;
using namespace std;
#ifndef OPENMM_COMMONAMOEBAKERNELSOURCES_H_
#define OPENMM_COMMONAMOEBAKERNELSOURCES_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <string>
namespace OpenMM {
/**
 * This class is a central holding place for the source code of device kernels.
 * The CMake build script inserts declarations into it based on the .cc files in the
 * kernels subfolder.
 *
 * This file is a template: the KERNEL_FILE_DECLARATIONS placeholder in the class body is
 * replaced when the header is generated (presumably by cmake_modules/EncodeKernelFiles.cmake,
 * which the build invokes for this class name -- confirm).
 */
class CommonAmoebaKernelSources {
public:
@KERNEL_FILE_DECLARATIONS@
};
} // namespace OpenMM
#endif /*OPENMM_COMMONAMOEBAKERNELSOURCES_H_*/
/**
* Clear the forces, and compute the position to use for each atom based on the bond reduction factors.
*/
extern "C" __global__ void prepareToComputeForce(unsigned long long* __restrict__ forceBuffers, real4* __restrict__ posq, const real4* __restrict__ tempPosq,
const int* __restrict__ bondReductionAtoms, const float* __restrict__ bondReductionFactors) {
for (unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < PADDED_NUM_ATOMS; atom += blockDim.x*gridDim.x) {
KERNEL void prepareToComputeForce(GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL real4* RESTRICT posq, GLOBAL const real4* RESTRICT tempPosq,
GLOBAL const int* RESTRICT bondReductionAtoms, GLOBAL const float* RESTRICT bondReductionFactors) {
for (unsigned int atom = GLOBAL_ID; atom < PADDED_NUM_ATOMS; atom += GLOBAL_SIZE) {
forceBuffers[atom] = 0;
forceBuffers[atom+PADDED_NUM_ATOMS] = 0;
forceBuffers[atom+PADDED_NUM_ATOMS*2] = 0;
......@@ -19,27 +19,27 @@ extern "C" __global__ void prepareToComputeForce(unsigned long long* __restrict_
/**
* Spread the forces between atoms based on the bond reduction factors.
*/
extern "C" __global__ void spreadForces(const unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ tempForceBuffers,
const int* __restrict__ bondReductionAtoms, const float* __restrict__ bondReductionFactors) {
for (unsigned int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < PADDED_NUM_ATOMS; atom1 += blockDim.x*gridDim.x) {
KERNEL void spreadForces(GLOBAL const mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT tempForceBuffers,
GLOBAL const int* RESTRICT bondReductionAtoms, GLOBAL const float* RESTRICT bondReductionFactors) {
for (unsigned int atom1 = GLOBAL_ID; atom1 < PADDED_NUM_ATOMS; atom1 += GLOBAL_SIZE) {
int atom2 = bondReductionAtoms[atom1];
long long fx1 = forceBuffers[atom1];
long long fy1 = forceBuffers[atom1+PADDED_NUM_ATOMS];
long long fz1 = forceBuffers[atom1+PADDED_NUM_ATOMS*2];
mm_long fx1 = forceBuffers[atom1];
mm_long fy1 = forceBuffers[atom1+PADDED_NUM_ATOMS];
mm_long fz1 = forceBuffers[atom1+PADDED_NUM_ATOMS*2];
if (atom1 != atom2) {
double factor = (double) bondReductionFactors[atom1];
long long fx2 = (long long) ((1-factor)*fx1);
long long fy2 = (long long) ((1-factor)*fy1);
long long fz2 = (long long) ((1-factor)*fz1);
atomicAdd(&tempForceBuffers[atom2], static_cast<unsigned long long>(fx2));
atomicAdd(&tempForceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(fy2));
atomicAdd(&tempForceBuffers[atom2+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>(fz2));
fx1 = (long long) (factor*fx1);
fy1 = (long long) (factor*fy1);
fz1 = (long long) (factor*fz1);
mm_long fx2 = (mm_long) ((1-factor)*fx1);
mm_long fy2 = (mm_long) ((1-factor)*fy1);
mm_long fz2 = (mm_long) ((1-factor)*fz1);
ATOMIC_ADD(&tempForceBuffers[atom2], (mm_ulong) fx2);
ATOMIC_ADD(&tempForceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) fy2);
ATOMIC_ADD(&tempForceBuffers[atom2+PADDED_NUM_ATOMS*2], (mm_ulong) fz2);
fx1 = (mm_long) (factor*fx1);
fy1 = (mm_long) (factor*fy1);
fz1 = (mm_long) (factor*fz1);
}
atomicAdd(&tempForceBuffers[atom1], static_cast<unsigned long long>(fx1));
atomicAdd(&tempForceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(fy1));
atomicAdd(&tempForceBuffers[atom1+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>(fz1));
ATOMIC_ADD(&tempForceBuffers[atom1], (mm_ulong) fx1);
ATOMIC_ADD(&tempForceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) fy1);
ATOMIC_ADD(&tempForceBuffers[atom1+PADDED_NUM_ATOMS*2], (mm_ulong) fz1);
}
}
......@@ -6,35 +6,37 @@ typedef struct {
float radius, epsilon, padding;
} AtomData;
inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const float2* __restrict__ radiusEpsilon) {
inline DEVICE AtomData loadAtomData(int atom, GLOBAL const real4* RESTRICT posq, GLOBAL const float2* RESTRICT radiusEpsilon) {
AtomData data;
real4 atomPosq = posq[atom];
data.pos = make_real3(atomPosq.x, atomPosq.y, atomPosq.z);
float2 temp = radiusEpsilon[atom];
data.radius = temp.x;
data.epsilon = temp.y;
return data;
}
__device__ void initParticleParameters(float radius, float epsilon, real& rmixo, real& rmixh, real& emixo, real& emixh) {
DEVICE void initParticleParameters(float radius, float epsilon, real* rmixo, real* rmixh, real* emixo, real* emixh) {
real sqrtEps = SQRT(epsilon);
real denominator = SQRT(EPSO) + sqrtEps;
emixo = 4*EPSO*epsilon / (denominator*denominator);
*emixo = 4*EPSO*epsilon / (denominator*denominator);
denominator = SQRT(EPSH) + sqrtEps;
emixh = 4*EPSH*epsilon / (denominator*denominator);
*emixh = 4*EPSH*epsilon / (denominator*denominator);
real radius2 = radius*radius;
real rmino2 = RMINO*RMINO;
rmixo = 2*(rmino2*RMINO + radius2*radius) / (rmino2 + radius2);
*rmixo = 2*(rmino2*RMINO + radius2*radius) / (rmino2 + radius2);
real rminh2 = RMINH*RMINH;
rmixh = 2*(rminh2*RMINH + radius2*radius) / (rminh2+radius2);
*rmixh = 2*(rminh2*RMINH + radius2*radius) / (rminh2+radius2);
}
__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real rmixo, real rmixh, real emixo, real emixh, real3& force, real& energy) {
DEVICE void computeOneInteraction(AtomData atom1, AtomData atom2, real rmixo, real rmixh, real emixo, real emixh, real3* force, real* energy) {
// get deltaR and r between 2 atoms
force = atom2.pos - atom1.pos;
real r2 = dot(force, force);
*force = atom2.pos - atom1.pos;
real r2 = dot(*force, *force);
if (r2 <= 0) {
force = make_real3(0);
energy = 0;
*force = make_real3(0);
*energy = 0;
return;
}
real rI = RSQRT(r2);
......@@ -43,8 +45,8 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real rmi
real sk = atom2.radius*SHCTD;
real sk2 = sk*sk;
if (atom1.radius >= (r+sk)) {
force = make_real3(0);
energy = 0;
*force = make_real3(0);
*energy = 0;
return;
}
......@@ -183,30 +185,30 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real rmi
de += mask2*(ah*rmixh7*M_PI*(dl+du)/(30*r2));
sum += mask2*(irep+idisp);
energy = sum;
*energy = sum;
de *= -AWATER*rI;
force *= de;
*force *= de;
}
/**
* Compute WCA interaction.
*/
extern "C" __global__ void computeWCAForce(unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer,
const real4* __restrict__ posq, unsigned int startTileIndex, unsigned int numTileIndices, const float2* __restrict__ radiusEpsilon) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
KERNEL void computeWCAForce(GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mixed* RESTRICT energyBuffer,
GLOBAL const real4* RESTRICT posq, unsigned int startTileIndex, unsigned int numTileIndices, GLOBAL const float2* RESTRICT radiusEpsilon) {
unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int numTiles = numTileIndices;
unsigned int pos = (unsigned int) (startTileIndex+warp*(long long)numTiles/totalWarps);
unsigned int end = (unsigned int) (startTileIndex+(warp+1)*(long long)numTiles/totalWarps);
unsigned int pos = (unsigned int) (startTileIndex+warp*(mm_long)numTiles/totalWarps);
unsigned int end = (unsigned int) (startTileIndex+(warp+1)*(mm_long)numTiles/totalWarps);
mixed energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
LOCAL AtomData localData[THREAD_BLOCK_SIZE];
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
const unsigned int tbx = LOCAL_ID - tgx;
const unsigned int localGroupIndex = LOCAL_ID/TILE_SIZE;
int x, y;
AtomData data;
if (pos < end) {
......@@ -217,12 +219,13 @@ extern "C" __global__ void computeWCAForce(unsigned long long* __restrict__ forc
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
unsigned int atom1 = x*TILE_SIZE + tgx;
loadAtomData(data, atom1, posq, radiusEpsilon);
loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, radiusEpsilon);
data = loadAtomData(atom1, posq, radiusEpsilon);
localData[LOCAL_ID] = loadAtomData(y*TILE_SIZE+tgx, posq, radiusEpsilon);
real emixo, emixh, rmixo, rmixh;
initParticleParameters(data.radius, data.epsilon, rmixo, rmixh, emixo, emixh);
initParticleParameters(data.radius, data.epsilon, &rmixo, &rmixh, &emixo, &emixh);
data.force = make_real3(0);
localData[threadIdx.x].force = make_real3(0);
localData[LOCAL_ID].force = make_real3(0);
SYNC_WARPS;
// Compute forces.
......@@ -232,31 +235,32 @@ extern "C" __global__ void computeWCAForce(unsigned long long* __restrict__ forc
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
computeOneInteraction(data, localData[tbx+tj], rmixo, rmixh, emixo, emixh, tempForce, tempEnergy);
computeOneInteraction(data, localData[tbx+tj], rmixo, rmixh, emixo, emixh, &tempForce, &tempEnergy);
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
energy += (x == y ? 0.5f*tempEnergy : tempEnergy);
real emjxo, emjxh, rmjxo, rmjxh;
initParticleParameters(localData[tbx+tj].radius, localData[tbx+tj].epsilon, rmjxo, rmjxh, emjxo, emjxh);
computeOneInteraction(localData[tbx+tj], data, rmjxo, rmjxh, emjxo, emjxh, tempForce, tempEnergy);
initParticleParameters(localData[tbx+tj].radius, localData[tbx+tj].epsilon, &rmjxo, &rmjxh, &emjxo, &emjxh);
computeOneInteraction(localData[tbx+tj], data, rmjxo, rmjxh, emjxo, emjxh, &tempForce, &tempEnergy);
data.force -= tempForce;
localData[tbx+tj].force += tempForce;
energy += (x == y ? 0.5f*tempEnergy : tempEnergy);
}
tj = (tj+1) & (TILE_SIZE-1);
SYNC_WARPS;
}
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (data.force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (data.force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (data.force.z*0x100000000)));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].force.x*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].force.y*0x100000000)));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].force.z*0x100000000)));
}
}
pos++;
} while (pos < end);
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] -= AWATER*energy;
energyBuffer[GLOBAL_ID] -= AWATER*energy;
}
\ No newline at end of file
__device__ void bicubic(real4 y, real4 y1i, real4 y2i, real4 y12i, real x1, real x1l, real x1u,
DEVICE void bicubic(real4 y, real4 y1i, real4 y2i, real4 y12i, real x1, real x1l, real x1u,
real x2, real x2l, real x2u, real* energyOut, real* dang1Out, real* dang2Out) {
real c[4][4];
real d1 = x1u - x1l;
......
......@@ -4,15 +4,15 @@
*/
#if defined F1
__device__ void computeOneInteractionF1(AtomData2& atom1, volatile AtomData2& atom2, real& outputEnergy, real3& force) {
DEVICE void computeOneInteractionF1(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force) {
#elif defined F2
__device__ void computeOneInteractionF2(AtomData2& atom1, volatile AtomData2& atom2, real& outputEnergy, real3& force) {
DEVICE void computeOneInteractionF2(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force) {
#elif defined T1
__device__ void computeOneInteractionT1(AtomData2& atom1, volatile AtomData2& atom2, real3& torque) {
DEVICE void computeOneInteractionT1(AtomData2 atom1, volatile AtomData2 atom2, real3* torque) {
#elif defined T2
__device__ void computeOneInteractionT2(AtomData2& atom1, volatile AtomData2& atom2, real3& torque) {
DEVICE void computeOneInteractionT2(AtomData2 atom1, volatile AtomData2 atom2, real3* torque) {
#elif defined B1 && defined B2
__device__ void computeOneInteractionB1B2(AtomData2& atom1, volatile AtomData2& atom2) {
DEVICE void computeOneInteractionB1B2(AtomData2 atom1, volatile AtomData2 atom2, real* bornForce1, real* bornForce2) {
#endif
const real fc = EPSILON_FACTOR*GK_FC;
......
......@@ -217,8 +217,8 @@
atom1.quadrupoleYZ*(atom2.quadrupoleXX*gqxx29 + atom2.quadrupoleYY*gqyy29 + atom2.quadrupoleZZ*gqzz29 + 2*(atom2.quadrupoleXY*gqxy29 + atom2.quadrupoleXZ*gqxz29 + atom2.quadrupoleYZ*gqyz29)));
dsumdrB1 *= 0.5f;
atom1.bornForce += atom2.bornRadius*dsumdrB1;
atom2.bornForce += atom1.bornRadius*dsumdrB1;
*bornForce1 += atom2.bornRadius*dsumdrB1;
*bornForce2 += atom1.bornRadius*dsumdrB1;
#endif
// unweighted 3rd reaction potential gradient tensor;
......@@ -530,21 +530,21 @@
trq2 -= (atom1.quadrupoleXZ*fidg11 + atom1.quadrupoleYZ*fidg12 + atom1.quadrupoleZZ*fidg13 -atom1.quadrupoleXX*fidg13-atom1.quadrupoleXY*fidg23-atom1.quadrupoleXZ*fidg33);
trq3 -= (atom1.quadrupoleXX*fidg12 + atom1.quadrupoleXY*fidg22 + atom1.quadrupoleXZ*fidg23 -atom1.quadrupoleXY*fidg11-atom1.quadrupoleYY*fidg12-atom1.quadrupoleYZ*fidg13);
torque.x = trq1;
torque.y = trq2;
torque.z = trq3;
torque->x = trq1;
torque->y = trq2;
torque->z = trq3;
} else {
torque.x = 0;
torque.y = 0;
torque.z = 0;
torque->x = 0;
torque->y = 0;
torque->z = 0;
}
#endif
#if defined B2
dsumdrB2 *= 0.5f;
atom1.bornForce += 0.5f*atom2.bornRadius*dsumdrB2;
atom2.bornForce += 0.5f*atom1.bornRadius*dsumdrB2;
*bornForce1 += 0.5f*atom2.bornRadius*dsumdrB2;
*bornForce2 += 0.5f*atom1.bornRadius*dsumdrB2;
#endif
#if defined T2
......@@ -566,36 +566,36 @@
trqi3 -= atom1.quadrupoleXX*fidg12 + atom1.quadrupoleXY*fidg22 + atom1.quadrupoleXZ*fidg23
-atom1.quadrupoleXY*fidg11 - atom1.quadrupoleYY*fidg12 - atom1.quadrupoleYZ*fidg13;
torque.x += 0.5f*trqi1;
torque.y += 0.5f*trqi2;
torque.z += 0.5f*trqi3;
torque->x += 0.5f*trqi1;
torque->y += 0.5f*trqi2;
torque->z += 0.5f*trqi3;
#endif
#if defined F1
outputEnergy = energy;
*outputEnergy = energy;
if ((xr != 0 || yr != 0 || zr != 0)) {
force.x = dedx;
force.y = dedy;
force.z = dedz;
force->x = dedx;
force->y = dedy;
force->z = dedz;
} else {
force.x = force.y = force.z = 0;
force->x = force->y = force->z = 0;
}
#endif
#if defined F2
outputEnergy += 0.5f*energy;
*outputEnergy += 0.5f*energy;
dpdx *= 0.5f;
dpdy *= 0.5f;
dpdz *= 0.5f;
if ((xr != 0 || yr != 0 || zr != 0)) {
force.x += dpdx;
force.y += dpdy;
force.z += dpdz;
force->x += dpdx;
force->y += dpdy;
force->z += dpdz;
}
#endif
}
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment