Unverified Commit be19e022 authored by Peter Eastman's avatar Peter Eastman Committed by GitHub
Browse files

Converted RPMD plugin to common platform (#3079)

* Converted RPMD plugin to common platform

* Merged RPMD tests for different platforms

* Try to fix errors on CPU OpenCL
parent 98d81730
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2019 Stanford University and the Authors. * * Portions copyright (c) 2019-2021 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -140,6 +140,17 @@ public: ...@@ -140,6 +140,17 @@ public:
* in page-locked memory. * in page-locked memory.
*/ */
virtual void upload(const void* data, bool blocking=true) = 0; virtual void upload(const void* data, bool blocking=true) = 0;
/**
* Copy values from host memory to a subset of the array.
*
* @param data the data to copy
* @param offset the index of the element within the array at which the copy should begin
* @param elements the number of elements to copy
* @param blocking if true, this call will block until the transfer is complete. Subclasses often
* have restrictions on non-blocking copies, such as that the source data must be
* in page-locked memory.
*/
virtual void uploadSubArray(const void* data, int offset, int elements, bool blocking=true) = 0;
/** /**
* Copy the values in the array to host memory. * Copy the values in the array to host memory.
* *
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2019 Stanford University and the Authors. * * Portions copyright (c) 2019-2021 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -118,7 +118,20 @@ public: ...@@ -118,7 +118,20 @@ public:
* have restrictions on non-blocking copies, such as that the source data must be * have restrictions on non-blocking copies, such as that the source data must be
* in page-locked memory. * in page-locked memory.
*/ */
void upload(const void* data, bool blocking=true); void upload(const void* data, bool blocking=true) {
uploadSubArray(data, 0, getSize(), blocking);
}
/**
* Copy values from host memory to a subset of the array.
*
* @param data the data to copy
* @param offset the index of the element within the array at which the copy should begin
* @param elements the number of elements to copy
* @param blocking if true, this call will block until the transfer is complete. Subclasses often
* have restrictions on non-blocking copies, such as that the source data must be
* in page-locked memory.
*/
void uploadSubArray(const void* data, int offset, int elements, bool blocking=true);
/** /**
* Copy the values in the array to host memory. * Copy the values in the array to host memory.
* *
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2019 Stanford University and the Authors. * * Portions copyright (c) 2019-2021 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -84,10 +84,10 @@ ComputeContext& ComputeArray::getContext() { ...@@ -84,10 +84,10 @@ ComputeContext& ComputeArray::getContext() {
return impl->getContext(); return impl->getContext();
} }
void ComputeArray::upload(const void* data, bool blocking) { void ComputeArray::uploadSubArray(const void* data, int offset, int elements, bool blocking) {
if (impl == NULL) if (impl == NULL)
throw OpenMMException("ComputeArray has not been initialized"); throw OpenMMException("ComputeArray has not been initialized");
impl->upload(data, blocking); impl->uploadSubArray(data, offset, elements, blocking);
} }
void ComputeArray::download(void* data, bool blocking) const { void ComputeArray::download(void* data, bool blocking) const {
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. * * Portions copyright (c) 2009-2021 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -146,13 +146,25 @@ public: ...@@ -146,13 +146,25 @@ public:
ArrayInterface::download(data); ArrayInterface::download(data);
} }
/** /**
* Copy the values in an array to the device memory. * Copy the values from host memory to the array.
* *
* @param data the data to copy * @param data the data to copy
* @param blocking if true, this call will block until the transfer is complete. If false, * @param blocking if true, this call will block until the transfer is complete. If false,
* the source array must be in page-locked memory. * the source array must be in page-locked memory.
*/ */
void upload(const void* data, bool blocking=true); void upload(const void* data, bool blocking=true) {
uploadSubArray(data, 0, getSize(), blocking);
}
/**
* Copy values from host memory to a subset of the array.
*
* @param data the data to copy
* @param offset the index of the element within the array at which the copy should begin
* @param elements the number of elements to copy
* @param blocking if true, this call will block until the transfer is complete. If false,
* the source array must be in page-locked memory.
*/
void uploadSubArray(const void* data, int offset, int elements, bool blocking=true);
/** /**
* Copy the values in the device memory to an array. * Copy the values in the device memory to an array.
* *
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2012-2019 Stanford University and the Authors. * * Portions copyright (c) 2012-2021 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -86,14 +86,16 @@ ComputeContext& CudaArray::getContext() { ...@@ -86,14 +86,16 @@ ComputeContext& CudaArray::getContext() {
return *context; return *context;
} }
void CudaArray::upload(const void* data, bool blocking) { void CudaArray::uploadSubArray(const void* data, int offset, int elements, bool blocking) {
if (pointer == 0) if (pointer == 0)
throw OpenMMException("CudaArray has not been initialized"); throw OpenMMException("CudaArray has not been initialized");
if (offset < 0 || offset+elements > getSize())
throw OpenMMException("uploadSubArray: data exceeds range of array");
CUresult result; CUresult result;
if (blocking) if (blocking)
result = cuMemcpyHtoD(pointer, data, size*elementSize); result = cuMemcpyHtoD(pointer+offset*elementSize, data, elements*elementSize);
else else
result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, context->getCurrentStream()); result = cuMemcpyHtoDAsync(pointer+offset*elementSize, data, elements*elementSize, context->getCurrentStream());
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream str; std::stringstream str;
str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"; str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. * * Portions copyright (c) 2009-2021 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -208,12 +208,23 @@ public: ...@@ -208,12 +208,23 @@ public:
ArrayInterface::download(data); ArrayInterface::download(data);
} }
/** /**
* Copy the values in an array to the Buffer. * Copy the values from host memory to the array.
* *
* @param data the data to copy * @param data the data to copy
* @param blocking if true, this call will block until the transfer is complete. * @param blocking if true, this call will block until the transfer is complete.
*/ */
void upload(const void* data, bool blocking=true); void upload(const void* data, bool blocking=true) {
uploadSubArray(data, 0, getSize(), blocking);
}
/**
* Copy values from host memory to a subset of the array.
*
* @param data the data to copy
* @param offset the index of the element within the array at which the copy should begin
* @param elements the number of elements to copy
* @param blocking if true, this call will block until the transfer is complete.
*/
void uploadSubArray(const void* data, int offset, int elements, bool blocking=true);
/** /**
* Copy the values in the Buffer to an array. * Copy the values in the Buffer to an array.
* *
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2012-2019 Stanford University and the Authors. * * Portions copyright (c) 2012-2021 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -96,11 +96,13 @@ ComputeContext& OpenCLArray::getContext() { ...@@ -96,11 +96,13 @@ ComputeContext& OpenCLArray::getContext() {
return *context; return *context;
} }
void OpenCLArray::upload(const void* data, bool blocking) { void OpenCLArray::uploadSubArray(const void* data, int offset, int elements, bool blocking) {
if (buffer == NULL) if (buffer == NULL)
throw OpenMMException("OpenCLArray has not been initialized"); throw OpenMMException("OpenCLArray has not been initialized");
if (offset < 0 || offset+elements > getSize())
throw OpenMMException("uploadSubArray: data exceeds range of array");
try { try {
context->getQueue().enqueueWriteBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, 0, size*elementSize, data); context->getQueue().enqueueWriteBuffer(*buffer, blocking ? CL_TRUE : CL_FALSE, offset*elementSize, elements*elementSize, data);
} }
catch (cl::Error err) { catch (cl::Error err) {
std::stringstream str; std::stringstream str;
......
...@@ -108,6 +108,7 @@ ENDIF(OPENMM_BUILD_STATIC_LIB) ...@@ -108,6 +108,7 @@ ENDIF(OPENMM_BUILD_STATIC_LIB)
# Which hardware platforms to build # Which hardware platforms to build
ADD_SUBDIRECTORY(platforms/reference) ADD_SUBDIRECTORY(platforms/reference)
ADD_SUBDIRECTORY(platforms/common)
IF(OPENMM_BUILD_OPENCL_LIB) IF(OPENMM_BUILD_OPENCL_LIB)
SET(OPENMM_BUILD_RPMD_OPENCL_LIB ON CACHE BOOL "Build RPMD implementation for OpenCL") SET(OPENMM_BUILD_RPMD_OPENCL_LIB ON CACHE BOOL "Build RPMD implementation for OpenCL")
......
...@@ -69,11 +69,11 @@ public: ...@@ -69,11 +69,11 @@ public:
*/ */
virtual void execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) = 0; virtual void execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) = 0;
/** /**
* Get the positions of all particles in one copy of the system. * Set the positions of all particles in one copy of the system.
*/ */
virtual void setPositions(int copy, const std::vector<Vec3>& positions) = 0; virtual void setPositions(int copy, const std::vector<Vec3>& positions) = 0;
/** /**
* Get the velocities of all particles in one copy of the system. * Set the velocities of all particles in one copy of the system.
*/ */
virtual void setVelocities(int copy, const std::vector<Vec3>& velocities) = 0; virtual void setVelocities(int copy, const std::vector<Vec3>& velocities) = 0;
/** /**
......
# Encode the kernel sources into a C++ class.
SET(KERNEL_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
SET(KERNEL_SOURCE_CLASS CommonRpmdKernelSources)
SET(KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.cpp)
SET(KERNELS_H ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.h)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/src)
FILE(GLOB COMMON_KERNELS ${KERNEL_SOURCE_DIR}/kernels/*.cc)
ADD_CUSTOM_COMMAND(OUTPUT ${KERNELS_CPP} ${KERNELS_H}
COMMAND ${CMAKE_COMMAND}
ARGS -D KERNEL_SOURCE_DIR=${KERNEL_SOURCE_DIR} -D KERNELS_CPP=${KERNELS_CPP} -D KERNELS_H=${KERNELS_H} -D KERNEL_SOURCE_CLASS=${KERNEL_SOURCE_CLASS} -D KERNEL_FILE_EXTENSION=cc -P ${CMAKE_SOURCE_DIR}/cmake_modules/EncodeKernelFiles.cmake
DEPENDS ${COMMON_KERNELS}
)
SET_SOURCE_FILES_PROPERTIES(${KERNELS_CPP} ${KERNELS_H} PROPERTIES GENERATED TRUE)
ADD_CUSTOM_TARGET(RpmdCommonKernels DEPENDS ${KERNELS_CPP} ${KERNELS_H})
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. * * along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "CudaRpmdKernelSources.h" #include "CommonRpmdKernelSources.h"
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
......
#ifndef OPENMM_CUDARPMDKERNELSOURCES_H_ #ifndef OPENMM_COMMONRPMDKERNELSOURCES_H_
#define OPENMM_CUDARPMDKERNELSOURCES_H_ #define OPENMM_COMMONRPMDKERNELSOURCES_H_
/* -------------------------------------------------------------------------- * /* -------------------------------------------------------------------------- *
* OpenMM * * OpenMM *
...@@ -32,16 +32,16 @@ ...@@ -32,16 +32,16 @@
namespace OpenMM { namespace OpenMM {
/** /**
* This class is a central holding place for the source code of CUDA kernels. * This class is a central holding place for the source code of device kernels.
* The CMake build script inserts declarations into it based on the .cu files in the * The CMake build script inserts declarations into it based on the .cc files in the
* kernels subfolder. * kernels subfolder.
*/ */
class CudaRpmdKernelSources { class CommonRpmdKernelSources {
public: public:
@KERNEL_FILE_DECLARATIONS@ @KERNEL_FILE_DECLARATIONS@
}; };
} // namespace OpenMM } // namespace OpenMM
#endif /*OPENMM_CUDARPMDKERNELSOURCES_H_*/ #endif /*OPENMM_COMMONRPMDKERNELSOURCES_H_*/
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2020 Stanford University and the Authors. * * Portions copyright (c) 2011-2021 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -29,13 +29,12 @@ ...@@ -29,13 +29,12 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. * * USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "CudaRpmdKernels.h" #include "CommonRpmdKernels.h"
#include "CudaRpmdKernelSources.h" #include "CommonRpmdKernelSources.h"
#include "openmm/internal/ContextImpl.h" #include "openmm/internal/ContextImpl.h"
#include "CudaIntegrationUtilities.h" #include "openmm/common/IntegrationUtilities.h"
#include "CudaExpressionUtilities.h" #include "openmm/common/ExpressionUtilities.h"
#include "CudaKernelSources.h" #include "openmm/common/NonbondedUtilities.h"
#include "CudaNonbondedUtilities.h"
#include "SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
using namespace OpenMM; using namespace OpenMM;
...@@ -62,39 +61,39 @@ static int findFFTDimension(int minimum) { ...@@ -62,39 +61,39 @@ static int findFFTDimension(int minimum) {
} }
} }
void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) { void CommonIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
cu.getPlatformData().initializeContexts(system); cc.initializeContexts();
numCopies = integrator.getNumCopies(); numCopies = integrator.getNumCopies();
numParticles = system.getNumParticles(); numParticles = system.getNumParticles();
workgroupSize = numCopies; workgroupSize = numCopies;
if (numCopies != findFFTDimension(numCopies)) if (numCopies != findFFTDimension(numCopies))
throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5."); throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
int paddedParticles = cu.getPaddedNumAtoms(); int paddedParticles = cc.getPaddedNumAtoms();
bool useDoublePrecision = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()); bool useDoublePrecision = (cc.getUseDoublePrecision() || cc.getUseMixedPrecision());
int elementSize = (useDoublePrecision ? sizeof(double4) : sizeof(float4)); int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4));
forces.initialize<long long>(cu, numCopies*paddedParticles*3, "rpmdForces"); forces.initialize<long long>(cc, numCopies*paddedParticles*3, "rpmdForces");
positions.initialize(cu, numCopies*paddedParticles, elementSize, "rpmdPositions"); positions.initialize(cc, numCopies*paddedParticles, elementSize, "rpmdPositions");
velocities.initialize(cu, numCopies*paddedParticles, elementSize, "rpmdVelocities"); velocities.initialize(cc, numCopies*paddedParticles, elementSize, "rpmdVelocities");
cu.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed()); cc.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
// Fill in the posq and velm arrays with safe values to avoid a risk of nans. // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
if (useDoublePrecision) { if (useDoublePrecision) {
vector<double4> temp(positions.getSize()); vector<mm_double4> temp(positions.getSize());
for (int i = 0; i < positions.getSize(); i++) for (int i = 0; i < positions.getSize(); i++)
temp[i] = make_double4(0, 0, 0, 0); temp[i] = mm_double4(0, 0, 0, 0);
positions.upload(temp); positions.upload(temp);
for (int i = 0; i < velocities.getSize(); i++) for (int i = 0; i < velocities.getSize(); i++)
temp[i] = make_double4(0, 0, 0, 1); temp[i] = mm_double4(0, 0, 0, 1);
velocities.upload(temp); velocities.upload(temp);
} }
else { else {
vector<float4> temp(positions.getSize()); vector<mm_float4> temp(positions.getSize());
for (int i = 0; i < positions.getSize(); i++) for (int i = 0; i < positions.getSize(); i++)
temp[i] = make_float4(0, 0, 0, 0); temp[i] = mm_float4(0, 0, 0, 0);
positions.upload(temp); positions.upload(temp);
for (int i = 0; i < velocities.getSize(); i++) for (int i = 0; i < velocities.getSize(); i++)
temp[i] = make_float4(0, 0, 0, 1); temp[i] = mm_float4(0, 0, 0, 1);
velocities.upload(temp); velocities.upload(temp);
} }
...@@ -125,54 +124,99 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt ...@@ -125,54 +124,99 @@ void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDInt
} }
groupsNotContracted &= integrator.getIntegrationForceGroups(); groupsNotContracted &= integrator.getIntegrationForceGroups();
if (maxContractedCopies > 0) { if (maxContractedCopies > 0) {
contractedForces.initialize<long long>(cu, maxContractedCopies*paddedParticles*3, "rpmdContractedForces"); contractedForces.initialize<long long>(cc, maxContractedCopies*paddedParticles*3, "rpmdContractedForces");
contractedPositions.initialize(cu, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions"); contractedPositions.initialize(cc, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
} }
// Create kernels. // Create kernels.
map<string, string> defines; map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); defines["NUM_ATOMS"] = cc.intToString(cc.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = cc.intToString(cc.getPaddedNumAtoms());
defines["NUM_COPIES"] = cu.intToString(numCopies); defines["NUM_COPIES"] = cc.intToString(numCopies);
defines["THREAD_BLOCK_SIZE"] = cu.intToString(workgroupSize); defines["THREAD_BLOCK_SIZE"] = cc.intToString(workgroupSize);
defines["HBAR"] = cu.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12)); defines["HBAR"] = cc.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12));
defines["SCALE"] = cu.doubleToString(1.0/sqrt((double) numCopies)); defines["SCALE"] = cc.doubleToString(1.0/sqrt((double) numCopies));
defines["M_PI"] = cu.doubleToString(M_PI); defines["M_PI"] = cc.doubleToString(M_PI);
map<string, string> replacements; map<string, string> replacements;
replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true); replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false); replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false);
replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true); replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true);
replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false); replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false);
CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaRpmdKernelSources::rpmd, replacements), defines, ""); ComputeProgram program = cc.compileProgram(cc.replaceStrings(CommonRpmdKernelSources::rpmd, replacements), defines);
pileKernel = cu.getKernel(module, "applyPileThermostat"); pileKernel = program->createKernel("applyPileThermostat");
stepKernel = cu.getKernel(module, "integrateStep"); stepKernel = program->createKernel("integrateStep");
velocitiesKernel = cu.getKernel(module, "advanceVelocities"); velocitiesKernel = program->createKernel("advanceVelocities");
copyToContextKernel = cu.getKernel(module, "copyDataToContext"); copyToContextKernel = program->createKernel("copyDataToContext");
copyFromContextKernel = cu.getKernel(module, "copyDataFromContext"); copyFromContextKernel = program->createKernel("copyDataFromContext");
translateKernel = cu.getKernel(module, "applyCellTranslations"); translateKernel = program->createKernel("applyCellTranslations");
// Create kernels for doing contractions. // Create kernels for doing contractions.
for (auto& g : groupsByCopies) { for (auto& g : groupsByCopies) {
int copies = g.first; int copies = g.first;
replacements.clear(); replacements.clear();
replacements["NUM_CONTRACTED_COPIES"] = cu.intToString(copies); replacements["NUM_CONTRACTED_COPIES"] = cc.intToString(copies);
replacements["POS_SCALE"] = cu.doubleToString(1.0/numCopies); replacements["POS_SCALE"] = cc.doubleToString(1.0/numCopies);
replacements["FORCE_SCALE"] = cu.doubleToString(0x100000000/(double) copies); replacements["FORCE_SCALE"] = cc.doubleToString(0x100000000/(double) copies);
replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true); replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false); replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true); replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false); replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaRpmdKernelSources::rpmdContraction, replacements), defines, ""); program = cc.compileProgram(cc.replaceStrings(CommonRpmdKernelSources::rpmdContraction, replacements), defines);
positionContractionKernels[copies] = cu.getKernel(module, "contractPositions"); positionContractionKernels[copies] = program->createKernel("contractPositions");
forceContractionKernels[copies] = cu.getKernel(module, "contractForces"); forceContractionKernels[copies] = program->createKernel("contractForces");
} }
} }
void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) { void CommonIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) {
cu.setAsCurrent(); hasInitializedKernels = true;
CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); pileKernel->addArg(velocities);
pileKernel->addArg(cc.getIntegrationUtilities().getRandom());
pileKernel->addArg();
pileKernel->addArg();
pileKernel->addArg();
pileKernel->addArg();
stepKernel->addArg(positions);
stepKernel->addArg(velocities);
stepKernel->addArg(forces);
stepKernel->addArg();
stepKernel->addArg();
velocitiesKernel->addArg(velocities);
velocitiesKernel->addArg(forces);
velocitiesKernel->addArg();
translateKernel->addArg(positions);
translateKernel->addArg(cc.getPosq());
translateKernel->addArg(cc.getAtomIndexArray());
translateKernel->addArg();
copyToContextKernel->addArg(velocities);
copyToContextKernel->addArg(cc.getVelm());
copyToContextKernel->addArg();
copyToContextKernel->addArg(cc.getPosq());
copyToContextKernel->addArg(cc.getAtomIndexArray());
copyToContextKernel->addArg();
copyFromContextKernel->addArg(cc.getLongForceBuffer());
copyFromContextKernel->addArg();
copyFromContextKernel->addArg(cc.getVelm());
copyFromContextKernel->addArg(velocities);
copyFromContextKernel->addArg(cc.getPosq());
copyFromContextKernel->addArg();
copyFromContextKernel->addArg(cc.getAtomIndexArray());
copyFromContextKernel->addArg();
for (auto& g : groupsByCopies) {
int copies = g.first;
positionContractionKernels[copies]->addArg(positions);
positionContractionKernels[copies]->addArg(contractedPositions);
forceContractionKernels[copies]->addArg(forces);
forceContractionKernels[copies]->addArg(contractedForces);
}
}
void CommonIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
cc.setAsCurrent();
if (!hasInitializedKernels)
initializeKernels(context);
IntegrationUtilities& integration = cc.getIntegrationUtilities();
// Loop over copies and compute the force on each one. // Loop over copies and compute the force on each one.
...@@ -181,25 +225,31 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr ...@@ -181,25 +225,31 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
// Apply the PILE-L thermostat. // Apply the PILE-L thermostat.
bool useDoublePrecision = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()); bool useDoublePrecision = (cc.getUseDoublePrecision() || cc.getUseMixedPrecision());
double dt = integrator.getStepSize(); double dt = integrator.getStepSize();
float dtFloat = (float) dt; pileKernel->setArg(2, integration.prepareRandomNumbers(numParticles*numCopies));
void* dtPtr = (useDoublePrecision ? (void*) &dt : (void*) &dtFloat); if (useDoublePrecision) {
double kT = integrator.getTemperature()*BOLTZ; pileKernel->setArg(3, dt);
float kTFloat = (float) kT; pileKernel->setArg(4, integrator.getTemperature()*BOLTZ);
void* kTPtr = (useDoublePrecision ? (void*) &kT : (void*) &kTFloat); pileKernel->setArg(5, integrator.getFriction());
double friction = integrator.getFriction(); stepKernel->setArg(3, dt);
float frictionFloat = (float) friction; stepKernel->setArg(4, integrator.getTemperature()*BOLTZ);
void* frictionPtr = (useDoublePrecision ? (void*) &friction : (void*) &frictionFloat); velocitiesKernel->setArg(2, dt);
int randomIndex = integration.prepareRandomNumbers(numParticles*numCopies); }
void* pileArgs[] = {&velocities.getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex, dtPtr, kTPtr, frictionPtr}; else {
pileKernel->setArg(3, (float) dt);
pileKernel->setArg(4, (float) (integrator.getTemperature()*BOLTZ));
pileKernel->setArg(5, (float) integrator.getFriction());
stepKernel->setArg(3, (float) dt);
stepKernel->setArg(4, (float) (integrator.getTemperature()*BOLTZ));
velocitiesKernel->setArg(2, (float) dt);
}
if (integrator.getApplyThermostat()) if (integrator.getApplyThermostat())
cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize); pileKernel->execute(numParticles*numCopies, workgroupSize);
// Update positions and velocities. // Update positions and velocities.
void* stepArgs[] = {&positions.getDevicePointer(), &velocities.getDevicePointer(), &forces.getDevicePointer(), dtPtr, kTPtr}; stepKernel->execute(numParticles*numCopies, workgroupSize);
cu.executeKernel(stepKernel, stepArgs, numParticles*numCopies, workgroupSize);
// Calculate forces based on the updated positions. // Calculate forces based on the updated positions.
...@@ -207,38 +257,38 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr ...@@ -207,38 +257,38 @@ void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegr
// Update velocities. // Update velocities.
void* velocitiesArgs[] = {&velocities.getDevicePointer(), &forces.getDevicePointer(), dtPtr}; velocitiesKernel->execute(numParticles*numCopies, workgroupSize);
cu.executeKernel(velocitiesKernel, velocitiesArgs, numParticles*numCopies, workgroupSize);
// Apply the PILE-L thermostat again. // Apply the PILE-L thermostat again.
if (integrator.getApplyThermostat()) { if (integrator.getApplyThermostat()) {
randomIndex = integration.prepareRandomNumbers(numParticles*numCopies); pileKernel->setArg(2, integration.prepareRandomNumbers(numParticles*numCopies));
cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize); pileKernel->execute(numParticles*numCopies, workgroupSize);
} }
// Update the time and step count. // Update the time and step count.
cu.setTime(cu.getTime()+dt); cc.setTime(cc.getTime()+dt);
cu.setStepCount(cu.getStepCount()+1); cc.setStepCount(cc.getStepCount()+1);
cu.reorderAtoms(); cc.reorderAtoms();
if (cu.getAtomsWereReordered() && cu.getNonbondedUtilities().getUsePeriodic()) { if (cc.getAtomsWereReordered() && cc.getNonbondedUtilities().getUsePeriodic()) {
// Atoms may have been translated into a different periodic box, so apply // Atoms may have been translated into a different periodic box, so apply
// the same translation to all the beads. // the same translation to all the beads.
int i = numCopies-1; translateKernel->setArg(3, numCopies-1);
void* args[] = {&positions.getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; translateKernel->execute(cc.getNumAtoms());
cu.executeKernel(translateKernel, args, cu.getNumAtoms());
} }
} }
void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) { void CommonIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
// Compute forces from all groups that didn't have a specified contraction. // Compute forces from all groups that didn't have a specified contraction.
copyToContextKernel->setArg(2, positions);
copyFromContextKernel->setArg(1, forces);
copyFromContextKernel->setArg(5, positions);
for (int i = 0; i < numCopies; i++) { for (int i = 0; i < numCopies; i++) {
void* copyToContextArgs[] = {&velocities.getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions.getDevicePointer(), copyToContextKernel->setArg(5, i);
&cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; copyToContextKernel->execute(cc.getNumAtoms());
cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
context.computeVirtualSites(); context.computeVirtualSites();
Vec3 initialBox[3]; Vec3 initialBox[3];
context.getPeriodicBoxVectors(initialBox[0], initialBox[1], initialBox[2]); context.getPeriodicBoxVectors(initialBox[0], initialBox[1], initialBox[2]);
...@@ -248,55 +298,54 @@ void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) { ...@@ -248,55 +298,54 @@ void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
if (initialBox[0] != finalBox[0] || initialBox[1] != finalBox[1] || initialBox[2] != finalBox[2]) if (initialBox[0] != finalBox[0] || initialBox[1] != finalBox[1] || initialBox[2] != finalBox[2])
throw OpenMMException("Standard barostats cannot be used with RPMDIntegrator. Use RPMDMonteCarloBarostat instead."); throw OpenMMException("Standard barostats cannot be used with RPMDIntegrator. Use RPMDMonteCarloBarostat instead.");
context.calcForcesAndEnergy(true, false, groupsNotContracted); context.calcForcesAndEnergy(true, false, groupsNotContracted);
void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &forces.getDevicePointer(), &cu.getVelm().getDevicePointer(), copyFromContextKernel->setArg(7, i);
&velocities.getDevicePointer(), &cu.getPosq().getDevicePointer(), &positions.getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; copyFromContextKernel->execute(cc.getNumAtoms());
cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms());
} }
// Now loop over contractions and compute forces from them. // Now loop over contractions and compute forces from them.
for (auto& g : groupsByCopies) { if (groupsByCopies.size() > 0) {
int copies = g.first; copyToContextKernel->setArg(2, contractedPositions);
int groupFlags = g.second; copyFromContextKernel->setArg(1, contractedForces);
copyFromContextKernel->setArg(5, contractedPositions);
// Find the contracted positions. for (auto& g : groupsByCopies) {
int copies = g.first;
void* contractPosArgs[] = {&positions.getDevicePointer(), &contractedPositions.getDevicePointer()}; int groupFlags = g.second;
cu.executeKernel(positionContractionKernels[copies], contractPosArgs, numParticles*numCopies, workgroupSize);
// Find the contracted positions.
// Compute forces.
positionContractionKernels[copies]->execute(numParticles*numCopies, workgroupSize);
for (int i = 0; i < copies; i++) {
void* copyToContextArgs[] = {&velocities.getDevicePointer(), &cu.getVelm().getDevicePointer(), &contractedPositions.getDevicePointer(), // Compute forces.
&cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i};
cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms()); for (int i = 0; i < copies; i++) {
context.computeVirtualSites(); copyToContextKernel->setArg(5, i);
context.calcForcesAndEnergy(true, false, groupFlags); copyToContextKernel->execute(cc.getNumAtoms());
void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &contractedForces.getDevicePointer(), &cu.getVelm().getDevicePointer(), context.computeVirtualSites();
&velocities.getDevicePointer(), &cu.getPosq().getDevicePointer(), &contractedPositions.getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; context.calcForcesAndEnergy(true, false, groupFlags);
cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms()); copyFromContextKernel->setArg(7, i);
copyFromContextKernel->execute(cc.getNumAtoms());
}
// Apply the forces to the original copies.
forceContractionKernels[copies]->execute(numParticles*numCopies, workgroupSize);
} }
// Apply the forces to the original copies.
void* contractForceArgs[] = {&forces.getDevicePointer(), &contractedForces.getDevicePointer()};
cu.executeKernel(forceContractionKernels[copies], contractForceArgs, numParticles*numCopies, workgroupSize);
} }
if (groupsByCopies.size() > 0) { if (groupsByCopies.size() > 0) {
// Ensure the Context contains the positions from the last copy, since we'll assume that later. // Ensure the Context contains the positions from the last copy, since we'll assume that later.
int i = numCopies-1; copyToContextKernel->setArg(2, positions);
void* copyToContextArgs[] = {&velocities.getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions.getDevicePointer(), copyToContextKernel->setArg(5, numCopies-1);
&cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; copyToContextKernel->execute(cc.getNumAtoms());
cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms());
} }
} }
double CudaIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) { double CommonIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
return cu.getIntegrationUtilities().computeKineticEnergy(0); return cc.getIntegrationUtilities().computeKineticEnergy(0);
} }
void CudaIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) { void CommonIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) {
if (!positions.isInitialized()) if (!positions.isInitialized())
throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context"); throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
if (pos.size() != numParticles) if (pos.size() != numParticles)
...@@ -304,80 +353,71 @@ void CudaIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos ...@@ -304,80 +353,71 @@ void CudaIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos
// Adjust the positions based on the current cell offsets. // Adjust the positions based on the current cell offsets.
const vector<int>& order = cu.getAtomIndex(); const vector<int>& order = cc.getAtomIndex();
double4 periodicBoxSize = cu.getPeriodicBoxSize(); Vec3 a, b, c;
cc.getPeriodicBoxVectors(a, b, c);
vector<Vec3> offsetPos(numParticles); vector<Vec3> offsetPos(numParticles);
for (int i = 0; i < numParticles; ++i) { for (int i = 0; i < numParticles; ++i) {
mm_int4 offset = cu.getPosCellOffsets()[i]; mm_int4 offset = cc.getPosCellOffsets()[i];
offsetPos[order[i]] = pos[order[i]] + Vec3(offset.x*periodicBoxSize.x, offset.y*periodicBoxSize.y, offset.z*periodicBoxSize.z); offsetPos[order[i]] = pos[order[i]] + Vec3(offset.x*a[0], offset.y*b[1], offset.z*c[2]);
} }
// Record the positions. // Record the positions.
CUresult result; if (cc.getUseDoublePrecision()) {
if (cu.getUseDoublePrecision()) { vector<mm_double4> posq(cc.getPaddedNumAtoms());
vector<double4> posq(cu.getPaddedNumAtoms()); cc.getPosq().download(posq);
cu.getPosq().download(posq);
for (int i = 0; i < numParticles; i++) for (int i = 0; i < numParticles; i++)
posq[i] = make_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posq[i].w); posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posq[i].w);
result = cuMemcpyHtoD(positions.getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &posq[0], numParticles*sizeof(double4)); positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
} }
else if (cu.getUseMixedPrecision()) { else if (cc.getUseMixedPrecision()) {
vector<float4> posqf(cu.getPaddedNumAtoms()); vector<mm_float4> posqf(cc.getPaddedNumAtoms());
cu.getPosq().download(posqf); cc.getPosq().download(posqf);
vector<double4> posq(cu.getPaddedNumAtoms()); vector<mm_double4> posq(cc.getPaddedNumAtoms());
for (int i = 0; i < numParticles; i++) for (int i = 0; i < numParticles; i++)
posq[i] = make_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posqf[i].w); posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posqf[i].w);
result = cuMemcpyHtoD(positions.getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &posq[0], numParticles*sizeof(double4)); positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
} }
else { else {
vector<float4> posq(cu.getPaddedNumAtoms()); vector<mm_float4> posq(cc.getPaddedNumAtoms());
cu.getPosq().download(posq); cc.getPosq().download(posq);
for (int i = 0; i < numParticles; i++) for (int i = 0; i < numParticles; i++)
posq[i] = make_float4((float) offsetPos[i][0], (float) offsetPos[i][1], (float) offsetPos[i][2], posq[i].w); posq[i] = mm_float4((float) offsetPos[i][0], (float) offsetPos[i][1], (float) offsetPos[i][2], posq[i].w);
result = cuMemcpyHtoD(positions.getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &posq[0], numParticles*sizeof(float4)); positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
}
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error uploading array "<<positions.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
} }
} }
void CudaIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) { void CommonIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
if (!velocities.isInitialized()) if (!velocities.isInitialized())
throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context"); throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
if (vel.size() != numParticles) if (vel.size() != numParticles)
throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()"); throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
CUresult result; if (cc.getUseDoublePrecision() || cc.getUseMixedPrecision()) {
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) { vector<mm_double4> velm(cc.getPaddedNumAtoms());
vector<double4> velm(cu.getPaddedNumAtoms()); cc.getVelm().download(velm);
cu.getVelm().download(velm);
for (int i = 0; i < numParticles; i++) for (int i = 0; i < numParticles; i++)
velm[i] = make_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w); velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
result = cuMemcpyHtoD(velocities.getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &velm[0], numParticles*sizeof(double4)); velocities.uploadSubArray(&velm[0], copy*cc.getPaddedNumAtoms(), numParticles);
} }
else { else {
vector<float4> velm(cu.getPaddedNumAtoms()); vector<mm_float4> velm(cc.getPaddedNumAtoms());
cu.getVelm().download(velm); cc.getVelm().download(velm);
for (int i = 0; i < numParticles; i++) for (int i = 0; i < numParticles; i++)
velm[i] = make_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w); velm[i] = mm_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w);
result = cuMemcpyHtoD(velocities.getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &velm[0], numParticles*sizeof(float4)); velocities.uploadSubArray(&velm[0], copy*cc.getPaddedNumAtoms(), numParticles);
}
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error uploading array "<<velocities.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
} }
} }
void CudaIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) { void CommonIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
void* copyArgs[] = {&velocities.getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions.getDevicePointer(), if (!hasInitializedKernels)
&cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &copy}; initializeKernels(context);
cu.executeKernel(copyToContextKernel, copyArgs, cu.getNumAtoms()); copyToContextKernel->setArg(2, positions);
copyToContextKernel->setArg(5, copy);
copyToContextKernel->execute(cc.getNumAtoms());
} }
string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) { string CommonIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
stringstream source; stringstream source;
int stage = 0; int stage = 0;
int L = size; int L = size;
...@@ -387,10 +427,10 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, ...@@ -387,10 +427,10 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj"); string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");
source<<"{\n"; source<<"{\n";
source<<"mixed3* real0 = "<<variable<<"real;\n"; source<<"LOCAL_ARG mixed3* real0 = "<<variable<<"real;\n";
source<<"mixed3* imag0 = "<<variable<<"imag;\n"; source<<"LOCAL_ARG mixed3* imag0 = "<<variable<<"imag;\n";
source<<"mixed3* real1 = &temp[blockStart];\n"; source<<"LOCAL_ARG mixed3* real1 = &temp[blockStart];\n";
source<<"mixed3* imag1 = &temp[blockStart+blockDim.x];\n"; source<<"LOCAL_ARG mixed3* imag1 = &temp[blockStart+LOCAL_SIZE];\n";
// Factor size, generating an appropriate block of code for each factor. // Factor size, generating an appropriate block of code for each factor.
...@@ -407,7 +447,7 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, ...@@ -407,7 +447,7 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
else if (L%2 == 0) else if (L%2 == 0)
radix = 2; radix = 2;
else else
throw OpenMMException("Illegal size for FFT: "+cu.intToString(size)); throw OpenMMException("Illegal size for FFT: "+cc.intToString(size));
source<<"{\n"; source<<"{\n";
L = L/radix; L = L/radix;
source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n"; source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
...@@ -429,21 +469,21 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, ...@@ -429,21 +469,21 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
source<<"mixed3 d0i = c1i+c4i;\n"; source<<"mixed3 d0i = c1i+c4i;\n";
source<<"mixed3 d1r = c2r+c3r;\n"; source<<"mixed3 d1r = c2r+c3r;\n";
source<<"mixed3 d1i = c2i+c3i;\n"; source<<"mixed3 d1i = c2i+c3i;\n";
source<<"mixed3 d2r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n"; source<<"mixed3 d2r = "<<cc.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
source<<"mixed3 d2i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n"; source<<"mixed3 d2i = "<<cc.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
source<<"mixed3 d3r = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n"; source<<"mixed3 d3r = "<<cc.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
source<<"mixed3 d3i = "<<cu.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n"; source<<"mixed3 d3i = "<<cc.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
source<<"mixed3 d4r = d0r+d1r;\n"; source<<"mixed3 d4r = d0r+d1r;\n";
source<<"mixed3 d4i = d0i+d1i;\n"; source<<"mixed3 d4i = d0i+d1i;\n";
source<<"mixed3 d5r = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n"; source<<"mixed3 d5r = "<<cc.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
source<<"mixed3 d5i = "<<cu.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n"; source<<"mixed3 d5i = "<<cc.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
source<<"mixed3 d6r = c0r-0.25f*d4r;\n"; source<<"mixed3 d6r = c0r-0.25f*d4r;\n";
source<<"mixed3 d6i = c0i-0.25f*d4i;\n"; source<<"mixed3 d6i = c0i-0.25f*d4i;\n";
source<<"mixed3 d7r = d6r+d5r;\n"; source<<"mixed3 d7r = d6r+d5r;\n";
source<<"mixed3 d7i = d6i+d5i;\n"; source<<"mixed3 d7i = d6i+d5i;\n";
source<<"mixed3 d8r = d6r-d5r;\n"; source<<"mixed3 d8r = d6r-d5r;\n";
source<<"mixed3 d8i = d6i-d5i;\n"; source<<"mixed3 d8i = d6i-d5i;\n";
string coeff = cu.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI)); string coeff = cc.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
source<<"mixed3 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n"; source<<"mixed3 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
source<<"mixed3 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n"; source<<"mixed3 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
source<<"mixed3 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n"; source<<"mixed3 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
...@@ -496,8 +536,8 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, ...@@ -496,8 +536,8 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
source<<"mixed3 d0i = c1i+c2i;\n"; source<<"mixed3 d0i = c1i+c2i;\n";
source<<"mixed3 d1r = c0r-0.5f*d0r;\n"; source<<"mixed3 d1r = c0r-0.5f*d0r;\n";
source<<"mixed3 d1i = c0i-0.5f*d0i;\n"; source<<"mixed3 d1i = c0i-0.5f*d0i;\n";
source<<"mixed3 d2r = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n"; source<<"mixed3 d2r = "<<sign<<"*"<<cc.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
source<<"mixed3 d2i = "<<sign<<"*"<<cu.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n"; source<<"mixed3 d2i = "<<sign<<"*"<<cc.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n"; source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n";
source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n"; source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n";
source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n"; source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
...@@ -517,7 +557,7 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, ...@@ -517,7 +557,7 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
} }
source<<"}\n"; source<<"}\n";
m = m*radix; m = m*radix;
source<<"__syncthreads();\n"; source<<"SYNC_THREADS;\n";
source<<"}\n"; source<<"}\n";
++stage; ++stage;
} }
...@@ -527,6 +567,7 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, ...@@ -527,6 +567,7 @@ string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable,
if (stage%2 == 1) { if (stage%2 == 1) {
source<<"real0[indexInBlock] = real1[indexInBlock];\n"; source<<"real0[indexInBlock] = real1[indexInBlock];\n";
source<<"imag0[indexInBlock] = imag1[indexInBlock];\n"; source<<"imag0[indexInBlock] = imag1[indexInBlock];\n";
source<<"SYNC_WARPS;\n";
} }
source<<"}\n"; source<<"}\n";
return source.str(); return source.str();
......
#ifndef OPENCL_RPMD_KERNELS_H_ #ifndef COMMON_RPMD_KERNELS_H_
#define OPENCL_RPMD_KERNELS_H_ #define COMMON_RPMD_KERNELS_H_
/* -------------------------------------------------------------------------- * /* -------------------------------------------------------------------------- *
* OpenMM * * OpenMM *
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2018 Stanford University and the Authors. * * Portions copyright (c) 2011-2021 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -33,19 +33,20 @@ ...@@ -33,19 +33,20 @@
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "openmm/RpmdKernels.h" #include "openmm/RpmdKernels.h"
#include "OpenCLContext.h" #include "openmm/common/ComputeContext.h"
#include "OpenCLArray.h" #include "openmm/common/ComputeArray.h"
#include <map> #include <map>
namespace OpenMM { namespace OpenMM {
/** /**
* This kernel is invoked by RPMDIntegrator to take one time step, and to get and * This kernel is invoked by RPMDIntegrator to take one time step, and to get and
* set the state of system copies. * set the state of system copies.
*/ */
class OpenCLIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel { class CommonIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
public: public:
OpenCLIntegrateRPMDStepKernel(const std::string& name, const Platform& platform, OpenCLContext& cl) : CommonIntegrateRPMDStepKernel(const std::string& name, const Platform& platform, ComputeContext& cc) :
IntegrateRPMDStepKernel(name, platform), cl(cl), hasInitializedKernel(false) { IntegrateRPMDStepKernel(name, platform), cc(cc), hasInitializedKernels(false) {
} }
/** /**
* Initialize the kernel. * Initialize the kernel.
...@@ -71,11 +72,11 @@ public: ...@@ -71,11 +72,11 @@ public:
*/ */
double computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator); double computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator);
/** /**
* Get the positions of all particles in one copy of the system. * Set the positions of all particles in one copy of the system.
*/ */
void setPositions(int copy, const std::vector<Vec3>& positions); void setPositions(int copy, const std::vector<Vec3>& positions);
/** /**
* Get the velocities of all particles in one copy of the system. * Set the velocities of all particles in one copy of the system.
*/ */
void setVelocities(int copy, const std::vector<Vec3>& velocities); void setVelocities(int copy, const std::vector<Vec3>& velocities);
/** /**
...@@ -86,21 +87,21 @@ private: ...@@ -86,21 +87,21 @@ private:
void initializeKernels(ContextImpl& context); void initializeKernels(ContextImpl& context);
void computeForces(ContextImpl& context); void computeForces(ContextImpl& context);
std::string createFFT(int size, const std::string& variable, bool forward); std::string createFFT(int size, const std::string& variable, bool forward);
OpenCLContext& cl; ComputeContext& cc;
bool hasInitializedKernel; bool hasInitializedKernels;
int numCopies, numParticles, workgroupSize; int numCopies, numParticles, workgroupSize;
std::map<int, int> groupsByCopies; std::map<int, int> groupsByCopies;
int groupsNotContracted; int groupsNotContracted;
OpenCLArray forces; ComputeArray forces;
OpenCLArray positions; ComputeArray positions;
OpenCLArray velocities; ComputeArray velocities;
OpenCLArray contractedForces; ComputeArray contractedForces;
OpenCLArray contractedPositions; ComputeArray contractedPositions;
cl::Kernel pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel; ComputeKernel pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
std::map<int, cl::Kernel> positionContractionKernels; std::map<int, ComputeKernel> positionContractionKernels;
std::map<int, cl::Kernel> forceContractionKernels; std::map<int, ComputeKernel> forceContractionKernels;
}; };
} // namespace OpenMM } // namespace OpenMM
#endif /*OPENCL_RPMD_KERNELS_H_*/ #endif /*COMMON_RPMD_KERNELS_H_*/
__device__ mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) { DEVICE mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2r-c1.y*c2i; return c1.x*c2r-c1.y*c2i;
} }
__device__ mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) { DEVICE mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2i+c1.y*c2r; return c1.x*c2i+c1.y*c2r;
} }
__device__ mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) { DEVICE mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2r+c1.y*c2i; return c1.x*c2r+c1.y*c2i;
} }
__device__ mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) { DEVICE mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2i-c1.y*c2r; return c1.x*c2i-c1.y*c2r;
} }
/** /**
* Apply the PILE-L thermostat. * Apply the PILE-L thermostat.
*/ */
extern "C" __global__ void applyPileThermostat(mixed4* velm, float4* random, unsigned int randomIndex, KERNEL void applyPileThermostat(GLOBAL mixed4* velm, GLOBAL float4* random, unsigned int randomIndex,
mixed dt, mixed kT, mixed friction) { mixed dt, mixed kT, mixed friction) {
const int numBlocks = blockDim.x*gridDim.x/NUM_COPIES; const int numBlocks = GLOBAL_SIZE/NUM_COPIES;
const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES); const int blockStart = NUM_COPIES*(LOCAL_ID/NUM_COPIES);
const int indexInBlock = threadIdx.x-blockStart; const int indexInBlock = LOCAL_ID-blockStart;
const mixed nkT = NUM_COPIES*kT; const mixed nkT = NUM_COPIES*kT;
const mixed twown = 2.0f*nkT/HBAR; const mixed twown = 2.0f*nkT/HBAR;
const mixed c1_0 = EXP(-0.5f*dt*friction); const mixed c1_0 = exp(-0.5f*dt*friction);
const mixed c2_0 = SQRT(1.0f-c1_0*c1_0); const mixed c2_0 = sqrt(1.0f-c1_0*c1_0);
__shared__ mixed3 v[2*THREAD_BLOCK_SIZE]; LOCAL mixed3 v[2*THREAD_BLOCK_SIZE];
__shared__ mixed3 temp[2*THREAD_BLOCK_SIZE]; LOCAL mixed3 temp[2*THREAD_BLOCK_SIZE];
__shared__ mixed2 w[NUM_COPIES]; LOCAL mixed2 w[NUM_COPIES];
mixed3* vreal = &v[blockStart]; LOCAL_ARG mixed3* vreal = &v[blockStart];
mixed3* vimag = &v[blockStart+blockDim.x]; LOCAL_ARG mixed3* vimag = &v[blockStart+LOCAL_SIZE];
if (threadIdx.x < NUM_COPIES) if (LOCAL_ID < NUM_COPIES)
w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES)); w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
__syncthreads(); SYNC_THREADS;
randomIndex += NUM_COPIES*((blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES); randomIndex += NUM_COPIES*((GLOBAL_ID)/NUM_COPIES);
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS]; mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
mixed invMass = particleVelm.w; mixed invMass = particleVelm.w;
mixed c3_0 = c2_0*SQRT(nkT*invMass); mixed c3_0 = c2_0*sqrt(nkT*invMass);
// Forward FFT. // Forward FFT.
vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z); vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z);
vimag[indexInBlock] = make_mixed3(0); vimag[indexInBlock] = make_mixed3(0);
__syncthreads(); SYNC_THREADS;
FFT_V_FORWARD FFT_V_FORWARD
// Apply the thermostat. // Apply the thermostat.
...@@ -61,43 +61,43 @@ extern "C" __global__ void applyPileThermostat(mixed4* velm, float4* random, uns ...@@ -61,43 +61,43 @@ extern "C" __global__ void applyPileThermostat(mixed4* velm, float4* random, uns
int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock); int k = (indexInBlock <= NUM_COPIES/2 ? indexInBlock : NUM_COPIES-indexInBlock);
const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2); const bool isCenter = (NUM_COPIES%2 == 0 && k == NUM_COPIES/2);
const mixed wk = twown*sin(k*M_PI/NUM_COPIES); const mixed wk = twown*sin(k*M_PI/NUM_COPIES);
const mixed c1 = EXP(-wk*dt); const mixed c1 = exp(-wk*dt);
const mixed c2 = SQRT((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f); const mixed c2 = sqrt((1.0f-c1*c1)/2.0f) * (isCenter ? sqrt(2.0f) : 1.0f);
const mixed c3 = c2*SQRT(nkT*invMass); const mixed c3 = c2*sqrt(nkT*invMass);
float4 rand1 = random[randomIndex+k]; float4 rand1 = random[randomIndex+k];
float4 rand2 = (isCenter ? make_float4(0) : random[randomIndex+NUM_COPIES-k]); float4 rand2 = (isCenter ? make_float4(0) : random[randomIndex+NUM_COPIES-k]);
vreal[indexInBlock] = c1*vreal[indexInBlock] + c3*make_mixed3(rand1.x, rand1.y, rand1.z); vreal[indexInBlock] = c1*vreal[indexInBlock] + c3*make_mixed3(rand1.x, rand1.y, rand1.z);
vimag[indexInBlock] = c1*vimag[indexInBlock] + c3*(indexInBlock < NUM_COPIES/2 ? make_mixed3(rand2.x, rand2.y, rand2.z) : make_mixed3(-rand2.x, -rand2.y, -rand2.z)); vimag[indexInBlock] = c1*vimag[indexInBlock] + c3*(indexInBlock < NUM_COPIES/2 ? make_mixed3(rand2.x, rand2.y, rand2.z) : make_mixed3(-rand2.x, -rand2.y, -rand2.z));
} }
__syncthreads(); SYNC_THREADS;
// Inverse FFT. // Inverse FFT.
FFT_V_BACKWARD FFT_V_BACKWARD
if (invMass != 0) if (invMass != 0)
velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w); velm[particle+indexInBlock*PADDED_NUM_ATOMS] = make_mixed4(SCALE*vreal[indexInBlock].x, SCALE*vreal[indexInBlock].y, SCALE*vreal[indexInBlock].z, particleVelm.w);
randomIndex += blockDim.x*gridDim.x; randomIndex += GLOBAL_SIZE;
} }
} }
/** /**
* Advance the positions and velocities. * Advance the positions and velocities.
*/ */
extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long* force, mixed dt, mixed kT) { KERNEL void integrateStep(GLOBAL mixed4* posq, GLOBAL mixed4* velm, GLOBAL mm_long* force, mixed dt, mixed kT) {
const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES; const int numBlocks = (GLOBAL_SIZE)/NUM_COPIES;
const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES); const int blockStart = NUM_COPIES*(LOCAL_ID/NUM_COPIES);
const int indexInBlock = threadIdx.x-blockStart; const int indexInBlock = LOCAL_ID-blockStart;
const mixed nkT = NUM_COPIES*kT; const mixed nkT = NUM_COPIES*kT;
const mixed twown = 2.0f*nkT/HBAR; const mixed twown = 2.0f*nkT/HBAR;
const mixed forceScale = 1/(mixed) 0x100000000; const mixed forceScale = 1/(mixed) 0x100000000;
__shared__ mixed3 q[2*THREAD_BLOCK_SIZE]; LOCAL mixed3 q[2*THREAD_BLOCK_SIZE];
__shared__ mixed3 v[2*THREAD_BLOCK_SIZE]; LOCAL mixed3 v[2*THREAD_BLOCK_SIZE];
__shared__ mixed3 temp[2*THREAD_BLOCK_SIZE]; LOCAL mixed3 temp[2*THREAD_BLOCK_SIZE];
__shared__ mixed2 w[NUM_COPIES]; LOCAL mixed2 w[NUM_COPIES];
// Update velocities. // Update velocities.
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
int index = particle+indexInBlock*PADDED_NUM_ATOMS; int index = particle+indexInBlock*PADDED_NUM_ATOMS;
int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3; int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
mixed4 particleVelm = velm[index]; mixed4 particleVelm = velm[index];
...@@ -110,14 +110,14 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long* ...@@ -110,14 +110,14 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long*
// Evolve the free ring polymer by transforming to the frequency domain. // Evolve the free ring polymer by transforming to the frequency domain.
mixed3* qreal = &q[blockStart]; LOCAL_ARG mixed3* qreal = &q[blockStart];
mixed3* qimag = &q[blockStart+blockDim.x]; LOCAL_ARG mixed3* qimag = &q[blockStart+LOCAL_SIZE];
mixed3* vreal = &v[blockStart]; LOCAL_ARG mixed3* vreal = &v[blockStart];
mixed3* vimag = &v[blockStart+blockDim.x]; LOCAL_ARG mixed3* vimag = &v[blockStart+LOCAL_SIZE];
if (threadIdx.x < NUM_COPIES) if (LOCAL_ID < NUM_COPIES)
w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES)); w[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
__syncthreads(); SYNC_THREADS;
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS]; mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS]; mixed4 particleVelm = velm[particle+indexInBlock*PADDED_NUM_ATOMS];
...@@ -127,7 +127,7 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long* ...@@ -127,7 +127,7 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long*
qimag[indexInBlock] = make_mixed3(0); qimag[indexInBlock] = make_mixed3(0);
vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z); vreal[indexInBlock] = SCALE*make_mixed3(particleVelm.x, particleVelm.y, particleVelm.z);
vimag[indexInBlock] = make_mixed3(0); vimag[indexInBlock] = make_mixed3(0);
__syncthreads(); SYNC_THREADS;
FFT_Q_FORWARD FFT_Q_FORWARD
FFT_V_FORWARD FFT_V_FORWARD
...@@ -149,7 +149,7 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long* ...@@ -149,7 +149,7 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long*
vreal[indexInBlock] = vprimereal; vreal[indexInBlock] = vprimereal;
vimag[indexInBlock] = vprimeimag; vimag[indexInBlock] = vprimeimag;
} }
__syncthreads(); SYNC_THREADS;
// Inverse FFT. // Inverse FFT.
...@@ -165,15 +165,15 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long* ...@@ -165,15 +165,15 @@ extern "C" __global__ void integrateStep(mixed4* posq, mixed4* velm, long long*
/** /**
* Advance the velocities by a half step. * Advance the velocities by a half step.
*/ */
extern "C" __global__ void advanceVelocities(mixed4* velm, long long* force, mixed dt) { KERNEL void advanceVelocities(GLOBAL mixed4* velm, GLOBAL mm_long* force, mixed dt) {
const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES; const int numBlocks = (GLOBAL_SIZE)/NUM_COPIES;
const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES); const int blockStart = NUM_COPIES*(LOCAL_ID/NUM_COPIES);
const int indexInBlock = threadIdx.x-blockStart; const int indexInBlock = LOCAL_ID-blockStart;
const mixed forceScale = 1/(mixed) 0x100000000; const mixed forceScale = 1/(mixed) 0x100000000;
// Update velocities. // Update velocities.
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
int index = particle+indexInBlock*PADDED_NUM_ATOMS; int index = particle+indexInBlock*PADDED_NUM_ATOMS;
int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3; int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
mixed4 particleVelm = velm[index]; mixed4 particleVelm = velm[index];
...@@ -188,9 +188,9 @@ extern "C" __global__ void advanceVelocities(mixed4* velm, long long* force, mix ...@@ -188,9 +188,9 @@ extern "C" __global__ void advanceVelocities(mixed4* velm, long long* force, mix
/** /**
* Copy a set of positions and velocities from the integrator's arrays to the context. * Copy a set of positions and velocities from the integrator's arrays to the context.
*/ */
extern "C" __global__ void copyDataToContext(mixed4* srcVel, mixed4* dstVel, mixed4* srcPos, real4* dstPos, int* order, int copy) { KERNEL void copyDataToContext(GLOBAL mixed4* srcVel, GLOBAL mixed4* dstVel, GLOBAL mixed4* srcPos, GLOBAL real4* dstPos, GLOBAL int* order, int copy) {
const int base = copy*PADDED_NUM_ATOMS; const int base = copy*PADDED_NUM_ATOMS;
for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) { for (int particle = GLOBAL_ID; particle < NUM_ATOMS; particle += GLOBAL_SIZE) {
int index = base+order[particle]; int index = base+order[particle];
dstVel[particle] = srcVel[index]; dstVel[particle] = srcVel[index];
mixed4 posq = srcPos[index]; mixed4 posq = srcPos[index];
...@@ -203,10 +203,10 @@ extern "C" __global__ void copyDataToContext(mixed4* srcVel, mixed4* dstVel, mix ...@@ -203,10 +203,10 @@ extern "C" __global__ void copyDataToContext(mixed4* srcVel, mixed4* dstVel, mix
/** /**
* Copy a set of positions, velocities, and forces from the context to the integrator's arrays. * Copy a set of positions, velocities, and forces from the context to the integrator's arrays.
*/ */
extern "C" __global__ void copyDataFromContext(long long* srcForce, long long* dstForce, mixed4* srcVel, mixed4* dstVel, KERNEL void copyDataFromContext(GLOBAL mm_long* srcForce, GLOBAL mm_long* dstForce, GLOBAL mixed4* srcVel, GLOBAL mixed4* dstVel,
real4* srcPos, mixed4* dstPos, int* order, int copy) { GLOBAL real4* srcPos, GLOBAL mixed4* dstPos, GLOBAL int* order, int copy) {
const int base = copy*PADDED_NUM_ATOMS; const int base = copy*PADDED_NUM_ATOMS;
for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) { for (int particle = GLOBAL_ID; particle < NUM_ATOMS; particle += GLOBAL_SIZE) {
int index = order[particle]; int index = order[particle];
dstForce[base*3+index] = srcForce[particle]; dstForce[base*3+index] = srcForce[particle];
dstForce[base*3+index+PADDED_NUM_ATOMS] = srcForce[particle+PADDED_NUM_ATOMS]; dstForce[base*3+index+PADDED_NUM_ATOMS] = srcForce[particle+PADDED_NUM_ATOMS];
...@@ -223,8 +223,8 @@ extern "C" __global__ void copyDataFromContext(long long* srcForce, long long* d ...@@ -223,8 +223,8 @@ extern "C" __global__ void copyDataFromContext(long long* srcForce, long long* d
/** /**
* Atom positions in one copy have been modified. Apply the same offsets to all the other copies. * Atom positions in one copy have been modified. Apply the same offsets to all the other copies.
*/ */
extern "C" __global__ void applyCellTranslations(mixed4* posq, real4* movedPos, int* order, int movedCopy) { KERNEL void applyCellTranslations(GLOBAL mixed4* posq, GLOBAL real4* movedPos, GLOBAL int* order, int movedCopy) {
for (int particle = blockIdx.x*blockDim.x+threadIdx.x; particle < NUM_ATOMS; particle += blockDim.x*gridDim.x) { for (int particle = GLOBAL_ID; particle < NUM_ATOMS; particle += GLOBAL_SIZE) {
int index = order[particle]; int index = order[particle];
real4 p = movedPos[particle]; real4 p = movedPos[particle];
mixed4 delta = make_mixed4(p.x, p.y, p.z, p.w)-posq[movedCopy*PADDED_NUM_ATOMS+index]; mixed4 delta = make_mixed4(p.x, p.y, p.z, p.w)-posq[movedCopy*PADDED_NUM_ATOMS+index];
......
__device__ mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) { DEVICE mixed3 multiplyComplexRealPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2r-c1.y*c2i; return c1.x*c2r-c1.y*c2i;
} }
__device__ mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) { DEVICE mixed3 multiplyComplexImagPart(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2i+c1.y*c2r; return c1.x*c2i+c1.y*c2r;
} }
__device__ mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) { DEVICE mixed3 multiplyComplexRealPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2r+c1.y*c2i; return c1.x*c2r+c1.y*c2i;
} }
__device__ mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) { DEVICE mixed3 multiplyComplexImagPartConj(mixed2 c1, mixed3 c2r, mixed3 c2i) {
return c1.x*c2i-c1.y*c2r; return c1.x*c2i-c1.y*c2r;
} }
/** /**
* Compute the contracted positions * Compute the contracted positions
*/ */
extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) { KERNEL void contractPositions(GLOBAL mixed4* posq, GLOBAL mixed4* contracted) {
const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES; const int numBlocks = (GLOBAL_SIZE)/NUM_COPIES;
const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES); const int blockStart = NUM_COPIES*(LOCAL_ID/NUM_COPIES);
const int indexInBlock = threadIdx.x-blockStart; const int indexInBlock = LOCAL_ID-blockStart;
__shared__ mixed3 q[2*THREAD_BLOCK_SIZE]; LOCAL mixed3 q[2*THREAD_BLOCK_SIZE];
__shared__ mixed3 temp[2*THREAD_BLOCK_SIZE]; LOCAL mixed3 temp[2*THREAD_BLOCK_SIZE];
__shared__ mixed2 w1[NUM_COPIES]; LOCAL mixed2 w1[NUM_COPIES];
__shared__ mixed2 w2[NUM_CONTRACTED_COPIES]; LOCAL mixed2 w2[NUM_CONTRACTED_COPIES];
mixed3* qreal = &q[blockStart]; LOCAL_ARG mixed3* qreal = &q[blockStart];
mixed3* qimag = &q[blockStart+blockDim.x]; LOCAL_ARG mixed3* qimag = &q[blockStart+LOCAL_SIZE];
mixed3* tempreal = &temp[blockStart]; LOCAL_ARG mixed3* tempreal = &temp[blockStart];
mixed3* tempimag = &temp[blockStart+blockDim.x]; LOCAL_ARG mixed3* tempimag = &temp[blockStart+LOCAL_SIZE];
if (threadIdx.x < NUM_COPIES) if (LOCAL_ID < NUM_COPIES)
w1[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES)); w1[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
if (threadIdx.x < NUM_CONTRACTED_COPIES) if (LOCAL_ID < NUM_CONTRACTED_COPIES)
w2[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES), sin(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES)); w2[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES), sin(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES));
__syncthreads(); SYNC_THREADS;
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
// Load the particle position. // Load the particle position.
mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS]; mixed4 particlePosq = posq[particle+indexInBlock*PADDED_NUM_ATOMS];
...@@ -43,8 +43,8 @@ extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) { ...@@ -43,8 +43,8 @@ extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) {
// Forward FFT. // Forward FFT.
__syncthreads(); SYNC_THREADS;
mixed2* w = w1; LOCAL_ARG mixed2* w = w1;
FFT_Q_FORWARD FFT_Q_FORWARD
if (NUM_CONTRACTED_COPIES > 1) { if (NUM_CONTRACTED_COPIES > 1) {
// Compress the data to remove high frequencies. // Compress the data to remove high frequencies.
...@@ -52,12 +52,12 @@ extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) { ...@@ -52,12 +52,12 @@ extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) {
int start = (NUM_CONTRACTED_COPIES+1)/2; int start = (NUM_CONTRACTED_COPIES+1)/2;
tempreal[indexInBlock] = qreal[indexInBlock]; tempreal[indexInBlock] = qreal[indexInBlock];
tempimag[indexInBlock] = qimag[indexInBlock]; tempimag[indexInBlock] = qimag[indexInBlock];
__syncthreads(); SYNC_THREADS;
if (indexInBlock < NUM_CONTRACTED_COPIES) { if (indexInBlock < NUM_CONTRACTED_COPIES) {
qreal[indexInBlock] = tempreal[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)]; qreal[indexInBlock] = tempreal[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
qimag[indexInBlock] = tempimag[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)]; qimag[indexInBlock] = tempimag[indexInBlock < start ? indexInBlock : indexInBlock+(NUM_COPIES-NUM_CONTRACTED_COPIES)];
} }
__syncthreads(); SYNC_THREADS;
w = w2; w = w2;
FFT_Q_BACKWARD FFT_Q_BACKWARD
} }
...@@ -72,25 +72,25 @@ extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) { ...@@ -72,25 +72,25 @@ extern "C" __global__ void contractPositions(mixed4* posq, mixed4* contracted) {
/** /**
* Apply the contracted forces to all copies. * Apply the contracted forces to all copies.
*/ */
extern "C" __global__ void contractForces(long long* force, long long* contracted) { KERNEL void contractForces(GLOBAL mm_long* force, GLOBAL mm_long* contracted) {
const int numBlocks = (blockDim.x*gridDim.x)/NUM_COPIES; const int numBlocks = (GLOBAL_SIZE)/NUM_COPIES;
const int blockStart = NUM_COPIES*(threadIdx.x/NUM_COPIES); const int blockStart = NUM_COPIES*(LOCAL_ID/NUM_COPIES);
const int indexInBlock = threadIdx.x-blockStart; const int indexInBlock = LOCAL_ID-blockStart;
const mixed forceScale = 1/(mixed) 0x100000000; const mixed forceScale = 1/(mixed) 0x100000000;
__shared__ mixed3 f[2*THREAD_BLOCK_SIZE]; LOCAL mixed3 f[2*THREAD_BLOCK_SIZE];
__shared__ mixed3 temp[2*THREAD_BLOCK_SIZE]; LOCAL mixed3 temp[2*THREAD_BLOCK_SIZE];
__shared__ mixed2 w1[NUM_COPIES]; LOCAL mixed2 w1[NUM_COPIES];
__shared__ mixed2 w2[NUM_CONTRACTED_COPIES]; LOCAL mixed2 w2[NUM_CONTRACTED_COPIES];
mixed3* freal = &f[blockStart]; LOCAL_ARG mixed3* freal = &f[blockStart];
mixed3* fimag = &f[blockStart+blockDim.x]; LOCAL_ARG mixed3* fimag = &f[blockStart+LOCAL_SIZE];
mixed3* tempreal = &temp[blockStart]; LOCAL_ARG mixed3* tempreal = &temp[blockStart];
mixed3* tempimag = &temp[blockStart+blockDim.x]; LOCAL_ARG mixed3* tempimag = &temp[blockStart+LOCAL_SIZE];
if (threadIdx.x < NUM_COPIES) if (LOCAL_ID < NUM_COPIES)
w1[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES)); w1[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_COPIES), sin(-indexInBlock*2*M_PI/NUM_COPIES));
if (threadIdx.x < NUM_CONTRACTED_COPIES) if (LOCAL_ID < NUM_CONTRACTED_COPIES)
w2[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES), sin(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES)); w2[indexInBlock] = make_mixed2(cos(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES), sin(-indexInBlock*2*M_PI/NUM_CONTRACTED_COPIES));
__syncthreads(); SYNC_THREADS;
for (int particle = (blockIdx.x*blockDim.x+threadIdx.x)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) { for (int particle = (GLOBAL_ID)/NUM_COPIES; particle < NUM_ATOMS; particle += numBlocks) {
// Load the force. // Load the force.
int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3; int forceIndex = particle+indexInBlock*PADDED_NUM_ATOMS*3;
...@@ -98,11 +98,11 @@ extern "C" __global__ void contractForces(long long* force, long long* contracte ...@@ -98,11 +98,11 @@ extern "C" __global__ void contractForces(long long* force, long long* contracte
freal[indexInBlock] = make_mixed3(contracted[forceIndex]*forceScale, contracted[forceIndex+PADDED_NUM_ATOMS]*forceScale, contracted[forceIndex+PADDED_NUM_ATOMS*2]*forceScale); freal[indexInBlock] = make_mixed3(contracted[forceIndex]*forceScale, contracted[forceIndex+PADDED_NUM_ATOMS]*forceScale, contracted[forceIndex+PADDED_NUM_ATOMS*2]*forceScale);
fimag[indexInBlock] = make_mixed3(0); fimag[indexInBlock] = make_mixed3(0);
} }
__syncthreads(); SYNC_THREADS;
// Forward FFT. // Forward FFT.
mixed2* w = w2; LOCAL_ARG mixed2* w = w2;
if (NUM_CONTRACTED_COPIES > 1) { if (NUM_CONTRACTED_COPIES > 1) {
FFT_F_FORWARD FFT_F_FORWARD
} }
...@@ -113,19 +113,19 @@ extern "C" __global__ void contractForces(long long* force, long long* contracte ...@@ -113,19 +113,19 @@ extern "C" __global__ void contractForces(long long* force, long long* contracte
int end = NUM_COPIES-NUM_CONTRACTED_COPIES+start; int end = NUM_COPIES-NUM_CONTRACTED_COPIES+start;
tempreal[indexInBlock] = freal[indexInBlock]; tempreal[indexInBlock] = freal[indexInBlock];
tempimag[indexInBlock] = fimag[indexInBlock]; tempimag[indexInBlock] = fimag[indexInBlock];
__syncthreads(); SYNC_THREADS;
if (indexInBlock >= start) { if (indexInBlock >= start) {
freal[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempreal[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]); freal[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempreal[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
fimag[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempimag[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]); fimag[indexInBlock] = (indexInBlock < end ? make_mixed3(0) : tempimag[indexInBlock-(NUM_COPIES-NUM_CONTRACTED_COPIES)]);
} }
__syncthreads(); SYNC_THREADS;
w = w1; w = w1;
FFT_F_BACKWARD FFT_F_BACKWARD
// Store results. // Store results.
force[forceIndex] += (long long) (FORCE_SCALE*freal[indexInBlock].x); force[forceIndex] += (mm_long) (FORCE_SCALE*freal[indexInBlock].x);
force[forceIndex+PADDED_NUM_ATOMS] += (long long) (FORCE_SCALE*freal[indexInBlock].y); force[forceIndex+PADDED_NUM_ATOMS] += (mm_long) (FORCE_SCALE*freal[indexInBlock].y);
force[forceIndex+PADDED_NUM_ATOMS*2] += (long long) (FORCE_SCALE*freal[indexInBlock].z); force[forceIndex+PADDED_NUM_ATOMS*2] += (mm_long) (FORCE_SCALE*freal[indexInBlock].z);
} }
} }
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# The source is organized into subdirectories, but we handle them all from # The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS. # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
SET(OPENMM_SOURCE_SUBDIRS .) SET(OPENMM_SOURCE_SUBDIRS . ../common)
# Collect up information about the version of the OpenMM library we're building # Collect up information about the version of the OpenMM library we're building
...@@ -59,33 +59,25 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS}) ...@@ -59,33 +59,25 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
ENDFOREACH(subdir) ENDFOREACH(subdir)
SET(COMMON_KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/../common/src/CommonRpmdKernelSources.cpp)
SET(SOURCE_FILES ${SOURCE_FILES} ${COMMON_KERNELS_CPP})
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/../common/src)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/include) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/include)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/src) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/cuda/src)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/platforms/cuda/src) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/platforms/cuda/src)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/common/include) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_SOURCE_DIR}/platforms/common/include)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_BINARY_DIR}/platforms/common/src)
# Set variables needed for encoding kernel sources into a C++ class INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/../common/src)
SET(KERNEL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
SET(KERNEL_SOURCE_CLASS CudaRpmdKernelSources)
SET(KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.cpp)
SET(KERNELS_H ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.h)
SET(SOURCE_FILES ${SOURCE_FILES} ${KERNELS_CPP} ${KERNELS_H})
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/src)
# Create the library # Create the library
INCLUDE_DIRECTORIES(${CUDA_TOOLKIT_INCLUDE}) INCLUDE_DIRECTORIES(${CUDA_TOOLKIT_INCLUDE})
FILE(GLOB CUDA_KERNELS ${KERNEL_SOURCE_DIR}/kernels/*.cu) SET_SOURCE_FILES_PROPERTIES(${COMMON_KERNELS_CPP} PROPERTIES GENERATED TRUE)
ADD_CUSTOM_COMMAND(OUTPUT ${KERNELS_CPP} ${KERNELS_H}
COMMAND ${CMAKE_COMMAND}
ARGS -D KERNEL_SOURCE_DIR=${KERNEL_SOURCE_DIR} -D KERNELS_CPP=${KERNELS_CPP} -D KERNELS_H=${KERNELS_H} -D KERNEL_SOURCE_CLASS=${KERNEL_SOURCE_CLASS} -D KERNEL_FILE_EXTENSION=cu -P ${CMAKE_SOURCE_DIR}/cmake_modules/EncodeKernelFiles.cmake
DEPENDS ${CUDA_KERNELS}
)
SET_SOURCE_FILES_PROPERTIES(${KERNELS_CPP} ${KERNELS_H} PROPERTIES GENERATED TRUE)
ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES}) ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
ADD_DEPENDENCIES(${SHARED_TARGET} RpmdCommonKernels)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB}) TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME} ${PTHREADS_LIB})
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}CUDA) TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${OPENMM_LIBRARY_NAME}CUDA)
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. * * Portions copyright (c) 2011-2021 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -27,7 +27,8 @@ ...@@ -27,7 +27,8 @@
#include <exception> #include <exception>
#include "CudaRpmdKernelFactory.h" #include "CudaRpmdKernelFactory.h"
#include "CudaRpmdKernels.h" #include "CommonRpmdKernels.h"
#include "CudaContext.h"
#include "openmm/internal/windowsExport.h" #include "openmm/internal/windowsExport.h"
#include "openmm/internal/ContextImpl.h" #include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h" #include "openmm/OpenMMException.h"
...@@ -61,6 +62,6 @@ extern "C" OPENMM_EXPORT void registerRPMDCudaKernelFactories() { ...@@ -61,6 +62,6 @@ extern "C" OPENMM_EXPORT void registerRPMDCudaKernelFactories() {
KernelImpl* CudaRpmdKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const { KernelImpl* CudaRpmdKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
CudaContext& cl = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData())->contexts[0]; CudaContext& cl = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData())->contexts[0];
if (name == IntegrateRPMDStepKernel::Name()) if (name == IntegrateRPMDStepKernel::Name())
return new CudaIntegrateRPMDStepKernel(name, platform, cl); return new CommonIntegrateRPMDStepKernel(name, platform, cl);
throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str()); throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
} }
#ifndef CUDA_RPMD_KERNELS_H_
#define CUDA_RPMD_KERNELS_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2018 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "openmm/RpmdKernels.h"
#include "CudaContext.h"
#include "CudaArray.h"
#include <map>
namespace OpenMM {
/**
* This kernel is invoked by RPMDIntegrator to take one time step, and to get and
* set the state of system copies.
*/
class CudaIntegrateRPMDStepKernel : public IntegrateRPMDStepKernel {
public:
CudaIntegrateRPMDStepKernel(const std::string& name, const Platform& platform, CudaContext& cu) :
IntegrateRPMDStepKernel(name, platform), cu(cu) {
}
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param integrator the RPMDIntegrator this kernel will be used for
*/
void initialize(const System& system, const RPMDIntegrator& integrator);
/**
* Execute the kernel.
*
* @param context the context in which to execute this kernel
* @param integrator the RPMDIntegrator this kernel is being used for
* @param forcesAreValid if the context has been modified since the last time step, this will be
* false to show that cached forces are invalid and must be recalculated
*/
void execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid);
/**
* Compute the kinetic energy.
*
* @param context the context in which to execute this kernel
* @param integrator the RPMDIntegrator this kernel is being used for
*/
double computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator);
/**
* Get the positions of all particles in one copy of the system.
*/
void setPositions(int copy, const std::vector<Vec3>& positions);
/**
* Get the velocities of all particles in one copy of the system.
*/
void setVelocities(int copy, const std::vector<Vec3>& velocities);
/**
* Copy positions and velocities for one copy into the context.
*/
void copyToContext(int copy, ContextImpl& context);
private:
void computeForces(ContextImpl& context);
std::string createFFT(int size, const std::string& variable, bool forward);
CudaContext& cu;
int numCopies, numParticles, workgroupSize;
std::map<int, int> groupsByCopies;
int groupsNotContracted;
CudaArray forces;
CudaArray positions;
CudaArray velocities;
CudaArray contractedForces;
CudaArray contractedPositions;
CUfunction pileKernel, stepKernel, velocitiesKernel, copyToContextKernel, copyFromContextKernel, translateKernel;
std::map<int, CUfunction> positionContractionKernels;
std::map<int, CUfunction> forceContractionKernels;
};
} // namespace OpenMM
#endif /*CUDA_RPMD_KERNELS_H_*/
...@@ -5,6 +5,8 @@ ...@@ -5,6 +5,8 @@
ENABLE_TESTING() ENABLE_TESTING()
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${OPENMM_DIR}/plugins/rpmd/tests)
INCLUDE_DIRECTORIES(${OPENMM_DIR}/platforms/cuda/tests)
# Automatically create tests using files named "Test*.cpp" # Automatically create tests using files named "Test*.cpp"
FILE(GLOB TEST_PROGS "*Test*.cpp") FILE(GLOB TEST_PROGS "*Test*.cpp")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment