Converted AMOEBA to common platform (#3120)

* Began converting AMOEBA to common platform
* Beginning of OpenCL platform for AMOEBA
* Converted AmoebaVdwForce to common platform
* Cleaned up reference AMOEBA tests
* Began converting AmoebaMultipoleForce to common platform
* Continue converting AmoebaMultipoleForce to common platform
* Bug fixes
* Bug fix
* Continue converting AmoebaMultipoleForce to common platform
* Converting AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce to common platform
* Converting AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce to common platform
* Creating OpenCL version of AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce
* Creating OpenCL version of AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce
* Creating OpenCL version of AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce
* Converted arrays from real3 to real
* Bug fix to OpenCL AmoebaGeneralizedKirkwoodForce
* Fixes for AMD GPUs
* Began converting HippoNonbondedForce to common platform
* Continuing to convert HippoNonbondedForce to common platform
* Continuing to convert HippoNonbondedForce to common platform
* Working on unifying PME kernels
* Fixed error on devices without 64 bit atomics
* Unified PME kernels
* Converted HippoNonbondedForce to common platform
* Creating OpenCL implementation of HippoNonbondedForce
* Continuing OpenCL implementation of HippoNonbondedForce
* Mostly finished OpenCL implementation of HippoNonbondedForce
* Eliminated three component vector types in host code
* Fix errors on CPU OpenCL
* Skip double precision tests for AMOEBA on OpenCL
* Bug fixes
* Bug fixes
* Fixed compilation error

Converted AMOEBA to common platform (#3120)
* Began converting AMOEBA to common platform
* Beginning of OpenCL platform for AMOEBA
* Converted AmoebaVdwForce to common platform
* Cleaned up reference AMOEBA tests
* Began converting AmoebaMultipoleForce to common platform
* Continue converting AmoebaMultipoleForce to common platform
* Bug fixes
* Bug fix
* Continue converting AmoebaMultipoleForce to common platform
* Converting AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce to common platform
* Converting AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce to common platform
* Creating OpenCL version of AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce
* Creating OpenCL version of AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce
* Creating OpenCL version of AmoebaMultipoleForce and AmoebaGeneralizedKirkwoodForce
* Converted arrays from real3 to real
* Bug fix to OpenCL AmoebaGeneralizedKirkwoodForce
* Fixes for AMD GPUs
* Began converting HippoNonbondedForce to common platform
* Continuing to convert HippoNonbondedForce to common platform
* Continuing to convert HippoNonbondedForce to common platform
* Working on unifying PME kernels
* Fixed error on devices without 64 bit atomics
* Unified PME kernels
* Converted HippoNonbondedForce to common platform
* Creating OpenCL implementation of HippoNonbondedForce
* Continuing OpenCL implementation of HippoNonbondedForce
* Mostly finished OpenCL implementation of HippoNonbondedForce
* Eliminated three component vector types in host code
* Fix errors on CPU OpenCL
* Skip double precision tests for AMOEBA on OpenCL
* Bug fixes
* Bug fixes
* Fixed compilation error
8e8923a7 · Peter Eastman · GitHub · 393a4dbd · 8e8923a7 · 8e8923a7
Unverified Commit 8e8923a7 authored May 22, 2021 by Peter Eastman Committed by GitHub May 22, 2021
20 changed files
--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -681,6 +681,19 @@ void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSi
    }
 }

+int OpenCLContext::computeThreadBlockSize(double memory) const { // Picks a work-group size from the device's local memory size; `memory` is presumably the local memory needed per thread, in bytes (TODO confirm with callers).
+    int maxShared = device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(); // local memory size reported by the device
+    // On some implementations, more local memory gets used than we calculate by
+    // adding up the sizes of the fields.  To be safe, include a factor of 0.5.
+    int max = (int) (0.5*maxShared/memory); // thread count that fits in half of the reported local memory
+    if (max < 64)
+        return 32; // fewer than 64 threads fit: fall back to a fixed size of 32
+    int threads = 64;
+    while (threads+64 < max) // grow in steps of 64 while the next step stays strictly below max
+        threads += 64;
+    return threads; // always a multiple of 64 on this path
+}
+
 void OpenCLContext::clearBuffer(ArrayInterface& array) {
    clearBuffer(unwrap(array).getDeviceBuffer(), array.getSize()*array.getElementSize());
 }

--- a/platforms/opencl/src/OpenCLKernel.cpp
+++ b/platforms/opencl/src/OpenCLKernel.cpp
@@ -26,6 +26,7 @@

 #include "OpenCLKernel.h"
 #include "openmm/common/ComputeArray.h"
+#include "openmm/internal/AssertionUtilities.h"

 using namespace OpenMM;
 using namespace std;
@@ -37,6 +38,10 @@ string OpenCLKernel::getName() const {
    return kernel.getInfo<CL_KERNEL_FUNCTION_NAME>();
 }

+int OpenCLKernel::getMaxBlockSize() const { // Returns the CL_KERNEL_WORK_GROUP_SIZE value OpenCL reports for this kernel on the context's device.
+    return kernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
+}
+
 void OpenCLKernel::execute(int threads, int blockSize) {
    // Set args that are specified by OpenCLArrays.  We can't do this earlier, because it's
    // possible resize() will get called on an array, causing its internal storage to be
@@ -65,10 +70,12 @@ void OpenCLKernel::addEmptyArg() {
 }

 void OpenCLKernel::setArrayArg(int index, ArrayInterface& value) {
+    ASSERT_VALID_INDEX(index, arrayArgs); // presumably rejects an index outside arrayArgs before the write below — confirm in AssertionUtilities.h
    arrayArgs[index] = &context.unwrap(value);
 }

 void OpenCLKernel::setPrimitiveArg(int index, const void* value, int size) {
+    ASSERT_VALID_INDEX(index, arrayArgs);
    // The const_cast is needed because of a bug in the OpenCL C++ wrappers.  clSetKernelArg()
    // declares the value to be const, but the C++ wrapper doesn't.
    kernel.setArg(index, size, const_cast<void*>(value));

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -91,6 +91,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
    }
    pinnedCountBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, sizeof(int));
    pinnedCountMemory = (int*) context.getQueue().enqueueMapBuffer(*pinnedCountBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(int));
+    setKernelSource(deviceIsCpu ? OpenCLKernelSources::nonbonded_cpu : OpenCLKernelSources::nonbonded);
 }

 OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
@@ -127,7 +128,7 @@ void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic

 void OpenCLNonbondedUtilities::addParameter(ComputeParameterInfo parameter) {
    parameters.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
-            parameter.getSize(), context.unwrap(parameter.getArray()).getDeviceBuffer()));
+            parameter.getSize(), context.unwrap(parameter.getArray()).getDeviceBuffer(), parameter.isConstant())); // new: forward the isConstant() flag into the stored ParameterInfo
 }

 void OpenCLNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
@@ -136,7 +137,7 @@ void OpenCLNonbondedUtilities::addParameter(const ParameterInfo& parameter) {

 void OpenCLNonbondedUtilities::addArgument(ComputeParameterInfo parameter) {
    arguments.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
-            parameter.getSize(), context.unwrap(parameter.getArray()).getDeviceBuffer()));
+            parameter.getSize(), context.unwrap(parameter.getArray()).getDeviceBuffer(), parameter.isConstant())); // new: forward the isConstant() flag into the stored ParameterInfo
 }

 void OpenCLNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
@@ -556,97 +557,108 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    const string suffixes[] = {"x", "y", "z", "w"};
    stringstream localData;
    int localDataSize = 0;
-    for (int i = 0; i < (int) params.size(); i++) {
-        if (params[i].getNumComponents() == 1)
-            localData<<params[i].getType()<<" "<<params[i].getName()<<";\n";
+    for (const ParameterInfo& param : params) {
+        if (param.getNumComponents() == 1)
+            localData<<param.getType()<<" "<<param.getName()<<";\n";
        else {
-            for (int j = 0; j < params[i].getNumComponents(); ++j)
-                localData<<params[i].getComponentType()<<" "<<params[i].getName()<<"_"<<suffixes[j]<<";\n";
+            for (int j = 0; j < param.getNumComponents(); ++j)
+                localData<<param.getComponentType()<<" "<<param.getName()<<"_"<<suffixes[j]<<";\n";
        }
-        localDataSize += params[i].getSize();
+        localDataSize += param.getSize();
    }
    replacements["ATOM_PARAMETER_DATA"] = localData.str();
    stringstream args;
-    for (int i = 0; i < (int) params.size(); i++) {
-        args << ", __global const ";
-        args << params[i].getType();
+    for (const ParameterInfo& param : params) {
+        args << ", __global ";
+        if (param.isConstant())
+            args << "const ";
+        if (param.getNumComponents() == 3)
+            args << param.getComponentType();
+        else
+            args << param.getType();
        args << "* restrict global_";
-        args << params[i].getName();
+        args << param.getName();
    }
-    for (int i = 0; i < (int) arguments.size(); i++) {
-        if (arguments[i].getMemory().getInfo<CL_MEM_TYPE>() == CL_MEM_OBJECT_IMAGE2D) {
+    for (const ParameterInfo& arg : arguments) {
+        if (arg.getMemory().getInfo<CL_MEM_TYPE>() == CL_MEM_OBJECT_IMAGE2D) {
            args << ", __read_only image2d_t ";
-            args << arguments[i].getName();
+            args << arg.getName();
        }
        else {
-            if ((arguments[i].getMemory().getInfo<CL_MEM_FLAGS>() & CL_MEM_READ_ONLY) == 0)
-                args << ", __global const ";
+            if ((arg.getMemory().getInfo<CL_MEM_FLAGS>() & CL_MEM_READ_ONLY) == 0) {
+                args << ", __global ";
+                if (arg.isConstant())
+                    args << "const ";
+            }
            else
                args << ", __constant ";
-            args << arguments[i].getType();
+            args << arg.getType();
            args << "* restrict ";
-            args << arguments[i].getName();
+            args << arg.getName();
        }
    }
    if (energyParameterDerivatives.size() > 0)
        args << ", __global mixed* restrict energyParamDerivs";
    replacements["PARAMETER_ARGUMENTS"] = args.str();
    stringstream loadLocal1;
-    for (int i = 0; i < (int) params.size(); i++) {
-        if (params[i].getNumComponents() == 1) {
-            loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<" = "<<params[i].getName()<<"1;\n";
+    for (const ParameterInfo& param : params) {
+        if (param.getNumComponents() == 1) {
+            loadLocal1<<"localData[localAtomIndex]."<<param.getName()<<" = "<<param.getName()<<"1;\n";
        }
        else {
-            for (int j = 0; j < params[i].getNumComponents(); ++j)
-                loadLocal1<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = "<<params[i].getName()<<"1."<<suffixes[j]<<";\n";
+            for (int j = 0; j < param.getNumComponents(); ++j)
+                loadLocal1<<"localData[localAtomIndex]."<<param.getName()<<"_"<<suffixes[j]<<" = "<<param.getName()<<"1."<<suffixes[j]<<";\n";
        }
    }
    replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
+    replacements["DECLARE_LOCAL_PARAMETERS"] = "";
    stringstream loadLocal2;
-    for (int i = 0; i < (int) params.size(); i++) {
-        if (params[i].getNumComponents() == 1) {
-            loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
+    for (const ParameterInfo& param : params) {
+        if (param.getNumComponents() == 1) {
+            loadLocal2<<"localData[localAtomIndex]."<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
        }
        else {
-            loadLocal2<<params[i].getType()<<" temp_"<<params[i].getName()<<" = global_"<<params[i].getName()<<"[j];\n";
-            for (int j = 0; j < params[i].getNumComponents(); ++j)
-                loadLocal2<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = temp_"<<params[i].getName()<<"."<<suffixes[j]<<";\n";
+            if (param.getNumComponents() == 3)
+                loadLocal2<<param.getType()<<" temp_"<<param.getName()<<" = make_"<<param.getType()<<"(global_"<<param.getName()<<"[3*j], global_"<<param.getName()<<"[3*j+1], global_"<<param.getName()<<"[3*j+2]);\n";
+            else
+                loadLocal2<<param.getType()<<" temp_"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
+            for (int j = 0; j < param.getNumComponents(); ++j)
+                loadLocal2<<"localData[localAtomIndex]."<<param.getName()<<"_"<<suffixes[j]<<" = temp_"<<param.getName()<<"."<<suffixes[j]<<";\n";
        }
    }
    replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
    stringstream load1;
-    for (int i = 0; i < (int) params.size(); i++) {
-        load1 << params[i].getType();
-        load1 << " ";
-        load1 << params[i].getName();
-        load1 << "1 = global_";
-        load1 << params[i].getName();
-        load1 << "[atom1];\n";
+    for (const ParameterInfo& param : params) {
+        load1<<param.getType()<<" "<<param.getName()<<"1 = ";
+        if (param.getNumComponents() == 3)
+            load1<<"make_"<<param.getType()<<"(global_"<<param.getName()<<"[3*atom1], global_"<<param.getName()<<"[3*atom1+1], global_"<<param.getName()<<"[3*atom1+2]);\n";
+        else
+            load1<<"global_"<<param.getName()<<"[atom1];\n";
    }
    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
    stringstream load2j;
-    for (int i = 0; i < (int) params.size(); i++) {
-        if (params[i].getNumComponents() == 1) {
-            load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = localData[atom2]."<<params[i].getName()<<";\n";
+    for (const ParameterInfo& param : params) {
+        if (param.getNumComponents() == 1) {
+            load2j<<param.getType()<<" "<<param.getName()<<"2 = localData[atom2]."<<param.getName()<<";\n";
        }
        else {
-            load2j<<params[i].getType()<<" "<<params[i].getName()<<"2 = ("<<params[i].getType()<<") (";
-            for (int j = 0; j < params[i].getNumComponents(); ++j) {
+            load2j<<param.getType()<<" "<<param.getName()<<"2 = ("<<param.getType()<<") (";
+            for (int j = 0; j < param.getNumComponents(); ++j) {
                if (j > 0)
                    load2j<<", ";
-                load2j<<"localData[atom2]."<<params[i].getName()<<"_"<<suffixes[j];
+                load2j<<"localData[atom2]."<<param.getName()<<"_"<<suffixes[j];
            }
            load2j<<");\n";
        }
    }
    replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
    stringstream clearLocal;
-    for (int i = 0; i < (int) params.size(); i++) {
-        if (params[i].getNumComponents() == 1)
-            clearLocal<<"localData[localAtomIndex]."<<params[i].getName()<<" = 0;\n";
+    for (const ParameterInfo& param : params) {
+        if (param.getNumComponents() == 1)
+            clearLocal<<"localData[localAtomIndex]."<<param.getName()<<" = 0;\n";
        else
-            for (int j = 0; j < params[i].getNumComponents(); ++j)
-                clearLocal<<"localData[localAtomIndex]."<<params[i].getName()<<"_"<<suffixes[j]<<" = 0;\n";
+            for (int j = 0; j < param.getNumComponents(); ++j)
+                clearLocal<<"localData[localAtomIndex]."<<param.getName()<<"_"<<suffixes[j]<<" = 0;\n";
    }
    replacements["CLEAR_LOCAL_PARAMETERS"] = clearLocal.str();
    stringstream initDerivs;
@@ -659,7 +671,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    for (int i = 0; i < energyParameterDerivatives.size(); i++)
        for (int index = 0; index < numDerivs; index++)
            if (allParamDerivNames[index] == energyParameterDerivatives[i])
-                saveDerivs<<"energyParamDerivs[get_global_id(0)*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
+                saveDerivs<<"energyParamDerivs[GLOBAL_ID*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
    replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
    map<string, string> defines;
    if (useCutoff)
@@ -676,6 +688,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
        defines["INCLUDE_FORCES"] = "1";
    if (includeEnergy)
        defines["INCLUDE_ENERGY"] = "1";
+    defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
    defines["FORCE_WORK_GROUP_SIZE"] = context.intToString(forceThreadBlockSize);
    double maxCutoff = 0.0;
    for (int i = 0; i < 32; i++) {
@@ -700,12 +713,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
    defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
    if ((localDataSize/4)%2 == 0)
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
-    string file;
-    if (deviceIsCpu)
-        file = OpenCLKernelSources::nonbonded_cpu;
-    else
-        file = OpenCLKernelSources::nonbonded;
-    cl::Program program = context.createProgram(context.replaceStrings(file, replacements), defines);
+    cl::Program program = context.createProgram(context.replaceStrings(kernelSource, replacements), defines);
    cl::Kernel kernel(program, "computeNonbonded");

    // Set arguments to the Kernel.
@@ -730,13 +738,15 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
        kernel.setArg<cl::Buffer>(index++, blockBoundingBox.getDeviceBuffer());
        kernel.setArg<cl::Buffer>(index++, interactingAtoms.getDeviceBuffer());
    }
-    for (int i = 0; i < (int) params.size(); i++) {
-        kernel.setArg<cl::Memory>(index++, params[i].getMemory());
-    }
-    for (int i = 0; i < (int) arguments.size(); i++) {
-        kernel.setArg<cl::Memory>(index++, arguments[i].getMemory());
-    }
+    for (const ParameterInfo& param : params)
+        kernel.setArg<cl::Memory>(index++, param.getMemory());
+    for (const ParameterInfo& arg : arguments)
+        kernel.setArg<cl::Memory>(index++, arg.getMemory());
    if (energyParameterDerivatives.size() > 0)
        kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
    return kernel;
 }
+
+void OpenCLNonbondedUtilities::setKernelSource(const string& source) { // Replaces the kernel source text that createInteractionKernel() compiles.
+    kernelSource = source; // stored as a copy in the kernelSource member
+}
--- a/platforms/opencl/src/kernels/nonbonded.cl
+++ b/platforms/opencl/src/kernels/nonbonded.cl
@@ -34,6 +34,7 @@ __kernel void computeNonbonded(
    const unsigned int warp = get_global_id(0)/TILE_SIZE;
    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
    const unsigned int tbx = get_local_id(0) - tgx;
+    const unsigned int localAtomIndex = get_local_id(0);
    mixed energy = 0;
    INIT_DERIVATIVES
    __local AtomData localData[FORCE_WORK_GROUP_SIZE];
@@ -57,7 +58,6 @@ __kernel void computeNonbonded(
        if (x == y) {
            // This tile is on the diagonal.

-            const unsigned int localAtomIndex = get_local_id(0);
            localData[localAtomIndex].x = posq1.x;
            localData[localAtomIndex].y = posq1.y;
            localData[localAtomIndex].z = posq1.z;
@@ -105,7 +105,6 @@ __kernel void computeNonbonded(
        else {
            // This is an off-diagonal tile.

-            const unsigned int localAtomIndex = get_local_id(0);
            unsigned int j = y*TILE_SIZE + tgx;
            real4 tempPosq = posq[j];
            localData[localAtomIndex].x = tempPosq.x;
@@ -266,7 +265,6 @@ __kernel void computeNonbonded(

            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
-            const unsigned int localAtomIndex = get_local_id(0);
 #ifdef USE_CUTOFF
            unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
 #else

--- a/plugins/amoeba/CMakeLists.txt
+++ b/plugins/amoeba/CMakeLists.txt
@@ -124,6 +124,7 @@ ENDIF(OPENMM_BUILD_STATIC_LIB)
 # Which hardware platforms to build

 ADD_SUBDIRECTORY(platforms/reference)
+ADD_SUBDIRECTORY(platforms/common)

 IF(OPENMM_BUILD_CUDA_LIB)
    SET(OPENMM_BUILD_AMOEBA_CUDA_LIB ON CACHE BOOL "Build OpenMMAmoebaCuda library for Nvidia GPUs")
@@ -131,7 +132,12 @@ ELSE(OPENMM_BUILD_CUDA_LIB)
    SET(OPENMM_BUILD_AMOEBA_CUDA_LIB OFF CACHE BOOL "Build OpenMMAmoebaCuda library for Nvidia GPUs")
 ENDIF(OPENMM_BUILD_CUDA_LIB)

-SET(OPENMM_BUILD_AMOEBA_PATH)
+IF(OPENMM_BUILD_OPENCL_LIB) # the cache default for the AMOEBA OpenCL option follows the core OpenCL option
+    SET(OPENMM_BUILD_AMOEBA_OPENCL_LIB ON CACHE BOOL "Build OpenMMAmoebaOpenCL library")
+ELSE(OPENMM_BUILD_OPENCL_LIB)
+    SET(OPENMM_BUILD_AMOEBA_OPENCL_LIB OFF CACHE BOOL "Build OpenMMAmoebaOpenCL library")
+ENDIF(OPENMM_BUILD_OPENCL_LIB)
+
 SET(OPENMM_BUILD_AMOEBA_CUDA_PATH)
 IF(OPENMM_BUILD_AMOEBA_CUDA_LIB)
    ADD_SUBDIRECTORY(platforms/cuda)
@@ -139,6 +145,13 @@ IF(OPENMM_BUILD_AMOEBA_CUDA_LIB)
    SET(OPENMM_AMOEBA_CUDA_SOURCE_SUBDIRS . openmmapi olla platforms/cuda)
 ENDIF(OPENMM_BUILD_AMOEBA_CUDA_LIB)

+SET(OPENMM_BUILD_AMOEBA_OPENCL_PATH) # SET with no value clears the variable; it is filled in below only when the OpenCL plugin is built
+IF(OPENMM_BUILD_AMOEBA_OPENCL_LIB)
+    ADD_SUBDIRECTORY(platforms/opencl)
+    SET(OPENMM_BUILD_AMOEBA_OPENCL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/platforms/opencl)
+    SET(OPENMM_AMOEBA_OPENCL_SOURCE_SUBDIRS . openmmapi olla platforms/opencl)
+ENDIF(OPENMM_BUILD_AMOEBA_OPENCL_LIB)
+
 INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${SHARED_AMOEBA_TARGET})
 IF(OPENMM_BUILD_STATIC_LIB)
  INSTALL_TARGETS(/lib RUNTIME_DIRECTORY /lib ${STATIC_AMOEBA_TARGET})

--- a/plugins/amoeba/platforms/common/CMakeLists.txt
+++ b/plugins/amoeba/platforms/common/CMakeLists.txt
+# Encode the kernel sources into a C++ class.
+
+SET(KERNEL_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
+SET(KERNEL_SOURCE_CLASS CommonAmoebaKernelSources)
+SET(KERNELS_CPP ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.cpp) # generated files are written to the build tree
+SET(KERNELS_H ${CMAKE_CURRENT_BINARY_DIR}/src/${KERNEL_SOURCE_CLASS}.h)
+INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/src) # puts the directory holding the generated header on the include path
+FILE(GLOB COMMON_KERNELS ${KERNEL_SOURCE_DIR}/kernels/*.cc) # kernel sources use the .cc extension, matching KERNEL_FILE_EXTENSION below
+ADD_CUSTOM_COMMAND(OUTPUT ${KERNELS_CPP} ${KERNELS_H}
+    COMMAND ${CMAKE_COMMAND}
+    ARGS -D KERNEL_SOURCE_DIR=${KERNEL_SOURCE_DIR} -D KERNELS_CPP=${KERNELS_CPP} -D KERNELS_H=${KERNELS_H} -D KERNEL_SOURCE_CLASS=${KERNEL_SOURCE_CLASS} -D KERNEL_FILE_EXTENSION=cc -P ${CMAKE_SOURCE_DIR}/cmake_modules/EncodeKernelFiles.cmake
+    DEPENDS ${COMMON_KERNELS} # rerun the encoding step when any globbed kernel file changes
+)
+SET_SOURCE_FILES_PROPERTIES(${KERNELS_CPP} ${KERNELS_H} PROPERTIES GENERATED TRUE)
+ADD_CUSTOM_TARGET(AmoebaCommonKernels DEPENDS ${KERNELS_CPP} ${KERNELS_H}) # named target that drives generation of both files
--- a/plugins/amoeba/platforms/common/src/AmoebaCommonKernels.cpp
+++ b/plugins/amoeba/platforms/common/src/AmoebaCommonKernels.cpp
--- a/plugins/amoeba/platforms/common/src/AmoebaCommonKernels.h
+++ b/plugins/amoeba/platforms/common/src/AmoebaCommonKernels.h
--- a/plugins/amoeba/platforms/common/src/CommonAmoebaKernelSources.cpp.in
+++ b/plugins/amoeba/platforms/common/src/CommonAmoebaKernelSources.cpp.in
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CommonAmoebaKernelSources.h"
+
+using namespace OpenMM;
+using namespace std;
+
--- a/plugins/amoeba/platforms/common/src/CommonAmoebaKernelSources.h.in
+++ b/plugins/amoeba/platforms/common/src/CommonAmoebaKernelSources.h.in
+#ifndef OPENMM_COMMONAMOEBAKERNELSOURCES_H_
+#define OPENMM_COMMONAMOEBAKERNELSOURCES_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include <string>
+
+namespace OpenMM {
+
+/**
+ * This class is a central holding place for the source code of device kernels.
+ * The CMake build script inserts declarations into it based on the .cc files in the
+ * kernels subfolder.
+ */
+
+class CommonAmoebaKernelSources {
+public:
+@KERNEL_FILE_DECLARATIONS@ // template placeholder: the build script substitutes the generated declarations here
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_COMMONAMOEBAKERNELSOURCES_H_*/
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaGk.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaTorsionTorsionForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaTorsionTorsionForce.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaVdwForce1.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaVdwForce1.cu
 /**
 * Clear the forces, and compute the position to use for each atom based on the bond reduction factors.
 */
-extern "C" __global__ void prepareToComputeForce(unsigned long long* __restrict__ forceBuffers, real4* __restrict__ posq, const real4* __restrict__ tempPosq,
-        const int* __restrict__ bondReductionAtoms, const float* __restrict__ bondReductionFactors) {
-    for (unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < PADDED_NUM_ATOMS; atom += blockDim.x*gridDim.x) {
+KERNEL void prepareToComputeForce(GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL real4* RESTRICT posq, GLOBAL const real4* RESTRICT tempPosq,
+        GLOBAL const int* RESTRICT bondReductionAtoms, GLOBAL const float* RESTRICT bondReductionFactors) {
+    for (unsigned int atom = GLOBAL_ID; atom < PADDED_NUM_ATOMS; atom += GLOBAL_SIZE) {
        forceBuffers[atom] = 0;
        forceBuffers[atom+PADDED_NUM_ATOMS] = 0;
        forceBuffers[atom+PADDED_NUM_ATOMS*2] = 0;
@@ -19,27 +19,27 @@ extern "C" __global__ void prepareToComputeForce(unsigned long long* __restrict_
 /**
 * Spread the forces between atoms based on the bond reduction factors.
 */
-extern "C" __global__ void spreadForces(const unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ tempForceBuffers,
-        const int* __restrict__ bondReductionAtoms, const float* __restrict__ bondReductionFactors) {
-    for (unsigned int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < PADDED_NUM_ATOMS; atom1 += blockDim.x*gridDim.x) {
+KERNEL void spreadForces(GLOBAL const mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT tempForceBuffers,
+        GLOBAL const int* RESTRICT bondReductionAtoms, GLOBAL const float* RESTRICT bondReductionFactors) {
+    for (unsigned int atom1 = GLOBAL_ID; atom1 < PADDED_NUM_ATOMS; atom1 += GLOBAL_SIZE) { // GLOBAL_ID / GLOBAL_SIZE stand in for the CUDA index and stride expressions on the removed line
        int atom2 = bondReductionAtoms[atom1];
-        long long fx1 = forceBuffers[atom1];
-        long long fy1 = forceBuffers[atom1+PADDED_NUM_ATOMS];
-        long long fz1 = forceBuffers[atom1+PADDED_NUM_ATOMS*2];
+        mm_long fx1 = forceBuffers[atom1]; // x, y and z components are read PADDED_NUM_ATOMS elements apart
+        mm_long fy1 = forceBuffers[atom1+PADDED_NUM_ATOMS];
+        mm_long fz1 = forceBuffers[atom1+PADDED_NUM_ATOMS*2];
        if (atom1 != atom2) {
            double factor = (double) bondReductionFactors[atom1];
-            long long fx2 = (long long) ((1-factor)*fx1);
-            long long fy2 = (long long) ((1-factor)*fy1);
-            long long fz2 = (long long) ((1-factor)*fz1);
-            atomicAdd(&tempForceBuffers[atom2], static_cast<unsigned long long>(fx2));
-            atomicAdd(&tempForceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(fy2));
-            atomicAdd(&tempForceBuffers[atom2+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>(fz2));
-            fx1 = (long long) (factor*fx1);
-            fy1 = (long long) (factor*fy1);
-            fz1 = (long long) (factor*fz1);
+            mm_long fx2 = (mm_long) ((1-factor)*fx1); // the (1-factor) share of the force is handed to atom2
+            mm_long fy2 = (mm_long) ((1-factor)*fy1);
+            mm_long fz2 = (mm_long) ((1-factor)*fz1);
+            ATOMIC_ADD(&tempForceBuffers[atom2], (mm_ulong) fx2); // signed value cast to mm_ulong to match the buffer's element type
+            ATOMIC_ADD(&tempForceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) fy2);
+            ATOMIC_ADD(&tempForceBuffers[atom2+PADDED_NUM_ATOMS*2], (mm_ulong) fz2);
+            fx1 = (mm_long) (factor*fx1); // atom1 keeps the remaining `factor` share
+            fy1 = (mm_long) (factor*fy1);
+            fz1 = (mm_long) (factor*fz1);
        }
-        atomicAdd(&tempForceBuffers[atom1], static_cast<unsigned long long>(fx1));
-        atomicAdd(&tempForceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(fy1));
-        atomicAdd(&tempForceBuffers[atom1+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>(fz1));
+        ATOMIC_ADD(&tempForceBuffers[atom1], (mm_ulong) fx1); // atom1's contribution: the full force when atom1 == atom2, otherwise the scaled remainder
+        ATOMIC_ADD(&tempForceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) fy1);
+        ATOMIC_ADD(&tempForceBuffers[atom1+PADDED_NUM_ATOMS*2], (mm_ulong) fz1);
    }
 }
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaVdwForce2.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaVdwForce2.cu
--- a/plugins/amoeba/platforms/cuda/src/kernels/amoebaWcaForce.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/amoebaWcaForce.cu
@@ -6,35 +6,37 @@ typedef struct {
    float radius, epsilon, padding;
 } AtomData;

-inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const float2* __restrict__ radiusEpsilon) {
+inline DEVICE AtomData loadAtomData(int atom, GLOBAL const real4* RESTRICT posq, GLOBAL const float2* RESTRICT radiusEpsilon) { // Builds the AtomData for index 'atom': pos from posq[atom].xyz, radius/epsilon from radiusEpsilon[atom].x/.y.
+    AtomData data; // Only pos, radius and epsilon are assigned below; any other member is left for the caller to initialize.
    real4 atomPosq = posq[atom];
    data.pos = make_real3(atomPosq.x, atomPosq.y, atomPosq.z);
    float2 temp = radiusEpsilon[atom];
    data.radius = temp.x;
    data.epsilon = temp.y;
+    return data; // Returned by value; the previous version filled a caller-supplied AtomData& instead.
 }

-__device__ void initParticleParameters(float radius, float epsilon, real& rmixo, real& rmixh, real& emixo, real& emixh) {
+DEVICE void initParticleParameters(float radius, float epsilon, real* rmixo, real* rmixh, real* emixo, real* emixh) { // Computes four mixing parameters from radius/epsilon and the EPSO, EPSH, RMINO, RMINH constants; results are stored through the four output pointers.
    real sqrtEps = SQRT(epsilon);
    real denominator = SQRT(EPSO) + sqrtEps;
-    emixo = 4*EPSO*epsilon / (denominator*denominator);
+    *emixo = 4*EPSO*epsilon / (denominator*denominator); // 4*EPSO*epsilon / (sqrt(EPSO) + sqrt(epsilon))^2
    denominator = SQRT(EPSH) + sqrtEps;
-    emixh = 4*EPSH*epsilon / (denominator*denominator);
+    *emixh = 4*EPSH*epsilon / (denominator*denominator); // same form as emixo, with EPSH in place of EPSO
    real radius2 = radius*radius;
    real rmino2 = RMINO*RMINO; 
-    rmixo = 2*(rmino2*RMINO + radius2*radius) / (rmino2 + radius2);
+    *rmixo = 2*(rmino2*RMINO + radius2*radius) / (rmino2 + radius2); // 2*(RMINO^3 + radius^3) / (RMINO^2 + radius^2)
    real rminh2 = RMINH*RMINH;
-    rmixh = 2*(rminh2*RMINH + radius2*radius) / (rminh2+radius2);
+    *rmixh = 2*(rminh2*RMINH + radius2*radius) / (rminh2+radius2); // same form as rmixo, with RMINH in place of RMINO
 }

-__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real rmixo, real rmixh, real emixo, real emixh, real3& force, real& energy) {
+DEVICE void computeOneInteraction(AtomData atom1, AtomData atom2, real rmixo, real rmixh, real emixo, real emixh, real3* force, real* energy) {
    // get deltaR and r between 2 atoms
    
-    force = atom2.pos - atom1.pos;
-    real r2 = dot(force, force);
+    *force = atom2.pos - atom1.pos;
+    real r2 = dot(*force, *force);
    if (r2 <= 0) {
-        force = make_real3(0);
-        energy = 0;
+        *force = make_real3(0);
+        *energy = 0;
        return;
    }
    real rI = RSQRT(r2);
@@ -43,8 +45,8 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real rmi
    real sk = atom2.radius*SHCTD;
    real sk2 = sk*sk;
    if (atom1.radius >= (r+sk)) {
-        force = make_real3(0);
-        energy = 0;
+        *force = make_real3(0);
+        *energy = 0;
        return;
    }

@@ -183,30 +185,30 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real rmi
    de += mask2*(ah*rmixh7*M_PI*(dl+du)/(30*r2));
    sum += mask2*(irep+idisp);

-    energy = sum;
+    *energy = sum;
    de *= -AWATER*rI;
-    force *= de;
+    *force *= de;
 }

 /**
 * Compute WCA interaction.
 */
-extern "C" __global__ void computeWCAForce(unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer,
-        const real4* __restrict__ posq, unsigned int startTileIndex, unsigned int numTileIndices, const float2* __restrict__ radiusEpsilon) {
-    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
-    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
+KERNEL void computeWCAForce(GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mixed* RESTRICT energyBuffer,
+        GLOBAL const real4* RESTRICT posq, unsigned int startTileIndex, unsigned int numTileIndices, GLOBAL const float2* RESTRICT radiusEpsilon) { // Walks this thread group's share of tiles starting at startTileIndex, adding forces to forceBuffers and energy to energyBuffer.
+    unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE; // number of TILE_SIZE-wide thread groups in the launch (replaces blockDim.x*gridDim.x)
+    unsigned int warp = GLOBAL_ID/TILE_SIZE; // index of the TILE_SIZE-wide group this thread belongs to
    const unsigned int numTiles = numTileIndices;
-    unsigned int pos = (unsigned int) (startTileIndex+warp*(long long)numTiles/totalWarps);
-    unsigned int end = (unsigned int) (startTileIndex+(warp+1)*(long long)numTiles/totalWarps);
+    unsigned int pos = (unsigned int) (startTileIndex+warp*(mm_long)numTiles/totalWarps); // first tile for this group; the mm_long cast widens the product before the division
+    unsigned int end = (unsigned int) (startTileIndex+(warp+1)*(mm_long)numTiles/totalWarps); // one past the last tile for this group
    mixed energy = 0;
-    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    LOCAL AtomData localData[THREAD_BLOCK_SIZE]; // one AtomData slot per thread of the block (replaces the __shared__ array)
    
    do {
        // Extract the coordinates of this tile
        
-        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
-        const unsigned int tbx = threadIdx.x - tgx;
-        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
+        const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1); // this thread's position inside its tile (assumes TILE_SIZE is a power of two — confirm)
+        const unsigned int tbx = LOCAL_ID - tgx; // local index of the first thread of this tile
+        const unsigned int localGroupIndex = LOCAL_ID/TILE_SIZE; // which tile-sized group within the block
        int x, y;
        AtomData data;
        if (pos < end) {
@@ -217,12 +219,13 @@ extern "C" __global__ void computeWCAForce(unsigned long long* __restrict__ forc
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }
            unsigned int atom1 = x*TILE_SIZE + tgx;
-            loadAtomData(data, atom1, posq, radiusEpsilon);
-            loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, radiusEpsilon);
+            data = loadAtomData(atom1, posq, radiusEpsilon); // atom from block x, kept in a private variable
+            localData[LOCAL_ID] = loadAtomData(y*TILE_SIZE+tgx, posq, radiusEpsilon); // atom from block y, published in this thread's localData slot
            real emixo, emixh, rmixo, rmixh;
-            initParticleParameters(data.radius, data.epsilon, rmixo, rmixh, emixo, emixh);
+            initParticleParameters(data.radius, data.epsilon, &rmixo, &rmixh, &emixo, &emixh); // outputs now passed by address instead of by reference
            data.force = make_real3(0);
-            localData[threadIdx.x].force = make_real3(0);
+            localData[LOCAL_ID].force = make_real3(0);
+            SYNC_WARPS; // NOTE(review): presumably a warp-level barrier so all localData slots are written before other threads' slots are read below — confirm against the macro definition

            // Compute forces.

@@ -232,31 +235,32 @@ extern "C" __global__ void computeWCAForce(unsigned long long* __restrict__ forc
                if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                    real3 tempForce;
                    real tempEnergy;
-                    computeOneInteraction(data, localData[tbx+tj], rmixo, rmixh, emixo, emixh, tempForce, tempEnergy);
+                    computeOneInteraction(data, localData[tbx+tj], rmixo, rmixh, emixo, emixh, &tempForce, &tempEnergy); // first direction: data as atom1, using data's mixing parameters
                    data.force += tempForce;
                    localData[tbx+tj].force -= tempForce;
                    energy += (x == y ? 0.5f*tempEnergy : tempEnergy);
                    real emjxo, emjxh, rmjxo, rmjxh;
-                    initParticleParameters(localData[tbx+tj].radius, localData[tbx+tj].epsilon, rmjxo, rmjxh, emjxo, emjxh);
-                    computeOneInteraction(localData[tbx+tj], data, rmjxo, rmjxh, emjxo, emjxh, tempForce, tempEnergy);
+                    initParticleParameters(localData[tbx+tj].radius, localData[tbx+tj].epsilon, &rmjxo, &rmjxh, &emjxo, &emjxh); // mixing parameters recomputed for the other atom
+                    computeOneInteraction(localData[tbx+tj], data, rmjxo, rmjxh, emjxo, emjxh, &tempForce, &tempEnergy); // second direction: roles of the two atoms swapped
                    data.force -= tempForce;
                    localData[tbx+tj].force += tempForce;
                    energy += (x == y ? 0.5f*tempEnergy : tempEnergy);
                }
                tj = (tj+1) & (TILE_SIZE-1);
+                SYNC_WARPS; // NOTE(review): presumably orders this iteration's localData[...].force updates before the next iteration touches a different slot — confirm against the macro definition
            }
            unsigned int offset = x*TILE_SIZE + tgx;
-            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
-            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
-            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
+            ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (data.force.x*0x100000000))); // force scaled by 2^32 and converted to a 64-bit integer before the atomic add
+            ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (data.force.y*0x100000000))); // y components are stored PADDED_NUM_ATOMS entries after x
+            ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (data.force.z*0x100000000))); // z components are stored 2*PADDED_NUM_ATOMS entries after x
            if (x != y) {
                offset = y*TILE_SIZE + tgx;
-                atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
-                atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
-                atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].force.x*0x100000000))); // off-diagonal tiles also flush the force accumulated for the block-y atom
+                ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].force.y*0x100000000)));
+                ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].force.z*0x100000000)));
            }
        }
        pos++;
    } while (pos < end);
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] -= AWATER*energy;
+    energyBuffer[GLOBAL_ID] -= AWATER*energy; // each thread subtracts its AWATER-scaled energy sum from its own energyBuffer slot
 }
\ No newline at end of file
--- a/plugins/amoeba/platforms/cuda/src/kernels/bicubic.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/bicubic.cu
-__device__ void bicubic(real4 y, real4 y1i, real4 y2i, real4 y12i, real x1, real x1l, real x1u,
+DEVICE void bicubic(real4 y, real4 y1i, real4 y2i, real4 y12i, real x1, real x1l, real x1u,
                        real x2, real x2l, real x2u, real* energyOut, real* dang1Out, real* dang2Out) {
    real c[4][4];
    real d1 = x1u - x1l;

--- a/plugins/amoeba/platforms/common/src/kernels/gkEDiffPairForce.cc
+++ b/plugins/amoeba/platforms/common/src/kernels/gkEDiffPairForce.cc
--- a/plugins/amoeba/platforms/cuda/src/kernels/gkPairForce1.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/gkPairForce1.cu
@@ -4,15 +4,15 @@
 */

 #if defined F1
-__device__ void computeOneInteractionF1(AtomData2& atom1, volatile AtomData2& atom2, real& outputEnergy, real3& force) {
+DEVICE void computeOneInteractionF1(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force) {
 #elif defined F2
-__device__ void computeOneInteractionF2(AtomData2& atom1, volatile AtomData2& atom2, real& outputEnergy, real3& force) {
+DEVICE void computeOneInteractionF2(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force) {
 #elif defined T1
-__device__ void computeOneInteractionT1(AtomData2& atom1, volatile AtomData2& atom2, real3& torque) {
+DEVICE void computeOneInteractionT1(AtomData2 atom1, volatile AtomData2 atom2, real3* torque) {
 #elif defined T2
-__device__ void computeOneInteractionT2(AtomData2& atom1, volatile AtomData2& atom2, real3& torque) {
+DEVICE void computeOneInteractionT2(AtomData2 atom1, volatile AtomData2 atom2, real3* torque) {
 #elif defined B1 && defined B2
-__device__ void computeOneInteractionB1B2(AtomData2& atom1, volatile AtomData2& atom2) {
+DEVICE void computeOneInteractionB1B2(AtomData2 atom1, volatile AtomData2 atom2, real* bornForce1, real* bornForce2) {
 #endif

    const real fc = EPSILON_FACTOR*GK_FC;

--- a/plugins/amoeba/platforms/cuda/src/kernels/gkPairForce2.cu
+++ b/plugins/amoeba/platforms/cuda/src/kernels/gkPairForce2.cu
@@ -217,8 +217,8 @@
        atom1.quadrupoleYZ*(atom2.quadrupoleXX*gqxx29 + atom2.quadrupoleYY*gqyy29 + atom2.quadrupoleZZ*gqzz29 + 2*(atom2.quadrupoleXY*gqxy29 + atom2.quadrupoleXZ*gqxz29 + atom2.quadrupoleYZ*gqyz29)));

    dsumdrB1 *= 0.5f;
-    atom1.bornForce += atom2.bornRadius*dsumdrB1;
-    atom2.bornForce += atom1.bornRadius*dsumdrB1;
+    *bornForce1 += atom2.bornRadius*dsumdrB1;
+    *bornForce2 += atom1.bornRadius*dsumdrB1;
 #endif

    // unweighted 3rd reaction potential gradient tensor;
@@ -530,21 +530,21 @@
        trq2 -= (atom1.quadrupoleXZ*fidg11 + atom1.quadrupoleYZ*fidg12 + atom1.quadrupoleZZ*fidg13 -atom1.quadrupoleXX*fidg13-atom1.quadrupoleXY*fidg23-atom1.quadrupoleXZ*fidg33);
        trq3 -= (atom1.quadrupoleXX*fidg12 + atom1.quadrupoleXY*fidg22 + atom1.quadrupoleXZ*fidg23 -atom1.quadrupoleXY*fidg11-atom1.quadrupoleYY*fidg12-atom1.quadrupoleYZ*fidg13);

-        torque.x = trq1;
-        torque.y = trq2;
-        torque.z = trq3;
+        torque->x = trq1;
+        torque->y = trq2;
+        torque->z = trq3;

    } else {
-        torque.x = 0;
-        torque.y = 0;
-        torque.z = 0;
+        torque->x = 0;
+        torque->y = 0;
+        torque->z = 0;
    }
 #endif

 #if defined B2 
    dsumdrB2 *= 0.5f;
-    atom1.bornForce += 0.5f*atom2.bornRadius*dsumdrB2;
-    atom2.bornForce += 0.5f*atom1.bornRadius*dsumdrB2;
+    *bornForce1 += 0.5f*atom2.bornRadius*dsumdrB2;
+    *bornForce2 += 0.5f*atom1.bornRadius*dsumdrB2;
 #endif

 #if defined T2
@@ -566,36 +566,36 @@
    trqi3 -= atom1.quadrupoleXX*fidg12 + atom1.quadrupoleXY*fidg22 + atom1.quadrupoleXZ*fidg23
                                -atom1.quadrupoleXY*fidg11 - atom1.quadrupoleYY*fidg12 - atom1.quadrupoleYZ*fidg13;

-    torque.x += 0.5f*trqi1;
-    torque.y += 0.5f*trqi2;
-    torque.z += 0.5f*trqi3;
+    torque->x += 0.5f*trqi1;
+    torque->y += 0.5f*trqi2;
+    torque->z += 0.5f*trqi3;
 #endif

 #if defined F1

-    outputEnergy = energy;
+    *outputEnergy = energy;

    if ((xr != 0 || yr != 0 || zr != 0)) {
-        force.x = dedx;
-        force.y = dedy;
-        force.z = dedz;
+        force->x = dedx;
+        force->y = dedy;
+        force->z = dedz;
    } else {
-        force.x = force.y = force.z = 0;
+        force->x = force->y = force->z = 0;
    }

 #endif

 #if defined F2
-    outputEnergy += 0.5f*energy;
+    *outputEnergy += 0.5f*energy;

    dpdx *= 0.5f;
    dpdy *= 0.5f;
    dpdz *= 0.5f;

    if ((xr != 0 || yr != 0 || zr != 0)) {
-        force.x += dpdx;
-        force.y += dpdy;
-        force.z += dpdz;
+        force->x += dpdx;
+        force->y += dpdy;
+        force->z += dpdz;
    }
 #endif
 }