Merge branch 'master' of github.com:SimTk/openmm

7bfb75c7 · Lee-Ping Wang · a85d163b · 49cb91c5 · 7bfb75c7 · 7bfb75c7
Commit 7bfb75c7 authored Jul 20, 2013 by Lee-Ping Wang
20 changed files
--- a/cmake_modules/FindFFTW.cmake
+++ b/cmake_modules/FindFFTW.cmake
@@ -20,6 +20,5 @@ find_library (FFTW_THREADS_LIBRARY NAMES fftw3f_threads)
 # all listed variables are TRUE
 include (FindPackageHandleStandardArgs)
 find_package_handle_standard_args (FFTW DEFAULT_MSG FFTW_LIBRARY FFTW_INCLUDES)
-find_package_handle_standard_args (FFTW_THREADS DEFAULT_MSG FFTW_THREADS_LIBRARY FFTW_INCLUDES)

 mark_as_advanced (FFTW_LIBRARY FFTW_THREADS_LIBRARY FFTW_INCLUDES)
--- a/examples/testInstallation.py
+++ b/examples/testInstallation.py
@@ -30,6 +30,7 @@ for i in range(numPlatforms):
        simulation = Simulation(pdb.topology, system, integrator, platform)
        simulation.context.setPositions(pdb.positions)
        forces[i] = simulation.context.getState(getForces=True).getForces()
+        del simulation
        print "- Successfully computed forces"
    except:
        print "- Error computing forces with", platform.getName(), "platform"

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -5222,6 +5222,10 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
    cu.setTime(cu.getTime()+integrator.getStepSize());
    cu.setStepCount(cu.getStepCount()+1);
    cu.reorderAtoms();
+    if (cu.getAtomsWereReordered()) {
+        forcesAreValid = false;
+        validSavedForces.clear();
+    }
 }

 double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid) {

--- a/platforms/cuda/src/kernels/langevin.cu
+++ b/platforms/cuda/src/kernels/langevin.cu
@@ -32,7 +32,12 @@ extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, con
 */

 extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) {
+#if __CUDA_ARCH__ >= 130
    double invStepSize = 1.0/dt[0].y;
+#else
+    float invStepSize = 1.0f/dt[0].y;
+    float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
+#endif
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    while (index < NUM_ATOMS) {
        mixed4 vel = velm[index];
@@ -48,9 +53,15 @@ extern "C" __global__ void integrateLangevinPart2(real4* __restrict__ posq, real
            pos.x += delta.x;
            pos.y += delta.y;
            pos.z += delta.z;
+#if __CUDA_ARCH__ >= 130
            vel.x = (mixed) (invStepSize*delta.x);
            vel.y = (mixed) (invStepSize*delta.y);
            vel.z = (mixed) (invStepSize*delta.z);
+#else
+            vel.x = invStepSize*delta.x + correction*delta.x;
+            vel.y = invStepSize*delta.y + correction*delta.x;
+            vel.z = invStepSize*delta.z + correction*delta.x;
+#endif
 #ifdef USE_MIXED_PRECISION
            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);

--- a/platforms/cuda/src/kernels/verlet.cu
+++ b/platforms/cuda/src/kernels/verlet.cu
@@ -37,7 +37,12 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
 extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4* __restrict__ posq,
        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
    mixed2 stepSize = dt[0];
+#if __CUDA_ARCH__ >= 130
    double oneOverDt = 1.0/stepSize.y;
+#else
+    float oneOverDt = 1.0f/stepSize.y;
+    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
+#endif
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    if (index == 0)
        dt[0].x = stepSize.y;
@@ -55,7 +60,11 @@ extern "C" __global__ void integrateVerletPart2(mixed2* __restrict__ dt, real4*
            pos.x += delta.x;
            pos.y += delta.y;
            pos.z += delta.z;
+#if __CUDA_ARCH__ >= 130
            velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w);
+#else
+            velocity = make_mixed4((mixed) (delta.x*oneOverDt+delta.x*correction), (mixed) (delta.y*oneOverDt+delta.y*correction), (mixed) (delta.z*oneOverDt+delta.z*correction), velocity.w);
+#endif
 #ifdef USE_MIXED_PRECISION
            posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
            posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -97,7 +97,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
            // Try to figure out which device is the fastest.

            int bestSpeed = -1;
-            bool bestSupportsDouble = false;
            for (int i = 0; i < (int) devices.size(); i++) {
                if (platformVendor == "Apple" && devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD")
                    continue; // Don't use AMD GPUs on OS X due to serious bugs.
@@ -136,11 +135,9 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
                    }
                }
                int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
-                bool supportsDouble = (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
-                if (maxSize >= minThreadBlockSize && speed > bestSpeed && (supportsDouble || !bestSupportsDouble)) {
+                if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
                    deviceIndex = i;
                    bestSpeed = speed;
-                    bestSupportsDouble = supportsDouble;
                }
            }
        }
@@ -277,6 +274,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
    clearFiveBuffersKernel = cl::Kernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = cl::Kernel(utilities, "clearSixBuffers");
    reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer");
+    if (supports64BitGlobalAtomics)
        reduceForcesKernel = cl::Kernel(utilities, "reduceForces");

    // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -5444,6 +5444,10 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
    cl.setTime(cl.getTime()+integrator.getStepSize());
    cl.setStepCount(cl.getStepCount()+1);
    cl.reorderAtoms();
+    if (cl.getAtomsWereReordered()) {
+        forcesAreValid = false;
+        validSavedForces.clear();
+    }
    
    // Reduce UI lag.
    

--- a/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLNonbondedUtilities.cpp
@@ -81,7 +81,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
    }
    else {
        numForceThreadBlocks = context.getNumThreadBlocks();
-        forceThreadBlockSize = OpenCLContext::ThreadBlockSize;
+        forceThreadBlockSize = (context.getSIMDWidth() >= 32 ? OpenCLContext::ThreadBlockSize : 32);
        if (context.getSupports64BitGlobalAtomics()) {
            // Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
            numForceBuffers = 1;

--- a/platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+++ b/platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
@@ -70,7 +70,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
 * to global memory.
 */
 void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms, int* numAtoms, int numValid, __global unsigned int* interactionCount,
-            __global ushort2* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
+            __global int* interactingTiles, __global unsigned int* interactingAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
            __global real4* posq, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
    real4 posBuffer[TILE_SIZE];
    const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
@@ -128,7 +128,7 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
                int baseIndex = atom_add(interactionCount, tilesToStore);
                if (baseIndex+tilesToStore <= maxTiles) {
                    for (int i = 0; i < tilesToStore; i++) {
-                        interactingTiles[baseIndex+i] = (ushort2) (x, singlePeriodicCopy);
+                        interactingTiles[baseIndex+i] = x;
                        for (int j = 0; j < TILE_SIZE; j++)
                            interactingAtoms[(baseIndex+i)*TILE_SIZE+j] = atoms[i*TILE_SIZE+j];
                    }
@@ -145,7 +145,7 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
        int baseIndex = atom_add(interactionCount, tilesToStore);
        if (baseIndex+tilesToStore <= maxTiles) {
            for (int i = 0; i < tilesToStore; i++) {
-                interactingTiles[baseIndex+i] = (ushort2) (x, singlePeriodicCopy);
+                interactingTiles[baseIndex+i] = x;
                for (int j = 0; j < TILE_SIZE; j++) {
                    int index = i*TILE_SIZE+j;
                    interactingAtoms[(baseIndex+i)*TILE_SIZE+j] = (index < *numAtoms ? atoms[index] : NUM_ATOMS);
@@ -159,9 +159,8 @@ void storeInteractionData(unsigned short x, unsigned short* buffer, int* atoms,
 * Compare the bounding boxes for each pair of blocks.  If they are sufficiently far apart,
 * mark them as non-interacting.
 */
-__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
-        __global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
-        __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
+__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, __global unsigned int* restrict interactionCount,
+        __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex,
        unsigned int numBlocks, __global real2* restrict sortedBlocks, __global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
        __global const int* restrict rebuildNeighborList) {
@@ -180,8 +179,8 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
        numAtoms = 0;
        real2 sortedKey = sortedBlocks[i];
        unsigned short x = (unsigned short) sortedKey.y;
-        real4 blockCenterX = blockCenter[x];
-        real4 blockSizeX = blockBoundingBox[x];
+        real4 blockCenterX = sortedBlockCenter[i];
+        real4 blockSizeX = sortedBlockBoundingBox[i];

        // Load exclusion data for block x.
        

--- a/platforms/opencl/src/kernels/gbsaObc.cl
+++ b/platforms/opencl/src/kernels/gbsaObc.cl
@@ -424,6 +424,7 @@ __kernel void computeGBSAForce1(
            localData[localAtomIndex].z = posq1.z;
            localData[localAtomIndex].q = posq1.w;
            localData[get_local_id(0)].bornRadius = bornRadius1;
+            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
                    real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
@@ -454,8 +455,8 @@ __kernel void computeGBSAForce1(
 #ifdef USE_CUTOFF
                    }
 #endif
-                SYNC_WARPS;
                }
+                SYNC_WARPS;
            }
        }
        else {
@@ -472,6 +473,7 @@ __kernel void computeGBSAForce1(
            localData[get_local_id(0)].fy = 0.0f;
            localData[get_local_id(0)].fz = 0.0f;
            localData[get_local_id(0)].fw = 0.0f;
+            SYNC_WARPS;
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
@@ -627,6 +629,7 @@ __kernel void computeGBSAForce1(
                localData[get_local_id(0)].fz = 0.0f;
                localData[get_local_id(0)].fw = 0.0f;
            }
+            SYNC_WARPS;
 #ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic

--- a/platforms/opencl/src/kernels/langevin.cl
+++ b/platforms/opencl/src/kernels/langevin.cl
@@ -36,6 +36,7 @@ __kernel void integrateLangevinPart2(__global real4* restrict posq, __global rea
    double invStepSize = 1.0/dt[0].y;
 #else
    float invStepSize = 1.0f/dt[0].y;
+    float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
 #endif
    int index = get_global_id(0);
    while (index < NUM_ATOMS) {
@@ -53,7 +54,7 @@ __kernel void integrateLangevinPart2(__global real4* restrict posq, __global rea
 #ifdef SUPPORTS_DOUBLE_PRECISION
            vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
 #else
-            vel.xyz = invStepSize*delta.xyz;
+            vel.xyz = invStepSize*delta.xyz + correction*delta.xyz;
 #endif
 #ifdef USE_MIXED_PRECISION
            posq[index] = convert_real4(pos);

--- a/platforms/opencl/src/kernels/utilities.cl
+++ b/platforms/opencl/src/kernels/utilities.cl
@@ -81,6 +81,7 @@ __kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize,
    }
 }

+#ifdef SUPPORTS_64_BIT_ATOMICS
 /**
 * Sum the various buffers containing forces.
 */
@@ -94,6 +95,7 @@ __kernel void reduceForces(__global const long* restrict longBuffer, __global re
        buffer[index] = sum;
    }
 }
+#endif

 /**
 * This is called to determine the accuracy of various native functions.

--- a/platforms/opencl/src/kernels/verlet.cl
+++ b/platforms/opencl/src/kernels/verlet.cl
@@ -38,6 +38,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, _
    double oneOverDt = 1.0/stepSize.y;
 #else
    float oneOverDt = 1.0f/stepSize.y;
+    float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
 #endif
    if (get_global_id(0) == 0)
        dt[0].x = stepSize.y;
@@ -58,7 +59,7 @@ __kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, _
 #ifdef SUPPORTS_DOUBLE_PRECISION
            velocity.xyz = convert_mixed4(convert_double4(delta)*oneOverDt).xyz;
 #else
-            velocity.xyz = delta.xyz*oneOverDt;
+            velocity.xyz = delta.xyz*oneOverDt + delta.xyz*correction;
 #endif
 #ifdef USE_MIXED_PRECISION
            posq[index] = convert_real4(pos);

--- a/platforms/opencl/tests/TestOpenCLCustomNonbondedForce.cpp
+++ b/platforms/opencl/tests/TestOpenCLCustomNonbondedForce.cpp
@@ -455,7 +455,7 @@ void testSwitchingFunction() {
        positions[1] = Vec3(r+delta, 0, 0);
        context.setPositions(positions);
        double e2 = context.getState(State::Energy).getPotentialEnergy();
-        ASSERT_EQUAL_TOL((e2-e1)/(2*delta), state.getForces()[0][0], 1e-3);
+        ASSERT_EQUAL_TOL((e2-e1)/(2*delta), state.getForces()[0][0], 2e-3);
    }
 }


--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernelFactory.cpp
@@ -33,7 +33,7 @@

 using namespace OpenMM;

-extern "C" void registerPlatforms() {
+extern "C" OPENMM_EXPORT void registerPlatforms() {
 }

 extern "C" OPENMM_EXPORT void registerKernelFactories() {

--- a/plugins/cpupme/CMakeLists.txt
+++ b/plugins/cpupme/CMakeLists.txt
@@ -85,7 +85,10 @@ IF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
 ELSE (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
    SET(MAIN_OPENMM_LIB ${OPENMM_LIBRARY_NAME})
 ENDIF (UNIX AND CMAKE_BUILD_TYPE MATCHES Debug)
-TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB} ${FFTW_LIBRARY} ${FFTW_THREADS_LIBRARY})
+TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${MAIN_OPENMM_LIB} ${PTHREADS_LIB} ${FFTW_LIBRARY})
+IF (FFTW_THREADS_LIBRARY)
+    TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${FFTW_THREADS_LIBRARY})
+ENDIF (FFTW_THREADS_LIBRARY)
 SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES COMPILE_FLAGS "-DOPENMM_PME_BUILDING_SHARED_LIBRARY")

 INSTALL_TARGETS(/lib/plugins RUNTIME_DIRECTORY /lib/plugins ${SHARED_TARGET})

--- a/plugins/cpupme/src/CpuPmeKernelFactory.cpp
+++ b/plugins/cpupme/src/CpuPmeKernelFactory.cpp
@@ -32,10 +32,10 @@

 using namespace OpenMM;

-extern "C" void registerPlatforms() {
+extern "C" OPENMM_EXPORT_PME void registerPlatforms() {
 }

-extern "C" void registerKernelFactories() {
+extern "C" OPENMM_EXPORT_PME void registerKernelFactories() {
    if (CpuCalcPmeReciprocalForceKernel::isProcessorSupported()) {
        CpuPmeKernelFactory* factory = new CpuPmeKernelFactory();
        for (int i = 0; i < Platform::getNumPlatforms(); i++)

--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
@@ -119,8 +119,9 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
    for (int i = start; i < end; i++) {
        // Find the position relative to the nearest grid point.
        
-        __m128 pos = _mm_load_ps(&posq[4*i]);
-        __m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, _mm_floor_ps(_mm_mul_ps(pos, invBoxSize))));
+        __m128 pos = _mm_loadu_ps(&posq[4*i]);
+        __m128 posFloor = _mm_floor_ps(_mm_mul_ps(pos, invBoxSize));
+        __m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, posFloor));
        __m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize);
        __m128i ti = _mm_cvttps_epi32(t);
        __m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti));
@@ -309,8 +310,9 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
    for (int i = start; i < end; i++) {
        // Find the position relative to the nearest grid point.
        
-        __m128 pos = _mm_load_ps(&posq[4*i]);
-        __m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, _mm_floor_ps(_mm_mul_ps(pos, invBoxSize))));
+        __m128 pos = _mm_loadu_ps(&posq[4*i]);
+        __m128 posFloor = _mm_floor_ps(_mm_mul_ps(pos, invBoxSize));
+        __m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, posFloor));
        __m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize);
        __m128i ti = _mm_cvttps_epi32(t);
        __m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti));
@@ -638,11 +640,11 @@ int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum) {
        // Attempt to factor the current value.

        int unfactored = minimum;
-        for (int factor = 2; factor < 12; factor++) {
+        for (int factor = 2; factor < 8; factor++) {
            while (unfactored > 1 && unfactored%factor == 0)
                unfactored /= factor;
        }
-        if (unfactored == 1)
+        if (unfactored == 1 || unfactored == 11 || unfactored == 13)
            return minimum;
        minimum++;
    }

--- a/plugins/cpupme/tests/TestCpuPme.cpp
+++ b/plugins/cpupme/tests/TestCpuPme.cpp
@@ -124,6 +124,10 @@ void testPME() {

 int main(int argc, char* argv[]) {
    try {
+        if (!CpuCalcPmeReciprocalForceKernel::isProcessorSupported()) {
+            cout << "CPU is not supported.  Exiting." << endl;
+            return 0;
+        }
        testPME();
    }
    catch(const exception& e) {

--- a/plugins/drude/platforms/cuda/src/CudaDrudeKernelFactory.cpp
+++ b/plugins/drude/platforms/cuda/src/CudaDrudeKernelFactory.cpp
@@ -34,10 +34,10 @@

 using namespace OpenMM;

-extern "C" void registerPlatforms() {
+extern "C" OPENMM_EXPORT void registerPlatforms() {
 }

-extern "C" void registerKernelFactories() {
+extern "C" OPENMM_EXPORT void registerKernelFactories() {
    try {
        Platform& platform = Platform::getPlatformByName("CUDA");
        CudaDrudeKernelFactory* factory = new CudaDrudeKernelFactory();