Merge branch 'master' of https://github.com/SimTk/openmm

59836b77 · peastman · 99b62b77 · f769300f · 59836b77 · 59836b77
Commit 59836b77 authored Feb 07, 2018 by peastman
7 changed files
--- a/docs-source/usersguide/theory.rst
+++ b/docs-source/usersguide/theory.rst
@@ -762,6 +762,26 @@ where :math:`m_i` and :math:`\mathbf{v}_i` are the mass and velocity of particle
 \ *i*\ .  It then subtracts :math:`\mathbf{v}_\text{CM}` from the velocity of every
 particle.

+RMSDForce
+*********
+
+RMSDForce computes the root-mean-squared deviation (RMSD) between the current
+particle positions :math:`\mathbf{x}_i` and a set of reference positions
+:math:`\mathbf{x}_i^\text{ref}`:
+
+.. math::
+   \text{RMSD} = \sqrt{\frac{\sum_{i} \| \mathbf{x}_i - \mathbf{x}_i^\text{ref} \|^2}{N}}
+
+Before computing this, the reference positions are first translated and rotated
+so as to minimize the RMSD.  The computed value is therefore :math:`\min(\text{RMSD})`,
+where the minimum is taken over all possible translations and rotations.
+
+This force is normally used with a CustomCVForce (see Section :ref:`customcvforce`).
+One rarely wants a force whose energy exactly equals the RMSD, but there are many
+situations where it is useful to have a restraining or biasing force that depends
+on the RMSD in some way.
+
+
 Custom Forces
 #############

@@ -1158,6 +1178,8 @@ specified in three ways:
 * Per-donor parameters are defined by specifying a value for each donor group.
 * Per-acceptor parameters are defined by specifying a value for each acceptor group.

+.. _customcvforce:
+
 CustomCVForce
 *************


--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -6887,7 +6887,7 @@ double CudaCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
    // Execute the first kernel.

    int numParticles = particles->getSize();
-    int blockSize = 128;
+    int blockSize = 256;
    void* args1[] = {&numParticles, &cu.getPosq().getDevicePointer(), &referencePos->getDevicePointer(),
            &particles->getDevicePointer(), &buffer->getDevicePointer()};
    cu.executeKernel(kernel1, args1, blockSize, blockSize, blockSize*sizeof(REAL));

--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -35,6 +35,14 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
        int atom = pmeAtomGridIndex[i].x;
        real4 pos = posq[atom];
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
+        const real charge = pos.w;
+#endif
+        if (charge == 0)
+            continue;
        APPLY_PERIODIC_TO_POS(pos)
        real3 t = make_real3(pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                             pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
@@ -67,12 +75,6 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
        
        // Spread the charge from this atom onto each grid point.
        
-#ifdef USE_LJPME
-        const float2 sigEps = sigmaEpsilon[atom];
-        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
-#else
-        const real charge = pos.w;
-#endif
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndex.x+ix;
            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);

--- a/platforms/cuda/src/kernels/rmsd.cu
+++ b/platforms/cuda/src/kernels/rmsd.cu
@@ -4,11 +4,16 @@
 /**
 * Sum a value over all threads.
 */
-__device__ real reduceValue(real value, real* temp) {
+__device__ real reduceValue(real value, volatile real* temp) {
    const int thread = threadIdx.x;
    temp[thread] = value;
    __syncthreads();
-    for (uint step = 1; step < blockDim.x; step *= 2) {
+    for (uint step = 1; step < 32; step *= 2) {
+        if (thread+step < blockDim.x && thread%(2*step) == 0)
+            temp[thread] = temp[thread] + temp[thread+step];
+        SYNC_WARPS
+    }
+    for (uint step = 32; step < blockDim.x; step *= 2) {
        if (thread+step < blockDim.x && thread%(2*step) == 0)
            temp[thread] = temp[thread] + temp[thread+step];
        __syncthreads();
@@ -21,7 +26,7 @@ __device__ real reduceValue(real value, real* temp) {
 */
 extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __restrict__ posq, const real4* __restrict__ referencePos,
         const int* __restrict__ particles, real* buffer) {
-    extern __shared__ real temp[];
+    extern __shared__ volatile real temp[];

    // Compute the center of the particle positions.
    

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -7167,7 +7167,7 @@ double OpenCLCalcRMSDForceKernel::executeImpl(ContextImpl& context) {
    // Execute the first kernel.

    int numParticles = particles->getSize();
-    int blockSize = 128;
+    int blockSize = min(256, (int) kernel1.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
    kernel1.setArg<cl_int>(0, numParticles);
    kernel1.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
    kernel1.setArg<cl::Buffer>(2, referencePos->getDeviceBuffer());

--- a/platforms/opencl/src/kernels/pme.cl
+++ b/platforms/opencl/src/kernels/pme.cl
@@ -110,6 +110,14 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
    for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
        int atom = pmeAtomGridIndex[i].x;
        real4 pos = posq[atom];
+#ifdef USE_LJPME
+        const float2 sigEps = sigmaEpsilon[atom];
+        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
+#else
+        const real charge = pos.w;
+#endif
+        if (charge == 0)
+            continue;
        APPLY_PERIODIC_TO_POS(pos)
        real3 t = (real3) (pos.x*recipBoxVecX.x+pos.y*recipBoxVecY.x+pos.z*recipBoxVecZ.x,
                           pos.y*recipBoxVecY.y+pos.z*recipBoxVecZ.y,
@@ -142,12 +150,6 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con

        // Spread the charge from this atom onto each grid point.

-#ifdef USE_LJPME
-        const float2 sigEps = sigmaEpsilon[atom];
-        const real charge = 8*sigEps.x*sigEps.x*sigEps.x*sigEps.y;
-#else
-        const real charge = pos.w;
-#endif
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
@@ -224,6 +226,8 @@ __kernel void gridSpreadCharge(__global const real4* restrict posq, __global con
 #else
        const real charge = pos.w;
 #endif
+        if (charge == 0)
+            continue;
        bool hasComputedThetas = false;
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;

--- a/platforms/opencl/src/kernels/rmsd.cl
+++ b/platforms/opencl/src/kernels/rmsd.cl
@@ -4,11 +4,16 @@
 /**
 * Sum a value over all threads.
 */
-real reduceValue(real value, __local real* temp) {
+real reduceValue(real value, __local volatile real* temp) {
    const int thread = get_local_id(0);
    temp[thread] = value;
    barrier(CLK_LOCAL_MEM_FENCE);
-    for (uint step = 1; step < get_local_size(0); step *= 2) {
+    for (uint step = 1; step < 32; step *= 2) {
+        if (thread+step < get_local_size(0) && thread%(2*step) == 0)
+            temp[thread] = temp[thread] + temp[thread+step];
+        SYNC_WARPS
+    }
+    for (uint step = 32; step < get_local_size(0); step *= 2) {
        if (thread+step < get_local_size(0) && thread%(2*step) == 0)
            temp[thread] = temp[thread] + temp[thread+step];
        barrier(CLK_LOCAL_MEM_FENCE);
@@ -20,7 +25,7 @@ real reduceValue(real value, __local real* temp) {
 * Perform the first step of computing the RMSD.  This is executed as a single work group.
 */
 __kernel void computeRMSDPart1(int numParticles, __global const real4* restrict posq, __global const real4* restrict referencePos,
-        __global const int* restrict particles, __global real* buffer, __local real* restrict temp) {
+        __global const int* restrict particles, __global real* buffer, __local volatile real* restrict temp) {
    // Compute the center of the particle positions.
    
    real3 center = (real3) 0;