Merge branch 'master' of github.com:leeping/openmm

8167c79b · leeping · 855ece90 · 99ef4344 · 8167c79b · 8167c79b
Commit 8167c79b authored Oct 30, 2013 by leeping
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -252,6 +252,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
    ## OpenMM was previously installed there.
    INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
 ENDFOREACH(subdir)
+SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS "-msse2 -DHAVE_SSE2=1")
 # If API wrappers are being generated, and add them to the build.
 FIND_PROGRAM(GCCXML_PATH gccxml PATH

--- a/libraries/sfmt/include/sfmt/SFMT-sse2.h
+++ b/libraries/sfmt/include/sfmt/SFMT-sse2.h
@@ -49,22 +49,23 @@ PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b,
 * This function fills the internal state array with pseudorandom
 * integers.
 */
-inline static void gen_rand_all(void) {
+inline static void gen_rand_all(SFMT& sfmt) {
    int i;
    __m128i r, r1, r2, mask;
    mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
-    r1 = _mm_load_si128(&sfmt[N - 2].si);
+    SFMTData& data = *sfmt.data;
-    r2 = _mm_load_si128(&sfmt[N - 1].si);
+    r1 = _mm_load_si128(&data.sfmt[N - 2].si);
+    r2 = _mm_load_si128(&data.sfmt[N - 1].si);
    for (i = 0; i < N - POS1; i++) {
-	r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask);
+	r = mm_recursion(&data.sfmt[i].si, &data.sfmt[i + POS1].si, r1, r2, mask);
-	_mm_store_si128(&sfmt[i].si, r);
+	_mm_store_si128(&data.sfmt[i].si, r);
 	r1 = r2;
 	r2 = r;
    }
    for (; i < N; i++) {
-	r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1 - N].si, r1, r2, mask);
+	r = mm_recursion(&data.sfmt[i].si, &data.sfmt[i + POS1 - N].si, r1, r2, mask);
-	_mm_store_si128(&sfmt[i].si, r);
+	_mm_store_si128(&data.sfmt[i].si, r);
 	r1 = r2;
 	r2 = r;
    }
@@ -77,21 +78,22 @@ inline static void gen_rand_all(void) {
 * @param array an 128-bit array to be filled by pseudorandom numbers.  
 * @param size number of 128-bit pesudorandom numbers to be generated.
 */
-inline static void gen_rand_array(w128_t *array, int size) {
+inline static void gen_rand_array(w128_t *array, int size, SFMT& sfmt) {
    int i, j;
    __m128i r, r1, r2, mask;
    mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
-    r1 = _mm_load_si128(&sfmt[N - 2].si);
+    SFMTData& data = *sfmt.data;
-    r2 = _mm_load_si128(&sfmt[N - 1].si);
+    r1 = _mm_load_si128(&data.sfmt[N - 2].si);
+    r2 = _mm_load_si128(&data.sfmt[N - 1].si);
    for (i = 0; i < N - POS1; i++) {
-	r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask);
+	r = mm_recursion(&data.sfmt[i].si, &data.sfmt[i + POS1].si, r1, r2, mask);
 	_mm_store_si128(&array[i].si, r);
 	r1 = r2;
 	r2 = r;
    }
    for (; i < N; i++) {
-	r = mm_recursion(&sfmt[i].si, &array[i + POS1 - N].si, r1, r2, mask);
+	r = mm_recursion(&data.sfmt[i].si, &array[i + POS1 - N].si, r1, r2, mask);
 	_mm_store_si128(&array[i].si, r);
 	r1 = r2;
 	r2 = r;
@@ -106,13 +108,13 @@ inline static void gen_rand_array(w128_t *array, int size) {
    }
    for (j = 0; j < 2 * N - size; j++) {
 	r = _mm_load_si128(&array[j + size - N].si);
-	_mm_store_si128(&sfmt[j].si, r);
+	_mm_store_si128(&data.sfmt[j].si, r);
    }
    for (; i < size; i++) {
 	r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2,
 			 mask);
 	_mm_store_si128(&array[i].si, r);
-	_mm_store_si128(&sfmt[j++].si, r);
+	_mm_store_si128(&data.sfmt[j++].si, r);
 	r1 = r2;
 	r2 = r;
    }

--- a/libraries/sfmt/src/SFMT.cpp
+++ b/libraries/sfmt/src/SFMT.cpp
@@ -144,9 +144,9 @@ inline static void swap(w128_t *array, int size);
 #endif
 #if defined(HAVE_ALTIVEC)
-  #include "SFMT-alti.h"
+  #include "sfmt/SFMT-alti.h"
 #elif defined(HAVE_SSE2)
-  #include "SFMT-sse2.h"
+  #include "sfmt/SFMT-sse2.h"
 #endif
 /**

--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
+#ifndef OPENMM_VECTORIZE_H_
+#define OPENMM_VECTORIZE_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include <smmintrin.h>
+// This file defines classes and functions to simplify vectorizing code with SSE.
+class ivec4;
+/**
+ * A four element vector of floats, stored in a single SSE register.
+ *
+ * Arithmetic operators work element by element.  The comparison operators do
+ * not return a bool: they return an fvec4 holding the per-element result mask
+ * produced by the corresponding _mm_cmp*_ps intrinsic, which can be combined
+ * with operator& to select elements.
+ */
+class fvec4 {
+public:
+    __m128 val;
+    /** Create a vector without initializing its elements. */
+    fvec4() {}
+    /** Create a vector with all four elements set to v. */
+    fvec4(float v) : val(_mm_set1_ps(v)) {}
+    /** Create a vector whose elements 0 through 3 are v1 through v4. */
+    fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
+    /** Wrap an existing SSE register value. */
+    fvec4(__m128 v) : val(v) {}
+    /** Load four consecutive floats starting at v (unaligned load). */
+    fvec4(const float* v) : val(_mm_loadu_ps(v)) {}
+    operator __m128() const {
+        return val;
+    }
+    /**
+     * Get one element of the vector.
+     *
+     * The vector is written to a temporary array and indexed there, because
+     * _mm_extract_ps() only accepts a compile-time constant selector and so
+     * cannot be used with a runtime index.
+     *
+     * @param i    the index of the element to get (0 to 3)
+     */
+    float operator[](int i) const {
+        float result[4];
+        store(result);
+        return result[i];
+    }
+    /** Write the four elements to consecutive floats starting at v (unaligned store). */
+    void store(float* v) const {
+        _mm_storeu_ps(v, val);
+    }
+    fvec4 operator+(fvec4 other) const {
+        return _mm_add_ps(val, other);
+    }
+    fvec4 operator-(fvec4 other) const {
+        return _mm_sub_ps(val, other);
+    }
+    fvec4 operator*(fvec4 other) const {
+        return _mm_mul_ps(val, other);
+    }
+    fvec4 operator/(fvec4 other) const {
+        return _mm_div_ps(val, other);
+    }
+    void operator+=(fvec4 other) {
+        val = _mm_add_ps(val, other);
+    }
+    void operator-=(fvec4 other) {
+        val = _mm_sub_ps(val, other);
+    }
+    void operator*=(fvec4 other) {
+        val = _mm_mul_ps(val, other);
+    }
+    void operator/=(fvec4 other) {
+        val = _mm_div_ps(val, other);
+    }
+    /** Negate every element, computed as 0 - val. */
+    fvec4 operator-() const {
+        return _mm_sub_ps(_mm_set1_ps(0.0f), val);
+    }
+    /** Bitwise AND of the two registers; typically used with a comparison mask. */
+    fvec4 operator&(fvec4 other) const {
+        return _mm_and_ps(val, other);
+    }
+    fvec4 operator==(fvec4 other) const {
+        return _mm_cmpeq_ps(val, other);
+    }
+    fvec4 operator!=(fvec4 other) const {
+        return _mm_cmpneq_ps(val, other);
+    }
+    fvec4 operator>(fvec4 other) const {
+        return _mm_cmpgt_ps(val, other);
+    }
+    fvec4 operator<(fvec4 other) const {
+        return _mm_cmplt_ps(val, other);
+    }
+    fvec4 operator>=(fvec4 other) const {
+        return _mm_cmpge_ps(val, other);
+    }
+    fvec4 operator<=(fvec4 other) const {
+        return _mm_cmple_ps(val, other);
+    }
+    /** Convert to a vector of ints; defined after ivec4 is complete. */
+    operator ivec4() const;
+};
+/**
+ * A four element vector of ints, stored in a single SSE register.
+ *
+ * Arithmetic operators work element by element on 32 bit integers.
+ * operator== returns an ivec4 holding the per-element result mask produced
+ * by _mm_cmpeq_epi32 rather than a bool.
+ */
+class ivec4 {
+public:
+    __m128i val;
+    /** Create a vector without initializing its elements. */
+    ivec4() {}
+    /** Create a vector with all four elements set to v. */
+    ivec4(int v) : val(_mm_set1_epi32(v)) {}
+    /** Create a vector whose elements 0 through 3 are v1 through v4. */
+    ivec4(int v1, int v2, int v3, int v4) : val(_mm_set_epi32(v4, v3, v2, v1)) {}
+    /** Wrap an existing SSE register value. */
+    ivec4(__m128i v) : val(v) {}
+    /** Load four consecutive ints starting at v (unaligned load). */
+    ivec4(const int* v) : val(_mm_loadu_si128((const __m128i*) v)) {}
+    operator __m128i() const {
+        return val;
+    }
+    /**
+     * Get one element of the vector.
+     *
+     * The vector is written to a temporary array and indexed there, because
+     * _mm_extract_epi32() only accepts a compile-time constant selector and
+     * so cannot be used with a runtime index.
+     *
+     * @param i    the index of the element to get (0 to 3)
+     */
+    int operator[](int i) const {
+        int result[4];
+        store(result);
+        return result[i];
+    }
+    /** Write the four elements to consecutive ints starting at v (unaligned store). */
+    void store(int* v) const {
+        _mm_storeu_si128((__m128i*) v, val);
+    }
+    ivec4 operator+(ivec4 other) const {
+        return _mm_add_epi32(val, other);
+    }
+    ivec4 operator-(ivec4 other) const {
+        return _mm_sub_epi32(val, other);
+    }
+    /**
+     * Element-wise product, keeping the low 32 bits of each result.
+     * _mm_mullo_epi32 is required here: _mm_mul_epi32 multiplies only
+     * elements 0 and 2 and produces two 64 bit products.
+     */
+    ivec4 operator*(ivec4 other) const {
+        return _mm_mullo_epi32(val, other);
+    }
+    void operator+=(ivec4 other) {
+        val = _mm_add_epi32(val, other);
+    }
+    void operator-=(ivec4 other) {
+        val = _mm_sub_epi32(val, other);
+    }
+    /** In-place element-wise product; see operator* for the choice of intrinsic. */
+    void operator*=(ivec4 other) {
+        val = _mm_mullo_epi32(val, other);
+    }
+    /** Negate every element, computed as 0 - val. */
+    ivec4 operator-() const {
+        return _mm_sub_epi32(_mm_set1_epi32(0), val);
+    }
+    ivec4 operator&(ivec4 other) const {
+        return _mm_and_si128(val, other);
+    }
+    ivec4 operator==(ivec4 other) const {
+        return _mm_cmpeq_epi32(val, other);
+    }
+    /** Convert to a vector of floats; defined after the class. */
+    operator fvec4() const;
+};
+// Conversion operators.
+/** Convert each float element to an int using _mm_cvttps_epi32 (truncating conversion). */
+inline fvec4::operator ivec4() const {
+    const __m128i truncated = _mm_cvttps_epi32(val);
+    return ivec4(truncated);
+}
+/** Convert each int element to a float using _mm_cvtepi32_ps. */
+inline ivec4::operator fvec4() const {
+    const __m128 converted = _mm_cvtepi32_ps(val);
+    return fvec4(converted);
+}
+// Functions that operate on fvec4s.
+/** Apply _mm_floor_ps to the vector, rounding each element down. */
+static inline fvec4 floor(fvec4 v) {
+    const __m128 lowered = _mm_floor_ps(v.val);
+    return lowered;
+}
+/** Apply _mm_ceil_ps to the vector, rounding each element up. */
+static inline fvec4 ceil(fvec4 v) {
+    const __m128 raised = _mm_ceil_ps(v.val);
+    return raised;
+}
+/** Round each element with _mm_round_ps in _MM_FROUND_TO_NEAREST_INT mode. */
+static inline fvec4 round(fvec4 v) {
+    const __m128 nearest = _mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT);
+    return nearest;
+}
+/** Element-wise minimum of two float vectors (_mm_min_ps). */
+static inline fvec4 min(fvec4 v1, fvec4 v2) {
+    const __m128 smaller = _mm_min_ps(v1.val, v2.val);
+    return smaller;
+}
+/** Element-wise maximum of two float vectors (_mm_max_ps). */
+static inline fvec4 max(fvec4 v1, fvec4 v2) {
+    const __m128 larger = _mm_max_ps(v1.val, v2.val);
+    return larger;
+}
+/**
+ * Element-wise absolute value, computed by clearing the sign bit of each float.
+ *
+ * The mask is an ordinary local constant rather than a function-local static,
+ * so no guarded one-time initialization check is executed on each call.
+ */
+static inline fvec4 abs(fvec4 v) {
+    const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
+    return fvec4(_mm_and_ps(v.val, mask));
+}
+/** Element-wise square root (_mm_sqrt_ps). */
+static inline fvec4 sqrt(fvec4 v) {
+    const __m128 roots = _mm_sqrt_ps(v.val);
+    return roots;
+}
+/** Dot product of the first three elements of v1 and v2. */
+static inline float dot3(fvec4 v1, fvec4 v2) {
+    // Mask 0x71: multiply elements 0-2, sum them, and write the sum to element 0.
+    const __m128 sum = _mm_dp_ps(v1.val, v2.val, 0x71);
+    return _mm_cvtss_f32(sum);
+}
+/** Dot product of all four elements of v1 and v2. */
+static inline float dot4(fvec4 v1, fvec4 v2) {
+    // Mask 0xF1: multiply all four elements, sum them, and write the sum to element 0.
+    const __m128 sum = _mm_dp_ps(v1.val, v2.val, 0xF1);
+    return _mm_cvtss_f32(sum);
+}
+// Functions that operate on ivec4s.
+/** Element-wise minimum of two int vectors (_mm_min_epi32). */
+static inline ivec4 min(ivec4 v1, ivec4 v2) {
+    const __m128i smaller = _mm_min_epi32(v1.val, v2.val);
+    return smaller;
+}
+/** Element-wise maximum of two int vectors (_mm_max_epi32). */
+static inline ivec4 max(ivec4 v1, ivec4 v2) {
+    const __m128i larger = _mm_max_epi32(v1.val, v2.val);
+    return larger;
+}
+/** Element-wise absolute value of an int vector (_mm_abs_epi32). */
+static inline ivec4 abs(ivec4 v) {
+    const __m128i magnitudes = _mm_abs_epi32(v.val);
+    return magnitudes;
+}
+// Mathematical operators involving a scalar and a vector.
+/** Add a scalar, broadcast to all four elements, to a vector. */
+static inline fvec4 operator+(float v1, fvec4 v2) {
+    const fvec4 broadcast(v1);
+    return broadcast+v2;
+}
+/** Subtract a vector from a scalar that is broadcast to all four elements. */
+static inline fvec4 operator-(float v1, fvec4 v2) {
+    const fvec4 broadcast(v1);
+    return broadcast-v2;
+}
+/** Multiply a vector by a scalar that is broadcast to all four elements. */
+static inline fvec4 operator*(float v1, fvec4 v2) {
+    const fvec4 broadcast(v1);
+    return broadcast*v2;
+}
+/** Divide a scalar, broadcast to all four elements, by a vector. */
+static inline fvec4 operator/(float v1, fvec4 v2) {
+    const fvec4 broadcast(v1);
+    return broadcast/v2;
+}
+#endif /*OPENMM_VECTORIZE_H_*/
--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -12,26 +12,42 @@ namespace OpenMM {
 class OPENMM_EXPORT_CPU CpuNeighborList {
 public:
    class ThreadData;
-    class VoxelHash;
+    class Voxels;
+    static const int BlockSize;
    CpuNeighborList();
    ~CpuNeighborList();
    void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance);
-    const std::vector<std::pair<int, int> >& getNeighbors();
+    int getNumBlocks() const;
+    const std::vector<int>& getSortedAtoms() const;
+    const std::vector<int>& getBlockNeighbors(int blockIndex) const;
+    const std::vector<char>& getBlockExclusions(int blockIndex) const;
    /**
     * This routine contains the code executed by each thread.
     */
-    void runThread(int index, std::vector<std::pair<int, int> >& threadNeighbors);
+    void runThread(int index);
 private:
+    /**
+     * This is called by the worker threads to wait until the master thread instructs them to advance.
+     */
+    void threadWait();
+    /**
+     * This is called by the master thread to instruct all the worker threads to advance.
+     */
+    void advanceThreads();
    bool isDeleted;
    int numThreads, waitCount;
-    std::vector<std::pair<int, int> > neighbors;
+    std::vector<int> sortedAtoms;
+    std::vector<std::vector<int> > blockNeighbors;
+    std::vector<std::vector<char> > blockExclusions;
    std::vector<pthread_t> thread;
    std::vector<ThreadData*> threadData;
    pthread_cond_t startCondition, endCondition;
    pthread_mutex_t lock;
    // The following variables are used to make information accessible to the individual threads.
-    VoxelHash* voxelHash;
+    float minx, maxx, miny, maxy, minz, maxz;
+    std::vector<std::pair<int, int> > atomBins;
+    Voxels* voxels;
    const std::vector<std::set<int> >* exclusions;
    const float* atomLocations;
    const float* periodicBoxSize;

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -25,14 +25,17 @@
 #ifndef OPENMM_CPU_NONBONDED_FORCE_H__
 #define OPENMM_CPU_NONBONDED_FORCE_H__
+#include "CpuNeighborList.h"
 #include "ReferencePairIxn.h"
+#include "openmm/internal/vectorize.h"
 #include <pthread.h>
 #include <set>
 #include <utility>
 #include <vector>
-#include <smmintrin.h>
 // ---------------------------------------------------------------------------------------
+namespace OpenMM {
 class CpuNonbondedForce {
    public:
        class ThreadData;
@@ -63,7 +66,7 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
-      void setUseCutoff(float distance, const std::vector<std::pair<int, int> >& neighbors, float solventDielectric);
+      void setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric);
      /**---------------------------------------------------------------------------------------
@@ -127,9 +130,9 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
-      void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<OpenMM::RealVec>& atomCoordinates,
+      void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<RealVec>& atomCoordinates,
                            const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
-                            std::vector<OpenMM::RealVec>& forces, float* totalEnergy) const;
+                            std::vector<RealVec>& forces, float* totalEnergy) const;
      /**---------------------------------------------------------------------------------------
@@ -159,14 +162,14 @@ private:
        bool periodic;
        bool ewald;
        bool pme;
-        const std::vector<std::pair<int, int> >* neighborList;
+        const CpuNeighborList* neighborList;
        float periodicBoxSize[3];
        float cutoffDistance, switchingDistance;
        float krf, crf;
        float alphaEwald;
        int numRx, numRy, numRz;
        int meshDim[3];
-        std::vector<float> ewaldScaleX, ewaldScaleY, ewaldScaleDeriv;
+        std::vector<float> ewaldScaleTable;
        float ewaldDX, ewaldDXInv;
        bool isDeleted;
        int numThreads, waitCount;
@@ -195,31 +198,42 @@ private:
         --------------------------------------------------------------------------------------- */
-      void calculateOneIxn(int atom1, int atom2, float* forces, double* totalEnergy, const __m128& boxSize, const __m128& invBoxSize);
+      void calculateOneIxn(int atom1, int atom2, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
      /**---------------------------------------------------------------------------------------
-         Calculate LJ Coulomb pair ixn between two atoms
+         Calculate all the interactions for one atom block.
-         @param atom1            the index of the first atom
+         @param blockIndex       the index of the atom block
-         @param atom2            the index of the second atom
+         @param forces           force array (forces added)
+         @param totalEnergy      total energy
+         --------------------------------------------------------------------------------------- */
+      void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
+      /**---------------------------------------------------------------------------------------
+         Calculate all the interactions for one atom block.
+         @param blockIndex       the index of the atom block
         @param forces           force array (forces added)
         @param totalEnergy      total energy
         --------------------------------------------------------------------------------------- */
-      void calculateOneEwaldIxn(int atom1, int atom2, float* forces, double* totalEnergy, const __m128& boxSize, const __m128& invBoxSize);
+      void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
      /**
       * Compute the displacement and squared distance between two points, optionally using
       * periodic boundary conditions.
       */
-      void getDeltaR(const __m128& posI, const __m128& posJ, __m128& deltaR, float& r2, bool periodic, const __m128& boxSize, const __m128& invBoxSize) const;
+      void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
      /**
       * Compute a fast approximation to erfc(x).
       */
-      static float erfcApprox(float x);
+      static fvec4 erfcApprox(fvec4 x);
      /**
       * Create a lookup table for the scale factor used with Ewald and PME.
@@ -229,9 +243,11 @@ private:
      /**
       * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
       */
-      float ewaldScaleFunction(float x);
+      fvec4 ewaldScaleFunction(fvec4 x);
 };
+} // namespace OpenMM
 // ---------------------------------------------------------------------------------------
 #endif // OPENMM_CPU_NONBONDED_FORCE_H__
--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -221,20 +221,48 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
    if (nonbondedMethod != NoCutoff) {
        // Determine whether we need to recompute the neighbor list.
-        double padding = 0.1*nonbondedCutoff;
+        double padding = 0.15*nonbondedCutoff;
        bool needRecompute = false;
+        double closeCutoff2 = 0.25*padding*padding;
+        double farCutoff2 = 0.5*padding*padding;
+        int maxNumMoved = numParticles/10;
+        vector<int> moved;
        for (int i = 0; i < numParticles; i++) {
            RealVec delta = posData[i]-lastPositions[i];
-            if (delta.dot(delta) > 0.25*padding*padding) {
+            double dist2 = delta.dot(delta);
+            if (dist2 > closeCutoff2) {
+                moved.push_back(i);
+                if (dist2 > farCutoff2 || moved.size() > maxNumMoved) {
                    needRecompute = true;
                    break;
                }
            }
+        }
+        if (!needRecompute && moved.size() > 0) {
+            // Some particles have moved further than half the padding distance.  Look for pairs
+            // that are missing from the neighbor list.
+            int numMoved = moved.size();
+            double cutoff2 = nonbondedCutoff*nonbondedCutoff;
+            for (int i = 1; i < numMoved && !needRecompute; i++)
+                for (int j = 0; j < i; j++) {
+                    RealVec delta = posData[moved[i]]-posData[moved[j]];
+                    if (delta.dot(delta) < cutoff2) {
+                        // These particles should interact.  See if they are in the neighbor list.
+                        RealVec oldDelta = lastPositions[moved[i]]-lastPositions[moved[j]];
+                        if (oldDelta.dot(oldDelta) > cutoff2) {
+                            needRecompute = true;
+                            break;
+                        }
+                    }
+                }
+        }
        if (needRecompute) {
            neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding);
            lastPositions = posData;
        }
-        nonbonded.setUseCutoff(nonbondedCutoff, neighborList.getNeighbors(), rfDielectric);
+        nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
    }
    if (periodic || ewald || pme) {
        double minAllowedSize = 1.999999*nonbondedCutoff;

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -39,6 +39,7 @@
 #include "sfmt/SFMT.h"
 #include <iostream>
 #include <set>
+#include <utility>
 #include <vector>
 using namespace OpenMM;
@@ -68,11 +69,20 @@ void testNeighborList(bool periodic) {
    // Convert the neighbor list to a set for faster lookup.
    set<pair<int, int> > neighbors;
-    for (int i = 0; i < (int) neighborList.getNeighbors().size(); i++) {
+    for (int i = 0; i < (int) neighborList.getSortedAtoms().size(); i++) {
-        pair<int, int> entry = neighborList.getNeighbors()[i];
+        int blockIndex = i/CpuNeighborList::BlockSize;
+        int indexInBlock = i-blockIndex*CpuNeighborList::BlockSize;
+        char mask = 1<<indexInBlock;
+        for (int j = 0; j < (int) neighborList.getBlockExclusions(blockIndex).size(); j++) {
+            if ((neighborList.getBlockExclusions(blockIndex)[j] & mask) == 0) {
+                int atom1 = neighborList.getSortedAtoms()[i];
+                int atom2 = neighborList.getBlockNeighbors(blockIndex)[j];
+                pair<int, int> entry = make_pair(min(atom1, atom2), max(atom1, atom2));
                ASSERT(neighbors.find(entry) == neighbors.end() && neighbors.find(make_pair(entry.second, entry.first)) == neighbors.end()); // No duplicates
                neighbors.insert(entry);
            }
+        }
+    }
    // Check each particle pair and figure out whether they should be in the neighbor list.
@@ -90,7 +100,8 @@ void testNeighborList(bool periodic) {
            if (dx*dx + dy*dy + dz*dz > cutoff*cutoff)
                shouldInclude = false;
            bool isIncluded = (neighbors.find(make_pair(i, j)) != neighbors.end() || neighbors.find(make_pair(j, i)) != neighbors.end());
-            ASSERT_EQUAL(shouldInclude, isIncluded);
+            if (shouldInclude)
+                ASSERT(isIncluded);
        }
 }

--- a/plugins/amoeba/openmmapi/include/openmm/AmoebaMultipoleForce.h
+++ b/plugins/amoeba/openmmapi/include/openmm/AmoebaMultipoleForce.h
@@ -301,6 +301,14 @@ public:
     */
    void setEwaldErrorTolerance(double tol);
+    /**
+     * Get the induced dipole moments of all particles.
+     * 
+     * @param context    the Context for which to get the induced dipoles
+     * @param dipoles    the induced dipole moment of particle i is stored into the i'th element
+     */
+    void getInducedDipoles(Context& context, std::vector<Vec3>& dipoles);
    /**
     * Get the electrostatic potential.
     *

--- a/plugins/amoeba/openmmapi/include/openmm/amoebaKernels.h
+++ b/plugins/amoeba/openmmapi/include/openmm/amoebaKernels.h
@@ -348,6 +348,8 @@ public:
     */
    virtual double execute(ContextImpl& context, bool includeForces, bool includeEnergy) = 0;
+    virtual void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles) = 0;
    virtual void getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid,
                                            std::vector< double >& outputElectrostaticPotential ) = 0;

--- a/plugins/amoeba/openmmapi/include/openmm/internal/AmoebaMultipoleForceImpl.h
+++ b/plugins/amoeba/openmmapi/include/openmm/internal/AmoebaMultipoleForceImpl.h
@@ -82,6 +82,8 @@ public:
     */
    static void getCovalentDegree( const AmoebaMultipoleForce& force, std::vector<int>& covalentDegree );
+    void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
    void getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid,
                                    std::vector< double >& outputElectrostaticPotential );

--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForce.cpp
@@ -226,6 +226,10 @@ void AmoebaMultipoleForce::getCovalentMaps(int index, std::vector< std::vector<i
    }
 }
+void AmoebaMultipoleForce::getInducedDipoles(Context& context, vector<Vec3>& dipoles) {
+    // Forward the query to this force's implementation object for the given context,
+    // in the same way getElectrostaticPotential() does.
+    AmoebaMultipoleForceImpl& impl = dynamic_cast<AmoebaMultipoleForceImpl&>(getImplInContext(context));
+    impl.getInducedDipoles(getContextImpl(context), dipoles);
+}
 void AmoebaMultipoleForce::getElectrostaticPotential( const std::vector< Vec3 >& inputGrid, Context& context, std::vector< double >& outputElectrostaticPotential ){
    dynamic_cast<AmoebaMultipoleForceImpl&>(getImplInContext(context)).getElectrostaticPotential(getContextImpl(context), inputGrid, outputElectrostaticPotential);
 }

--- a/plugins/amoeba/openmmapi/src/AmoebaMultipoleForceImpl.cpp
+++ b/plugins/amoeba/openmmapi/src/AmoebaMultipoleForceImpl.cpp
@@ -183,6 +183,10 @@ void AmoebaMultipoleForceImpl::getCovalentDegree( const AmoebaMultipoleForce& fo
    return;
 }
+void AmoebaMultipoleForceImpl::getInducedDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
+    // Delegate to the platform-specific multipole kernel.
+    CalcAmoebaMultipoleForceKernel& multipoleKernel = kernel.getAs<CalcAmoebaMultipoleForceKernel>();
+    multipoleKernel.getInducedDipoles(context, dipoles);
+}
 void AmoebaMultipoleForceImpl::getElectrostaticPotential( ContextImpl& context, const std::vector< Vec3 >& inputGrid,
                                                          std::vector< double >& outputElectrostaticPotential ){
    kernel.getAs<CalcAmoebaMultipoleForceKernel>().getElectrostaticPotential(context, inputGrid, outputElectrostaticPotential);

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.cpp
@@ -1639,6 +1639,24 @@ void CudaCalcAmoebaMultipoleForceKernel::ensureMultipolesValid(ContextImpl& cont
        context.calcForcesAndEnergy(false, false, -1);
 }
+void CudaCalcAmoebaMultipoleForceKernel::getInducedDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
+    // Call ensureMultipolesValid() before reading device data, as getElectrostaticPotential() does.
+    ensureMultipolesValid(context);
+    const int particleCount = cu.getNumAtoms();
+    dipoles.resize(particleCount);
+    // NOTE(review): element i of the downloaded array is copied to dipoles[i]; this assumes
+    // the device array is stored in the caller's particle order - confirm that no atom
+    // reordering applies here.
+    // The download element type must match the precision the context is running in.
+    if (!cu.getUseDoublePrecision()) {
+        vector<float> components;
+        inducedDipole->download(components);
+        for (int atom = 0; atom < particleCount; atom++) {
+            const int base = 3*atom;
+            dipoles[atom] = Vec3(components[base], components[base+1], components[base+2]);
+        }
+    }
+    else {
+        vector<double> components;
+        inducedDipole->download(components);
+        for (int atom = 0; atom < particleCount; atom++) {
+            const int base = 3*atom;
+            dipoles[atom] = Vec3(components[base], components[base+1], components[base+2]);
+        }
+    }
+}
 void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context, const vector<Vec3>& inputGrid, vector<double>& outputElectrostaticPotential) {
    ensureMultipolesValid(context);
    int numPoints = inputGrid.size();

--- a/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
+++ b/plugins/amoeba/platforms/cuda/src/AmoebaCudaKernels.h
@@ -327,6 +327,13 @@ public:
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Get the induced dipole moments of all particles.
+     * 
+     * @param context    the Context for which to get the induced dipoles
+     * @param dipoles    the induced dipole moment of particle i is stored into the i'th element
+     */
+    void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
    /**
     * Execute the kernel to calculate the electrostatic potential
     *

--- a/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaMultipoleForce.cpp
+++ b/plugins/amoeba/platforms/cuda/tests/TestCudaAmoebaMultipoleForce.cpp
@@ -2700,6 +2700,40 @@ static void testPMEMutualPolarizationLargeWater( FILE* log ) {
 }
+// test querying particle induced dipoles
+static void testParticleInducedDipoles() {
+    const int numberOfParticles     = 8;
+    const int inputPmeGridDimension = 0;
+    const double cutoff             = 9000000.0;
+    // Build the ammonia test system with mutual polarization and no cutoff.
+    System system;
+    AmoebaMultipoleForce* amoebaMultipoleForce = new AmoebaMultipoleForce();
+    setupMultipoleAmmonia(system, amoebaMultipoleForce, AmoebaMultipoleForce::NoCutoff, AmoebaMultipoleForce::Mutual, 
+                                             cutoff, inputPmeGridDimension);
+    LangevinIntegrator integrator(0.0, 0.1, 0.01);
+    Context context(system, integrator, Platform::getPlatformByName("CUDA"));
+    // Evaluate forces and energy first, then query the induced dipoles.
+    std::vector<Vec3> forces;
+    double energy;
+    getForcesEnergyMultipoleAmmonia(context, forces, energy);
+    std::vector<Vec3> dipole;
+    amoebaMultipoleForce->getInducedDipoles(context, dipole);
+    // Compare to values calculated by TINKER.
+    std::vector<Vec3> expectedDipole(numberOfParticles);
+    expectedDipole[0] = Vec3(0.0031710288, 9.3687453e-7, -0.0006919963);
+    expectedDipole[1] = Vec3(8.0279737504e-5, -0.000279376, 4.778060103e-5);
+    expectedDipole[2] = Vec3(0.000079322, 0.0002789804, 4.8696656126e-5);
+    expectedDipole[3] = Vec3(-0.0001407394, 1.540638116e-6, -0.0007077775);
+    expectedDipole[4] = Vec3(0.0019564439, -1.0409717e-7, 0.0007332188);
+    expectedDipole[5] = Vec3(0.0008213891, -0.0007749618, -0.0003883865);
+    expectedDipole[6] = Vec3(0.0046133992, -7.2868019e-7, 0.0002500622);
+    expectedDipole[7] = Vec3(0.0008204731, 0.0007772727, -0.0003856176);
+    for (int particle = 0; particle < numberOfParticles; particle++) {
+        ASSERT_EQUAL_VEC(expectedDipole[particle], dipole[particle], 1e-4);
+    }
+}
 // test computation of system multipole moments
 static void testSystemMultipoleMoments( FILE* log ) {
@@ -2963,6 +2997,10 @@ int main(int argc, char* argv[]) {
        testMultipoleIonsAndWaterPMEMutualPolarization( log );
        testMultipoleIonsAndWaterPMEDirectPolarization( log );
+        // test querying induced dipoles
+        testParticleInducedDipoles();
        // test computation of system multipole moments
        testSystemMultipoleMoments( log );

--- a/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp
+++ b/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.cpp
@@ -683,6 +683,25 @@ double ReferenceCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bo
    return static_cast<double>(energy);
 }
+void ReferenceCalcAmoebaMultipoleForceKernel::getInducedDipoles(ContextImpl& context, vector<Vec3>& outputDipoles) {
+    const int particleCount = context.getSystem().getNumParticles();
+    outputDipoles.resize(particleCount);
+    // Create an AmoebaReferenceMultipoleForce to do the calculation.
+    AmoebaReferenceMultipoleForce* multipoleForce = setupAmoebaReferenceMultipoleForce( context );
+    vector<RealVec>& positions = extractPositions(context);
+    // Compute the induced dipoles from the current positions and the stored multipole parameters.
+    vector<RealVec> computedDipoles;
+    multipoleForce->calculateInducedDipoles(positions, charges, dipoles, quadrupoles, tholes,
+            dampingFactors, polarity, axisTypes, multipoleAtomZs, multipoleAtomXs, multipoleAtomYs, multipoleAtomCovalentInfo, computedDipoles);
+    // Copy the results into the caller's vector.
+    for (int particle = 0; particle < particleCount; particle++) {
+        outputDipoles[particle] = computedDipoles[particle];
+    }
+    // The force object was created by this call, so it is released here.
+    delete multipoleForce;
+}
 void ReferenceCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context, const std::vector< Vec3 >& inputGrid,
                                                                        std::vector< double >& outputElectrostaticPotential ){
@@ -704,8 +723,6 @@ void ReferenceCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextI
    }
    delete amoebaReferenceMultipoleForce;
-    return;
 }
 void ReferenceCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context, std::vector< double >& outputMultipoleMoments){
@@ -726,8 +743,6 @@ void ReferenceCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextI
                                                                          multipoleAtomCovalentInfo, outputMultipoleMoments );
    delete amoebaReferenceMultipoleForce;
-    return;
 }
 void ReferenceCalcAmoebaMultipoleForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaMultipoleForce& force) {

--- a/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.h
+++ b/plugins/amoeba/platforms/reference/src/AmoebaReferenceKernels.h
@@ -366,6 +366,13 @@ public:
     * @return the potential energy due to the force
     */
    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Get the induced dipole moments of all particles.
+     * 
+     * @param context    the Context for which to get the induced dipoles
+     * @param dipoles    the induced dipole moment of particle i is stored into the i'th element
+     */
+    void getInducedDipoles(ContextImpl& context, std::vector<Vec3>& dipoles);
    /** 
     * Calculate the electrostatic potential given vector of grid coordinates.
     *