Merge remote-tracking branch 'upstream/master' into fix-mts

669e314e · John Chodera (MSKCC) · 283b1acd · 388bf972 · 669e314e · 669e314e
Commit 669e314e authored Oct 15, 2016 by John Chodera (MSKCC)
14 changed files
--- a/.travis.yml
+++ b/.travis.yml
@@ -109,7 +109,7 @@ before_install:
      sudo easy_install pytest;
    fi
  - if [[ "$OPENCL" == "true" ]]; then
-      wget https://jenkins.choderalab.org/userContent/AMD-APP-SDKInstaller-v3.0.130.135-GA-linux64.tar.bz2;
+      wget http://s3.amazonaws.com/omnia-ci/AMD-APP-SDKInstaller-v3.0.130.135-GA-linux64.tar.bz2;
      tar -xjf AMD-APP-SDK*.tar.bz2;
      AMDAPPSDK=${HOME}/AMDAPPSDK;
      export OPENCL_VENDOR_PATH=${AMDAPPSDK}/etc/OpenCL/vendors;

--- a/README.md
+++ b/README.md
+[![Build Status](https://travis-ci.org/pandegroup/openmm.svg?branch=master)](https://travis-ci.org/pandegroup/openmm)
+[![Anaconda Cloud Badge](https://anaconda.org/omnia/openmm/badges/downloads.svg)](https://anaconda.org/omnia/openmm)
+
 ## OpenMM: A High Performance Molecular Dynamics Library

 Introduction
@@ -10,15 +13,3 @@ Getting Help
 ------------

 Need Help? Check out the [documentation](http://docs.openmm.org/) and [discussion forums](https://simtk.org/forums/viewforum.php?f=161).
-
-[C++ API Reference](http://docs.openmm.org/6.3.0/api-c++/namespaceOpenMM.html)
-
-[Python API Reference](http://docs.openmm.org/6.3.0/api-python/annotated.html)
-
-Badges
------
-* Travis CI `linux` and `osx` integration tests:
-  * GitHub master [![Build Status](https://travis-ci.org/pandegroup/openmm.svg?branch=master)](https://travis-ci.org/pandegroup/openmm)
-  * `openmm-dev` recipe [![Build Status](https://travis-ci.org/omnia-md/conda-dev-recipes.svg?branch=master)](https://travis-ci.org/omnia-md/conda-dev-recipes)
-* Anaconda Cloud `openmm` conda release: [![Binstar `openmm` conda release](https://anaconda.org/omnia/openmm/badges/version.svg)](https://anaconda.org/omnia/openmm)
-* Anaconda Cloud `openmm-dev` conda package: [![Binstar `openmm-dev` conda package](https://anaconda.org/omnia/openmm-dev/badges/version.svg)](https://anaconda.org/omnia/openmm-dev)
--- a/platforms/cuda/include/CudaNonbondedUtilities.h
+++ b/platforms/cuda/include/CudaNonbondedUtilities.h
@@ -71,15 +71,16 @@ public:
    /**
     * Add a nonbonded interaction to be evaluated by the default interaction kernel.
     *
-     * @param usesCutoff     specifies whether a cutoff should be applied to this interaction
-     * @param usesPeriodic   specifies whether periodic boundary conditions should be applied to this interaction
-     * @param usesExclusions specifies whether this interaction uses exclusions.  If this is true, it must have identical exclusions to every other interaction.
-     * @param cutoffDistance the cutoff distance for this interaction (ignored if usesCutoff is false)
-     * @param exclusionList  for each atom, specifies the list of other atoms whose interactions should be excluded
-     * @param kernel         the code to evaluate the interaction
-     * @param forceGroup     the force group in which the interaction should be calculated
+     * @param usesCutoff       specifies whether a cutoff should be applied to this interaction
+     * @param usesPeriodic     specifies whether periodic boundary conditions should be applied to this interaction
+     * @param usesExclusions   specifies whether this interaction uses exclusions.  If this is true, it must have identical exclusions to every other interaction.
+     * @param cutoffDistance   the cutoff distance for this interaction (ignored if usesCutoff is false)
+     * @param exclusionList    for each atom, specifies the list of other atoms whose interactions should be excluded
+     * @param kernel           the code to evaluate the interaction
+     * @param forceGroup       the force group in which the interaction should be calculated
+     * @param supportsPairList specifies whether this interaction can work with a neighbor list that uses a separate pair list
     */
-    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup);
+    void addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const std::vector<std::vector<int> >& exclusionList, const std::string& kernel, int forceGroup, bool supportsPairList=false);
    /**
     * Add a per-atom parameter that the default interaction kernel may depend on.
     */
@@ -189,6 +190,12 @@ public:
    CudaArray& getInteractingAtoms() {
        return *interactingAtoms;
    }
+    /**
+     * Get the array containing single pairs in the neighbor list.
+     */
+    CudaArray& getSinglePairs() {
+        return *singlePairs;
+    }
    /**
     * Get the array containing exclusion flags.
     */
@@ -270,6 +277,8 @@ private:
    CudaArray* interactingTiles;
    CudaArray* interactingAtoms;
    CudaArray* interactionCount;
+    CudaArray* singlePairs;
+    CudaArray* singlePairCount;
    CudaArray* blockCenter;
    CudaArray* blockBoundingBox;
    CudaArray* sortedBlocks;
@@ -288,8 +297,8 @@ private:
    std::map<int, double> groupCutoff;
    std::map<int, std::string> groupKernelSource;
    double lastCutoff;
-    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList;
-    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
+    bool useCutoff, usePeriodic, anyExclusions, usePadding, forceRebuildNeighborList, canUsePairList;
+    int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, maxSinglePairs, maxExclusions, numForceThreadBlocks, forceThreadBlockSize, numAtoms, groupFlags;
 };

 /**

--- a/platforms/cuda/include/CudaParallelKernels.h
+++ b/platforms/cuda/include/CudaParallelKernels.h
@@ -83,7 +83,7 @@ private:
    std::vector<Kernel> kernels;
    std::vector<long long> completionTimes;
    std::vector<double> contextNonbondedFractions;
-    int* tileCounts;
+    int2* interactionCounts;
    CudaArray* contextForces;
    void* pinnedPositionBuffer;
    long long* pinnedForceBuffer;

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -130,8 +130,10 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
 #endif
    struct stat info;
    isNvccAvailable = (res == 0 && stat(tempDir.c_str(), &info) == 0);
+    int cudaDriverVersion;
+    cuDriverGetVersion(&cudaDriverVersion);
    static bool hasShownNvccWarning = false;
-    if (hasCompilerKernel && !isNvccAvailable && !hasShownNvccWarning) {
+    if (hasCompilerKernel && !isNvccAvailable && !hasShownNvccWarning && cudaDriverVersion < 8000) {
        hasShownNvccWarning = true;
        printf("Could not find nvcc.  Using runtime compiler, which may produce slower performance.  ");
 #ifdef WIN32
@@ -208,14 +210,14 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
    int numThreadBlocksPerComputeUnit = (major >= 6 ? 4 : 6);
-#if __CUDA_API_VERSION < 7000
+    if (cudaDriverVersion < 7000) {
        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
        // its compute capability as 5.2, but the compiler doesn't support
        // anything beyond 5.0.
        if (major == 5)
            minor = 0;
-#endif
-#if __CUDA_API_VERSION < 8000
+    }
+    if (cudaDriverVersion < 8000) {
        // This is a workaround to support Pascal with CUDA 7.5.  It reports
        // its compute capability as 6.x, but the compiler doesn't support
        // anything beyond 5.3.
@@ -223,7 +225,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
            major = 5;
            minor = 3;
        }
-#endif
+    }
    gpuArchitecture = intToString(major)+intToString(minor);
    computeCapability = major+0.1*minor;


--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -1920,7 +1920,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
    // Add the interaction to the default nonbonded kernel.
   
    string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
-    cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
+    cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
    if (hasLJ)
        cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon->getDevicePointer()));

@@ -2314,7 +2314,7 @@ void CudaCalcCustomNonbondedForceKernel::initialize(const System& system, const
    if (force.getNumInteractionGroups() > 0)
        initInteractionGroups(force, source, tableTypes);
    else {
-        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup());
+        cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
        for (int i = 0; i < (int) params->getBuffers().size(); i++) {
            CudaNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
            cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"params"+cu.intToString(i+1), buffer.getComponentType(), buffer.getNumComponents(), buffer.getSize(), buffer.getMemory()));

--- a/platforms/cuda/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda/src/CudaNonbondedUtilities.cpp
@@ -64,15 +64,16 @@ private:

 CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
        exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
-        interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
-        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
+        interactionCount(NULL), singlePairs(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
+        oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0),
+        canUsePairList(true) {
    // Decide how many thread blocks to use.

    string errorMessage = "Error initializing nonbonded utilities";
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
    CHECK_RESULT(cuEventCreate(&downloadCountEvent, 0));
-    CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, sizeof(int), CU_MEMHOSTALLOC_PORTABLE));
+    CHECK_RESULT(cuMemHostAlloc((void**) &pinnedCountBuffer, 2*sizeof(int), CU_MEMHOSTALLOC_PORTABLE));
    numForceThreadBlocks = 4*multiprocessors;
    forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
 }
@@ -92,6 +93,8 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
        delete interactingAtoms;
    if (interactionCount != NULL)
        delete interactionCount;
+    if (singlePairs != NULL)
+        delete singlePairs;
    if (blockCenter != NULL)
        delete blockCenter;
    if (blockBoundingBox != NULL)
@@ -113,7 +116,7 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
    cuEventDestroy(downloadCountEvent);
 }

-void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
+void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) {
    if (groupCutoff.size() > 0) {
        if (usesCutoff != useCutoff)
            throw OpenMMException("All Forces must agree on whether to use a cutoff");
@@ -128,6 +131,7 @@ void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic,
    usePeriodic = usesPeriodic;
    groupCutoff[forceGroup] = cutoffDistance;
    groupFlags |= 1<<forceGroup;
+    canUsePairList &= supportsPairList;
    if (kernel.size() > 0) {
        if (groupKernelSource.find(forceGroup) == groupKernelSource.end())
            groupKernelSource[forceGroup] = "";
@@ -279,9 +283,11 @@ void CudaNonbondedUtilities::initialize(const System& system) {
            maxTiles = numTiles;
        if (maxTiles < 1)
            maxTiles = 1;
+        maxSinglePairs = 5*numAtoms;
        interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles");
        interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
-        interactionCount = CudaArray::create<unsigned int>(context, 1, "interactionCount");
+        interactionCount = CudaArray::create<unsigned int>(context, 2, "interactionCount");
+        singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
        int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
        blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
        blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
@@ -291,7 +297,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions");
        rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList");
        blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
-        vector<unsigned int> count(1, 0);
+        vector<unsigned int> count(2, 0);
        interactionCount->upload(count);
    }

@@ -316,6 +322,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        forceArgs.push_back(&blockCenter->getDevicePointer());
        forceArgs.push_back(&blockBoundingBox->getDevicePointer());
        forceArgs.push_back(&interactingAtoms->getDevicePointer());
+        forceArgs.push_back(&maxSinglePairs);
+        forceArgs.push_back(&singlePairs->getDevicePointer());
    }
    for (int i = 0; i < (int) parameters.size(); i++)
        forceArgs.push_back(&parameters[i].getMemory());
@@ -353,8 +361,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
        findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
        findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
        findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer());
+        findInteractingBlocksArgs.push_back(&singlePairs->getDevicePointer());
        findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
        findInteractingBlocksArgs.push_back(&maxTiles);
+        findInteractingBlocksArgs.push_back(&maxSinglePairs);
        findInteractingBlocksArgs.push_back(&startBlockIndex);
        findInteractingBlocksArgs.push_back(&numBlocks);
        findInteractingBlocksArgs.push_back(&sortedBlocks->getDevicePointer());
@@ -424,28 +434,39 @@ void CudaNonbondedUtilities::computeInteractions(int forceGroups, bool includeFo
 bool CudaNonbondedUtilities::updateNeighborListSize() {
    if (!useCutoff)
        return false;
-    if (pinnedCountBuffer[0] <= (unsigned int) maxTiles)
+    if (pinnedCountBuffer[0] <= maxTiles && pinnedCountBuffer[1] <= maxSinglePairs)
        return false;

    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.

-    maxTiles = (int) (1.2*pinnedCountBuffer[0]);
-    int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
-    if (maxTiles > totalTiles)
-        maxTiles = totalTiles;
-    delete interactingTiles;
-    delete interactingAtoms;
-    interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails
-    interactingAtoms = NULL;
-    interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles");
-    interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
-    if (forceArgs.size() > 0)
-        forceArgs[7] = &interactingTiles->getDevicePointer();
-    findInteractingBlocksArgs[6] = &interactingTiles->getDevicePointer();
-    if (forceArgs.size() > 0)
-        forceArgs[17] = &interactingAtoms->getDevicePointer();
-    findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
+    if (pinnedCountBuffer[0] > maxTiles) {
+        maxTiles = (int) (1.2*pinnedCountBuffer[0]);
+        int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
+        if (maxTiles > totalTiles)
+            maxTiles = totalTiles;
+        delete interactingTiles;
+        delete interactingAtoms;
+        interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails
+        interactingAtoms = NULL;
+        interactingTiles = CudaArray::create<int>(context, maxTiles, "interactingTiles");
+        interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
+        if (forceArgs.size() > 0)
+            forceArgs[7] = &interactingTiles->getDevicePointer();
+        findInteractingBlocksArgs[6] = &interactingTiles->getDevicePointer();
+        if (forceArgs.size() > 0)
+            forceArgs[17] = &interactingAtoms->getDevicePointer();
+        findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
+    }
+    if (pinnedCountBuffer[1] > maxSinglePairs) {
+        maxSinglePairs = (int) (1.2*pinnedCountBuffer[1]);
+        delete singlePairs;
+        singlePairs = NULL; // Avoid an error in the destructor if the following allocation fails
+        singlePairs = CudaArray::create<int2>(context, maxSinglePairs, "singlePairs");
+        if (forceArgs.size() > 0)
+            forceArgs[19] = &singlePairs->getDevicePointer();
+        findInteractingBlocksArgs[8] = &singlePairs->getDevicePointer();
+    }
    forceRebuildNeighborList = true;
    context.setForcesValid(false);
    return true;
@@ -493,6 +514,7 @@ void CudaNonbondedUtilities::createKernelsForGroups(int groups) {
        if (usePeriodic)
            defines["USE_PERIODIC"] = "1";
        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
+        defines["MAX_BITS_FOR_PAIRS"] = (canUsePairList ? "2" : "0");
        CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
        kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
        kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");

--- a/platforms/cuda/src/CudaParallelKernels.cpp
+++ b/platforms/cuda/src/CudaParallelKernels.cpp
@@ -63,8 +63,8 @@ if (result != CUDA_SUCCESS) { \
 class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask {
 public:
    BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
-            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event, int& numTiles) : context(context), cu(cu), kernel(kernel),
-            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), numTiles(numTiles) {
+            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event, int2& interactionCount) : context(context), cu(cu), kernel(kernel),
+            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), interactionCount(interactionCount) {
    }
    void execute() {
        // Copy coordinates over to this device and execute the kernel.
@@ -77,7 +77,7 @@ public:
        }
        kernel.beginComputation(context, includeForce, includeEnergy, groups);
        if (cu.getNonbondedUtilities().getUsePeriodic())
-            cu.getNonbondedUtilities().getInteractionCount().download(&numTiles, false);
+            cu.getNonbondedUtilities().getInteractionCount().download(&interactionCount, false);
    }
 private:
    ContextImpl& context;
@@ -87,15 +87,15 @@ private:
    int groups;
    void* pinnedMemory;
    CUevent event;
-    int& numTiles;
+    int2& interactionCount;
 };

 class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
 public:
    FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
-            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int& numTiles) :
+            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int2& interactionCount) :
            context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
-            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), numTiles(numTiles) {
+            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount) {
    }
    void execute() {
        // Execute the kernel, then download forces.
@@ -120,7 +120,8 @@ public:
                    cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
            }
        }
-        if (cu.getNonbondedUtilities().getUsePeriodic() && numTiles > cu.getNonbondedUtilities().getInteractingTiles().getSize()) {
+        if (cu.getNonbondedUtilities().getUsePeriodic() && (interactionCount.x > cu.getNonbondedUtilities().getInteractingTiles().getSize() ||
+                interactionCount.y > cu.getNonbondedUtilities().getSinglePairs().getSize())) {
            valid = false;
            cu.getNonbondedUtilities().updateNeighborListSize();
        }
@@ -136,12 +137,12 @@ private:
    long long* pinnedMemory;
    CudaArray& contextForces;
    bool& valid;
-    int& numTiles;
+    int2& interactionCount;
 };

 CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
-        tileCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
+        interactionCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
 }
@@ -156,8 +157,8 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
        cuMemFreeHost(pinnedForceBuffer);
    cuEventDestroy(event);
    cuStreamDestroy(peerCopyStream);
-    if (tileCounts != NULL)
-        cuMemFreeHost(tileCounts);
+    if (interactionCounts != NULL)
+        cuMemFreeHost(interactionCounts);
 }

 void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
@@ -172,7 +173,7 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
        contextNonbondedFractions[i] = 1/(double) numContexts;
    CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
    CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
-    CHECK_RESULT(cuMemHostAlloc((void**) &tileCounts, numContexts*sizeof(int), 0), "Error creating tile count buffer");
+    CHECK_RESULT(cuMemHostAlloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
 }

 void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
@@ -202,7 +203,7 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
        data.contextEnergy[i] = 0.0;
        CudaContext& cu = *data.contexts[i];
        CudaContext::WorkThread& thread = cu.getWorkThread();
-        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, tileCounts[i]));
+        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, interactionCounts[i]));
    }
 }

@@ -210,7 +211,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
        CudaContext::WorkThread& thread = cu.getWorkThread();
-        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid, tileCounts[i]));
+        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid, interactionCounts[i]));
    }
    data.syncContexts();
    double energy = 0.0;

--- a/platforms/cuda/src/kernels/findInteractingBlocks.cu
+++ b/platforms/cuda/src/kernels/findInteractingBlocks.cu
@@ -71,9 +71,60 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
    if (rebuild) {
        rebuildNeighborList[0] = 1;
        interactionCount[0] = 0;
+        interactionCount[1] = 0;
    }
 }

+__device__ int saveSinglePairs(int x, int* atoms, int* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile int& pairStartIndex) {
+    // Record interactions that should be computed as single pairs rather than in blocks.
+    
+    const int indexInWarp = threadIdx.x%32;
+    int sum = 0;
+    for (int i = indexInWarp; i < length; i += 32) {
+        int count = __popc(flags[i]);
+        sum += (count <= MAX_BITS_FOR_PAIRS ? count : 0);
+    }
+    sumBuffer[indexInWarp] = sum;
+    for (int step = 1; step < 32; step *= 2) {
+        int add = (indexInWarp >= step ? sumBuffer[indexInWarp-step] : 0);
+        sumBuffer[indexInWarp] += add;
+    }
+    int pairsToStore = sumBuffer[31];
+    if (indexInWarp == 0)
+        pairStartIndex = atomicAdd(singlePairCount, pairsToStore);
+    int pairIndex = pairStartIndex + (indexInWarp > 0 ? sumBuffer[indexInWarp-1] : 0);
+    for (int i = indexInWarp; i < length; i += 32) {
+        int count = __popc(flags[i]);
+        if (count <= MAX_BITS_FOR_PAIRS && pairIndex+count < maxSinglePairs) {
+            int f = flags[i];
+            while (f != 0) {
+                singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__ffs(f)-1);
+                f &= f-1;
+                pairIndex++;
+            }
+        }
+    }
+    
+    // Compact the remaining interactions.
+    
+    const int warpMask = (1<<indexInWarp)-1;
+    int numCompacted = 0;
+    for (int start = 0; start < length; start += 32) {
+        int i = start+indexInWarp;
+        int atom = atoms[i];
+        int flag = flags[i];
+        bool include = (i < length && __popc(flags[i]) > MAX_BITS_FOR_PAIRS);
+        int includeFlags = __ballot(include);
+        if (include) {
+            int index = numCompacted+__popc(includeFlags&warpMask);
+            atoms[index] = atom;
+            flags[index] = flag;
+        }
+        numCompacted += __popc(includeFlags);
+    }
+    return numCompacted;
+}
+
 /**
 * Compare the bounding boxes for each pair of atom blocks (comprised of 32 atoms each), forming a tile. If the two
 * atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
@@ -124,8 +175,9 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
 *
 */
 extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
-        unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, const real4* __restrict__ posq,
-        unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
+        unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
+        int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
+        unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
        const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
        real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {

@@ -138,12 +190,17 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
    const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/32;
    const int warpMask = (1<<indexInWarp)-1;
    __shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
+    __shared__ int workgroupFlagsBuffer[BUFFER_SIZE*(GROUP_SIZE/32)];
    __shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/32)];
    __shared__ real3 posBuffer[GROUP_SIZE];
    __shared__ volatile int workgroupTileIndex[GROUP_SIZE/32];
+    __shared__ int sumBuffer[GROUP_SIZE];
+    __shared__ int worksgroupPairStartIndex[GROUP_SIZE/32];
    int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/32);
+    int* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/32);
    int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/32);
    volatile int& tileStartIndex = workgroupTileIndex[warpStart/32];
+    volatile int& pairStartIndex = worksgroupPairStartIndex[warpStart/32];

    // Loop over blocks.
    
@@ -227,7 +284,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
                APPLY_PERIODIC_TO_DELTA(atomDelta)
 #endif
                int atomFlags = ballot(atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
-                bool interacts = false;
+                int interacts = 0;
                if (atom2 < NUM_ATOMS && atomFlags != 0) {
                    int first = __ffs(atomFlags)-1;
                    int last = 32-__clz(atomFlags);
@@ -236,14 +293,14 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
                        for (int j = first; j < last; j++) {
                            real3 delta = pos2-posBuffer[warpStart+j];
                            APPLY_PERIODIC_TO_DELTA(delta)
-                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
+                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
                        }
                    }
                    else {
 #endif
                        for (int j = first; j < last; j++) {
                            real3 delta = pos2-posBuffer[warpStart+j];
-                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
+                            interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? 1<<j : 0);
                        }
 #ifdef USE_PERIODIC
                    }
@@ -253,34 +310,46 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
                // Add any interacting atoms to the buffer.
                
                int includeAtomFlags = __ballot(interacts);
-                if (interacts)
-                    buffer[neighborsInBuffer+__popc(includeAtomFlags&warpMask)] = atom2;
+                if (interacts) {
+                    int index = neighborsInBuffer+__popc(includeAtomFlags&warpMask);
+                    buffer[index] = atom2;
+                    flagsBuffer[index] = interacts;
+                }
                neighborsInBuffer += __popc(includeAtomFlags);
                if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
                    // Store the new tiles to memory.
                    
+#if MAX_BITS_FOR_PAIRS > 0
+                    neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
+#endif
                    int tilesToStore = neighborsInBuffer/TILE_SIZE;
-                    if (indexInWarp == 0)
-                        tileStartIndex = atomicAdd(interactionCount, tilesToStore);
-                    int newTileStartIndex = tileStartIndex;
-                    if (newTileStartIndex+tilesToStore <= maxTiles) {
-                        if (indexInWarp < tilesToStore)
-                            interactingTiles[newTileStartIndex+indexInWarp] = x;
-                        for (int j = 0; j < tilesToStore; j++)
-                            interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
+                    if (tilesToStore > 0) {
+                        if (indexInWarp == 0)
+                            tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
+                        int newTileStartIndex = tileStartIndex;
+                        if (newTileStartIndex+tilesToStore <= maxTiles) {
+                            if (indexInWarp < tilesToStore)
+                                interactingTiles[newTileStartIndex+indexInWarp] = x;
+                            for (int j = 0; j < tilesToStore; j++)
+                                interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
+                        }
+                        buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
+                        neighborsInBuffer -= TILE_SIZE*tilesToStore;
                    }
-                    buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
-                    neighborsInBuffer -= TILE_SIZE*tilesToStore;
                }
            }
        }
        
        // If we have a partially filled buffer,  store it to memory.
        
+#if MAX_BITS_FOR_PAIRS > 0
+        if (neighborsInBuffer > 32)
+            neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
+#endif
        if (neighborsInBuffer > 0) {
            int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
            if (indexInWarp == 0)
-                tileStartIndex = atomicAdd(interactionCount, tilesToStore);
+                tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
            int newTileStartIndex = tileStartIndex;
            if (newTileStartIndex+tilesToStore <= maxTiles) {
                if (indexInWarp < tilesToStore)
@@ -295,4 +364,4 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
    
    for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x)
        oldPositions[i] = posq[i];
-}
\ No newline at end of file
+}
--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -103,9 +103,10 @@ extern "C" __global__ void computeNonbonded(
        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
 #ifdef USE_CUTOFF
-        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize, 
+        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
-        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
+        const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms, unsigned int maxSinglePairs,
+        const int2* __restrict__ singlePairs
 #endif
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
@@ -278,10 +279,10 @@ extern "C" __global__ void computeNonbonded(
                localData[tbx+tj].fz += dEdR2.z;
 #endif 
 #endif // end USE_SYMMETRIC
+#endif
 #ifdef ENABLE_SHUFFLE
                SHUFFLE_WARP_DATA
 #endif
-#endif
 #ifdef USE_EXCLUSIONS
                excl >>= 1;
 #endif
@@ -484,9 +485,9 @@ extern "C" __global__ void computeNonbonded(
                    localData[tbx+tj].fz += dEdR2.z;
 #endif 
 #endif // end USE_SYMMETRIC
+#endif
 #ifdef ENABLE_SHUFFLE
                    SHUFFLE_WARP_DATA
-#endif
 #endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
@@ -555,9 +556,9 @@ extern "C" __global__ void computeNonbonded(
                    localData[tbx+tj].fz += dEdR2.z;
 #endif 
 #endif // end USE_SYMMETRIC
+#endif
 #ifdef ENABLE_SHUFFLE
                    SHUFFLE_WARP_DATA
-#endif
 #endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                }
@@ -588,6 +589,59 @@ extern "C" __global__ void computeNonbonded(
        }
        pos++;
    }
+    
+    // Third loop: single pairs that aren't part of a tile.
+    
+#if USE_CUTOFF
+    const unsigned int numPairs = interactionCount[1];
+    if (numPairs > maxSinglePairs)
+        return; // There wasn't enough memory for the neighbor list.
+    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < numPairs; i += blockDim.x*gridDim.x) {
+        int2 pair = singlePairs[i];
+        int atom1 = pair.x;
+        int atom2 = pair.y;
+        real4 posq1 = posq[atom1];
+        real4 posq2 = posq[atom2];
+        LOAD_ATOM1_PARAMETERS
+        int j = atom2;
+atom2 = threadIdx.x;
+        DECLARE_LOCAL_PARAMETERS
+        LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+        LOAD_ATOM2_PARAMETERS
+atom2 = pair.y;
+        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+        APPLY_PERIODIC_TO_DELTA(delta)
+#endif
+        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+        real invR = RSQRT(r2);
+        real r = r2*invR;
+#ifdef USE_SYMMETRIC
+        real dEdR = 0.0f;
+#else
+        real3 dEdR1 = make_real3(0);
+        real3 dEdR2 = make_real3(0);
+#endif
+        bool hasExclusions = false;
+        bool isExcluded = false;
+        real tempEnergy = 0.0f;
+        const real interactionScale = 1.0f;
+        COMPUTE_INTERACTION
+        energy += tempEnergy;
+#ifdef INCLUDE_FORCES
+#ifdef USE_SYMMETRIC
+        real3 dEdR1 = delta*dEdR;
+        real3 dEdR2 = -dEdR1;
+#endif
+        atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (-dEdR1.x*0x100000000)));
+        atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.y*0x100000000)));
+        atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.z*0x100000000)));
+        atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (-dEdR2.x*0x100000000)));
+        atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.y*0x100000000)));
+        atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.z*0x100000000)));
+#endif
+    }
+#endif
 #ifdef INCLUDE_ENERGY
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
 #endif

--- a/platforms/reference/src/SimTKReference/ReferenceVariableStochasticDynamics.cpp
+++ b/platforms/reference/src/SimTKReference/ReferenceVariableStochasticDynamics.cpp
@@ -266,11 +266,11 @@ void ReferenceVariableStochasticDynamics::update(const OpenMM::System& system, v

   // copy xPrime -> atomCoordinates

+   RealOpenMM invStepSize = 1.0/getDeltaT();
   for (int ii = 0; ii < numberOfAtoms; ii++) {
       if (masses[ii] != 0.0) {
-           atomCoordinates[ii][0] = xPrime[ii][0];
-           atomCoordinates[ii][1] = xPrime[ii][1];
-           atomCoordinates[ii][2] = xPrime[ii][2];
+           velocities[ii] = (xPrime[ii]-atomCoordinates[ii])*invStepSize;
+           atomCoordinates[ii] = xPrime[ii];
       }
   }


--- a/wrappers/python/src/swig_doxygen/swig_lib/python/extend.i
+++ b/wrappers/python/src/swig_doxygen/swig_lib/python/extend.i
@@ -28,6 +28,7 @@
                            int getForces,
                            int getEnergy,
                            int getParameters,
+                            int getParameterDerivatives,
                            int enforcePeriodic,
                            bitmask32t groups) {
    State state;
@@ -38,6 +39,7 @@
    if (getForces) types |= State::Forces;
    if (getEnergy) types |= State::Energy;
    if (getParameters) types |= State::Parameters;
+    if (getParameterDerivatives) types |= State::ParameterDerivatives;
    try {
        state = self->getState(types, enforcePeriodic, groups);
    }
@@ -53,7 +55,7 @@
  %pythoncode %{
    def getState(self, getPositions=False, getVelocities=False,
                 getForces=False, getEnergy=False, getParameters=False,
-                 enforcePeriodicBox=False, groups=-1):
+                 getParameterDerivatives=False, enforcePeriodicBox=False, groups=-1):
        """Get a State object recording the current state information stored in this context.

        Parameters
@@ -66,8 +68,10 @@
            whether to store the forces acting on particles in the State
        getEnergy : bool=False
            whether to store potential and kinetic energy in the State
-        getParameter : bool=False
+        getParameters : bool=False
            whether to store context parameters in the State
+        getParameterDerivatives : bool=False
+            whether to store parameter derivatives in the State
        enforcePeriodicBox : bool=False
            if false, the position of each particle will be whatever position
            is stored in the Context, regardless of periodic boundary conditions.
@@ -79,9 +83,9 @@
            can also be passed as an unsigned integer interpreted as a bitmask,
            in which case group i will be included if (groups&(1<<i)) != 0.
        """
-        getP, getV, getF, getE, getPa, enforcePeriodic = map(bool,
+        getP, getV, getF, getE, getPa, getPd, enforcePeriodic = map(bool,
            (getPositions, getVelocities, getForces, getEnergy, getParameters,
-             enforcePeriodicBox))
+             getParameterDerivatives, enforcePeriodicBox))

        try:
            # is the input integer-like?
@@ -95,8 +99,8 @@
                raise TypeError('%s is neither an int nor set' % groups)

        (simTime, periodicBoxVectorsList, energy, coordList, velList,
-         forceList, paramMap) = \
-            self._getStateAsLists(getP, getV, getF, getE, getPa, enforcePeriodic, groups_mask)
+         forceList, paramMap, paramDerivMap) = \
+            self._getStateAsLists(getP, getV, getF, getE, getPa, getPd, enforcePeriodic, groups_mask)

        state = State(simTime=simTime,
                      energy=energy,
@@ -104,7 +108,8 @@
                      velList=velList,
                      forceList=forceList,
                      periodicBoxVectorsList=periodicBoxVectorsList,
-                      paramMap=paramMap)
+                      paramMap=paramMap,
+                      paramDerivMap=paramDerivMap)
        return state

    def setState(self, state):
@@ -176,6 +181,7 @@ Parameters:
                            int getForces,
                            int getEnergy,
                            int getParameters,
+                            int getParameterDerivatives, 
                            int enforcePeriodic,
                            int groups) {
    State state;
@@ -186,6 +192,7 @@ Parameters:
    if (getForces) types |= State::Forces;
    if (getEnergy) types |= State::Energy;
    if (getParameters) types |= State::Parameters;
+    if (getParameterDerivatives) types |= State::ParameterDerivatives;
    try {
        state = self->getState(copy, types, enforcePeriodic, groups);
    }
@@ -206,6 +213,7 @@ Parameters:
                 getForces=False,
                 getEnergy=False,
                 getParameters=False,
+                 getParameterDerivatives=False,
                 enforcePeriodicBox=False,
                 groups=-1):
        """Get a State object recording the current state information about one copy of the system.
@@ -222,8 +230,10 @@ Parameters:
            whether to store the forces acting on particles in the State
        getEnergy : bool=False
            whether to store potential and kinetic energy in the State
-        getParameter : bool=False
+        getParameters : bool=False
            whether to store context parameters in the State
+        getParameterDerivatives : bool=False
+            whether to store parameter derivatives in the State
        enforcePeriodicBox : bool=False
            if false, the position of each particle will be whatever position
            is stored in the Context, regardless of periodic boundary conditions.
@@ -235,9 +245,9 @@ Parameters:
            can also be passed as an unsigned integer interpreted as a bitmask,
            in which case group i will be included if (groups&(1<<i)) != 0.
        """
-        getP, getV, getF, getE, getPa, enforcePeriodic = map(bool,
+        getP, getV, getF, getE, getPa, getPd, enforcePeriodic = map(bool,
            (getPositions, getVelocities, getForces, getEnergy, getParameters,
-             enforcePeriodicBox))
+             getParameterDerivatives, enforcePeriodicBox))

        try:
            # is the input integer-like?
@@ -250,8 +260,8 @@ Parameters:
                raise TypeError('%s is neither an int nor set' % groups)

        (simTime, periodicBoxVectorsList, energy, coordList, velList,
-          forceList, paramMap) = \
-             self._getStateAsLists(copy, getP, getV, getF, getE, getPa, enforcePeriodic, groups_mask)
+          forceList, paramMap, paramDerivMap) = \
+            self._getStateAsLists(getP, getV, getF, getE, getPa, getPd, enforcePeriodic, groups_mask)

        state = State(simTime=simTime,
                      energy=energy,
@@ -259,7 +269,8 @@ Parameters:
                      velList=velList,
                      forceList=forceList,
                      periodicBoxVectorsList=periodicBoxVectorsList,
-                      paramMap=paramMap)
+                      paramMap=paramMap,
+                      paramDerivMap=paramDerivMap)
        return state
  %}
 }
@@ -361,8 +372,9 @@ Parameters:
                                double time,
                                const std::vector<Vec3>& boxVectors,
                                const std::map<string, double>& params,
+                                const std::map<string, double>& paramDerivs,
                                int types) {
-    OpenMM::State myState =  _convertListsToState(pos,vel,forces,kineticEnergy,potentialEnergy,time,boxVectors,params,types);
+    OpenMM::State myState =  _convertListsToState(pos,vel,forces,kineticEnergy,potentialEnergy,time,boxVectors,params,paramDerivs,types);
    std::stringstream buffer;
    OpenMM::XmlSerializer::serialize<OpenMM::State>(&myState, "State", buffer);
    return buffer.str();
@@ -386,6 +398,7 @@ Parameters:
      kineticEnergy = 0.0
      potentialEnergy = 0.0
      params = {}
+      paramDerivs = {}
      types = 0
      try:
        positions = pythonState.getPositions().value_in_unit(unit.nanometers)
@@ -413,16 +426,21 @@ Parameters:
        types |= 16
      except:
        pass
+      try:
+        params = pythonState.getEnergyParameterDerivatives()
+        types |= 32
+      except:
+        pass
      time = pythonState.getTime().value_in_unit(unit.picoseconds)
      boxVectors = pythonState.getPeriodicBoxVectors().value_in_unit(unit.nanometers)
-      string = XmlSerializer._serializeStateAsLists(positions, velocities, forces, kineticEnergy, potentialEnergy, time, boxVectors, params, types)
+      string = XmlSerializer._serializeStateAsLists(positions, velocities, forces, kineticEnergy, potentialEnergy, time, boxVectors, params, paramDerivs, types)
      return string

    @staticmethod
    def _deserializeState(pythonString):

      (simTime, periodicBoxVectorsList, energy, coordList, velList,
-       forceList, paramMap) = XmlSerializer._deserializeStringIntoLists(pythonString)
+       forceList, paramMap, paramDerivMap) = XmlSerializer._deserializeStringIntoLists(pythonString)

      state = State(simTime=simTime,
                    energy=energy,
@@ -430,7 +448,8 @@ Parameters:
                    velList=velList,
                    forceList=forceList,
                    periodicBoxVectorsList=periodicBoxVectorsList,
-                    paramMap=paramMap)
+                    paramMap=paramMap,
+                    paramDerivMap=paramDerivMap)
      return state

    @staticmethod

--- a/wrappers/python/src/swig_doxygen/swig_lib/python/header.i
+++ b/wrappers/python/src/swig_doxygen/swig_lib/python/header.i
@@ -28,6 +28,7 @@ State _convertListsToState( const std::vector<Vec3> &pos,
                            double time,
                            const std::vector<Vec3> &boxVectors,
                            const std::map<std::string, double> &params,
+                            const std::map<std::string, double> &paramDerivs,
                            int types ) {  
    State::StateBuilder sb(time); 
    if(types & State::Positions)
@@ -40,6 +41,8 @@ State _convertListsToState( const std::vector<Vec3> &pos,
      sb.setEnergy(kineticEnergy, potentialEnergy);
    if(types & State::Parameters)
      sb.setParameters(params);
+    if(types & State::ParameterDerivatives)
+      sb.setEnergyParameterDerivatives(paramDerivs);
    sb.setPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
    return sb.getState();
 }
@@ -53,6 +56,7 @@ PyObject *_convertStateToLists(const State& state) {
    PyObject *pForces;
    PyObject *pyTuple;
    PyObject *pParameters;
+    PyObject *pParameterDerivs;
    simTime=state.getTime();

    OpenMM::Vec3 myVecA;
@@ -112,11 +116,21 @@ PyObject *_convertStateToLists(const State& state) {
      pParameters = Py_None;
      Py_INCREF(Py_None);
    }
+    try {
+      pParameterDerivs = PyDict_New();
+      const std::map<std::string, double>& params = state.getEnergyParameterDerivatives();
+      for (std::map<std::string, double>::const_iterator iter = params.begin(); iter != params.end(); ++iter)
+          PyDict_SetItemString(pParameterDerivs, iter->first.c_str(), Py_BuildValue("d", iter->second));
+    }
+    catch (std::exception& ex) {
+      pParameterDerivs = Py_None;
+      Py_INCREF(Py_None);
+    }
  
-    pyTuple=Py_BuildValue("(d,N,N,N,N,N,N)",
+    pyTuple=Py_BuildValue("(d,N,N,N,N,N,N,N)",
                          simTime, pPeriodicBoxVectorsList, pEnergy,
                          pPositions, pVelocities,
-                          pForces, pParameters);
+                          pForces, pParameters, pParameterDerivs);
  
    return pyTuple;
 }

--- a/wrappers/python/src/swig_doxygen/swig_lib/python/pythoncode.i
+++ b/wrappers/python/src/swig_doxygen/swig_lib/python/pythoncode.i
@@ -63,7 +63,8 @@ class State(_object):
                 velList=None,
                 forceList=None,
                 periodicBoxVectorsList=None,
-                 paramMap=None):
+                 paramMap=None,
+                 paramDerivMap=None):
        self._simTime=simTime
        self._periodicBoxVectorsList=periodicBoxVectorsList
        self._periodicBoxVectorsListNumpy=None
@@ -80,6 +81,7 @@ class State(_object):
        self._forceList=forceList
        self._forceListNumpy=None
        self._paramMap=paramMap
+        self._paramDerivMap=paramDerivMap

    def __getstate__(self):
        serializationString = XmlSerializer.serialize(self)
@@ -221,6 +223,18 @@ class State(_object):
            raise TypeError('Parameters were not requested in getState() call, so are not available.')
        return self._paramMap

+    def getEnergyParameterDerivatives(self):
+        """Get a map containing derivatives of the potential energy with respect to context parameters.
+
+        In most cases derivatives are only calculated if the corresponding Force objects have been
+        specifically told to compute them.  Otherwise, the values in the map will be zero.  Likewise,
+        if multiple Forces depend on the same parameter but only some have been told to compute
+        derivatives with respect to it, the returned value will include only the contributions from
+        the Forces that were told to compute it."""
+        if self._paramDerivMap is None:
+            raise TypeError('Parameter derivatives were not requested in getState() call, so are not available.')
+        return self._paramDerivMap
+
 %}

 %pythonappend OpenMM::Context::Context %{