Continuing to implement new CUDA platform

3e16cab9 · Peter Eastman · abb8cb4b · 3e16cab9 · 3e16cab9 · 3e16cab9
Commit 3e16cab9 authored Jun 05, 2012 by Peter Eastman
9 changed files
--- a/platforms/cuda2/src/CudaArray.h
+++ b/platforms/cuda2/src/CudaArray.h
@@ -83,7 +83,7 @@ public:
    /**
     * Get a pointer to the device memory.
     */
-    CUdeviceptr getDevicePointer() {
+    CUdeviceptr& getDevicePointer() {
        return pointer;
    }
    /**

--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -72,11 +72,11 @@ public:
    CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
            const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData);
    ~CudaContext();
-//    /**
-//     * This is called to initialize internal data structures after all Forces in the system
-//     * have been initialized.
-//     */
-//    void initialize();
+    /**
+     * This is called to initialize internal data structures after all Forces in the system
+     * have been initialized.
+     */
+    void initialize();
    /**
     * Add a CudaForce to this context.
     */
@@ -123,12 +123,12 @@ public:
    CudaArray& getVelm() {
        return *velm;
    }
-//    /**
-//     * Get the array which contains the force on each atom.
-//     */
-//    CudaArray<mm_float4>& getForce() {
-//        return *force;
-//    }
+    /**
+     * Get the array which contains the force on each atom (respresented as a long3 in 64 bit fixed point).
+     */
+    CudaArray& getForce() {
+        return *force;
+    }
 //    /**
 //     * Get the array which contains the buffers in which forces are computed.
 //     */
@@ -184,36 +184,41 @@ public:
     *                           omitted, a default set of options will be used
     */
    CUmodule createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL);
-//    /**
-//     * Execute a kernel.
-//     *
-//     * @param kernel       the kernel to execute
-//     * @param workUnits    the maximum number of work units that should be used
-//     * @param blockSize    the size of each thread block to use
-//     */
-//    void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
-//    /**
-//     * Set all elements of an array to 0.
-//     */
-//    void clearBuffer(CudaArray<float>& array);
-//    /**
-//     * Set all elements of an array to 0.
-//     */
-//    void clearBuffer(CudaArray<mm_float4>& array);
-//    /**
-//     * Set all elements of an array to 0.
-//     *
-//     * @param memory     the Memory to clear
-//     * @param size       the number of float elements in the buffer
-//     */
-//    void clearBuffer(cl::Memory& memory, int size);
-//    /**
-//     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
-//     *
-//     * @param memory     the Memory to clear
-//     * @param size       the number of float elements in the buffer
-//     */
-//    void addAutoclearBuffer(cl::Memory& memory, int size);
+    /**
+     * Get a kernel from a CUDA module.
+     *
+     * @param module    the module to get the kernel from
+     * @param name      the name of the kernel to get
+     */
+    CUfunction getKernel(CUmodule& module, const std::string& name);
+    /**
+     * Execute a kernel.
+     *
+     * @param kernel       the kernel to execute
+     * @param arguments    an array of pointers to the kernel arguments
+     * @param threads      the maximum number of threads that should be used
+     * @param blockSize    the size of each thread block to use
+     * @param sharedSize   the amount of dynamic shared memory to allocated for the kernel, in bytes
+     */
+    void executeKernel(CUfunction kernel, void** arguments, int workUnits, int blockSize = -1, unsigned int sharedSize = 0);
+    /**
+     * Set all elements of an array to 0.
+     */
+    void clearBuffer(CudaArray& array);
+    /**
+     * Set all elements of an array to 0.
+     *
+     * @param memory     the memory to clear
+     * @param size       the number of 4-byte elements in the buffer
+     */
+    void clearBuffer(CUdeviceptr memory, int size);
+    /**
+     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
+     *
+     * @param memory     the memory to clear
+     * @param size       the number of float/double elements in the buffer
+     */
+    void addAutoclearBuffer(CUdeviceptr memory, int size);
 //    /**
 //     * Clear all buffers that have been registered with addAutoclearBuffer().
 //     */
@@ -230,108 +235,110 @@ public:
 //     * Sum the buffesr containing forces.
 //     */
 //    void reduceForces();
-//    /**
-//     * Get the current simulation time.
-//     */
-//    double getTime() {
-//        return time;
-//    }
-//    /**
-//     * Set the current simulation time.
-//     */
-//    void setTime(double t) {
-//        time = t;
-//    }
-//    /**
-//     * Get the number of integration steps that have been taken.
-//     */
-//    int getStepCount() {
-//        return stepCount;
-//    }
-//    /**
-//     * Set the number of integration steps that have been taken.
-//     */
-//    void setStepCount(int steps) {
-//        stepCount = steps;
-//    }
-//    /**
-//     * Get the number of times forces or energy has been computed.
-//     */
-//    int getComputeForceCount() {
-//        return computeForceCount;
-//    }
-//    /**
-//     * Set the number of times forces or energy has been computed.
-//     */
-//    void setComputeForceCount(int count) {
-//        computeForceCount = count;
-//    }
-//    /**
-//     * Get the number of atoms.
-//     */
-//    int getNumAtoms() const {
-//        return numAtoms;
-//    }
-//    /**
-//     * Get the number of atoms, rounded up to a multiple of TileSize.  This is the actual size of
-//     * most arrays with one element per atom.
-//     */
-//    int getPaddedNumAtoms() const {
-//        return paddedNumAtoms;
-//    }
-//    /**
-//     * Get the number of blocks of TileSize atoms.
-//     */
-//    int getNumAtomBlocks() const {
-//        return numAtomBlocks;
-//    }
-//    /**
-//     * Get the standard number of thread blocks to use when executing kernels.
-//     */
-//    int getNumThreadBlocks() const {
-//        return numThreadBlocks;
-//    }
-//    /**
-//     * Get the number of force buffers.
-//     */
-//    int getNumForceBuffers() const {
-//        return numForceBuffers;
-//    }
-//    /**
-//     * Get the SIMD width of the device being used.
-//     */
-//    int getSIMDWidth() const {
-//        return simdWidth;
-//    }
-//    /**
-//     * Get whether the device being used supports 64 bit atomic operations on global memory.
-//     */
-//    bool getSupports64BitGlobalAtomics() {
-//        return supports64BitGlobalAtomics;
-//    }
-//    /**
-//     * Get whether the device being used supports double precision math.
-//     */
-//    bool getSupportsDoublePrecision() {
-//        return supportsDoublePrecision;
-//    }
+    /**
+     * Get the current simulation time.
+     */
+    double getTime() {
+        return time;
+    }
+    /**
+     * Set the current simulation time.
+     */
+    void setTime(double t) {
+        time = t;
+    }
+    /**
+     * Get the number of integration steps that have been taken.
+     */
+    int getStepCount() {
+        return stepCount;
+    }
+    /**
+     * Set the number of integration steps that have been taken.
+     */
+    void setStepCount(int steps) {
+        stepCount = steps;
+    }
+    /**
+     * Get the number of times forces or energy has been computed.
+     */
+    int getComputeForceCount() {
+        return computeForceCount;
+    }
+    /**
+     * Set the number of times forces or energy has been computed.
+     */
+    void setComputeForceCount(int count) {
+        computeForceCount = count;
+    }
+    /**
+     * Get the number of atoms.
+     */
+    int getNumAtoms() const {
+        return numAtoms;
+    }
+    /**
+     * Get the number of atoms, rounded up to a multiple of TileSize.  This is the actual size of
+     * most arrays with one element per atom.
+     */
+    int getPaddedNumAtoms() const {
+        return paddedNumAtoms;
+    }
+    /**
+     * Get the number of blocks of TileSize atoms.
+     */
+    int getNumAtomBlocks() const {
+        return numAtomBlocks;
+    }
+    /**
+     * Get the standard number of thread blocks to use when executing kernels.
+     */
+    int getNumThreadBlocks() const {
+        return numThreadBlocks;
+    }
+    /**
+     * Get whether double precision is being used.
+     */
+    bool getUseDoublePrecision() {
+        return useDoublePrecision;
+    }
+    /**
+     * Get whether accumulation is being done in double precision.
+     */
+    bool getAccumulateInDouble() {
+        return accumulateInDouble;
+    }
+    /**
+     * Convert a number to a string in a format suitable for including in a kernel.
+     * This takes into account whether the context uses single or double precision.
+     */
+    std::string doubleToString(double value);
+    /**
+     * Convert a number to a string in a format suitable for including in a kernel.
+     */
+    std::string intToString(int value);
+    /**
+     * Convert a CUDA result code to the corresponding string description.
+     */
+    std::string getErrorString(CUresult result);
+    
 //    /**
 //     * Get the size of the periodic box.
 //     */
-//    mm_float4 getPeriodicBoxSize() const {
+//    float4 getPeriodicBoxSize() const {
 //        return periodicBoxSize;
 //    }
 //    /**
 //     * Set the size of the periodic box.
 //     */
 //    void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
-//        periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
-//        invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
+//        periodicBoxSize = make_float4((float) xsize, (float) ysize, (float) zsize, 0);
+//        invPeriodicBoxSize = make_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
 //    }
 //    /**
 //     * Get the inverse of the size of the periodic box.
 //     */
-//    mm_float4 getInvPeriodicBoxSize() const {
+//    float4 getInvPeriodicBoxSize() const {
 //        return invPeriodicBoxSize;
 //    }
 //    /**
@@ -352,66 +359,66 @@ public:
 //    CudaNonbondedUtilities& getNonbondedUtilities() {
 //        return *nonbonded;
 //    }
-//    /**
-//     * Get the thread used by this context for executing parallel computations.
-//     */
-//    WorkThread& getWorkThread() {
-//        return *thread;
-//    }
-//    /**
-//     * Get whether atoms were reordered during the most recent force/energy computation.
-//     */
-//    bool getAtomsWereReordered() const {
-//        return atomsWereReordered;
-//    }
-//    /**
-//     * Set whether atoms were reordered during the most recent force/energy computation.
-//     */
-//    void setAtomsWereReordered(bool wereReordered) {
-//        atomsWereReordered = wereReordered;
-//    }
-//    /**
-//     * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
-//     * together in the arrays.
-//     * 
-//     * @param enforcePeriodic    if true, the atom positions may be altered to enforce periodic boundary conditions
-//     */
-//    void reorderAtoms(bool enforcePeriodic);
-//    /**
-//     * Add a listener that should be called whenever atoms get reordered.  The CudaContext
-//     * assumes ownership of the object, and deletes it when the context itself is deleted.
-//     */
-//    void addReorderListener(ReorderListener* listener);
-//    /**
-//     * Get the list of ReorderListeners.
-//     */
-//    std::vector<ReorderListener*>& getReorderListeners() {
-//        return reorderListeners;
-//    }
-//    /**
-//     * Mark that the current molecule definitions (and hence the atom order) may be invalid.
-//     * This should be called whenever force field parameters change.  It will cause the definitions
-//     * and order to be revalidated the next to reorderAtoms() is called.
-//     */
-//    void invalidateMolecules();
-//    /**
-//     * Get whether the current molecule definitions are valid.
-//     */
-//    bool getMoleculesAreInvalid() {
-//        return moleculesInvalid;
-//    }
+    /**
+     * Get the thread used by this context for executing parallel computations.
+     */
+    WorkThread& getWorkThread() {
+        return *thread;
+    }
+    /**
+     * Get whether atoms were reordered during the most recent force/energy computation.
+     */
+    bool getAtomsWereReordered() const {
+        return atomsWereReordered;
+    }
+    /**
+     * Set whether atoms were reordered during the most recent force/energy computation.
+     */
+    void setAtomsWereReordered(bool wereReordered) {
+        atomsWereReordered = wereReordered;
+    }
+    /**
+     * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
+     * together in the arrays.
+     * 
+     * @param enforcePeriodic    if true, the atom positions may be altered to enforce periodic boundary conditions
+     */
+    void reorderAtoms(bool enforcePeriodic);
+    /**
+     * Add a listener that should be called whenever atoms get reordered.  The CudaContext
+     * assumes ownership of the object, and deletes it when the context itself is deleted.
+     */
+    void addReorderListener(ReorderListener* listener);
+    /**
+     * Get the list of ReorderListeners.
+     */
+    std::vector<ReorderListener*>& getReorderListeners() {
+        return reorderListeners;
+    }
+    /**
+     * Mark that the current molecule definitions (and hence the atom order) may be invalid.
+     * This should be called whenever force field parameters change.  It will cause the definitions
+     * and order to be revalidated the next to reorderAtoms() is called.
+     */
+    void invalidateMolecules();
+    /**
+     * Get whether the current molecule definitions are valid.
+     */
+    bool getMoleculesAreInvalid() {
+        return moleculesInvalid;
+    }
 private:
    struct Molecule;
    struct MoleculeGroup;
    class VirtualSiteInfo;
-//    void findMoleculeGroups();
-//    static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
-//    /**
-//     * Ensure that all molecules marked as "identical" really are identical.  This should be
-//     * called whenever force field parameters change.  If necessary, it will rebuild the list
-//     * of molecules and resort the atoms.
-//     */
-//    void validateMolecules();
+    void findMoleculeGroups();
+    static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
+    /**
+     * Ensure that all molecules marked as "identical" really are identical.  This should be
+     * called whenever force field parameters change.  If necessary, it will rebuild the list
+     * of molecules and resort the atoms.
+     */
+    void validateMolecules();
    static bool hasInitializedCuda;
    const System& system;
    double time;
@@ -424,8 +431,6 @@ private:
    int paddedNumAtoms;
    int numAtomBlocks;
    int numThreadBlocks;
-//    int numForceBuffers;
-//    int simdWidth;
    bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
    std::string compiler, tempDir, gpuArchitecture;
    float4 periodicBoxSize;
@@ -446,15 +451,15 @@ private:
    std::vector<Molecule> molecules;
    std::vector<MoleculeGroup> moleculeGroups;
    std::vector<int4> posCellOffsets;
+    void* pinnedBuffer;
    CudaArray* posq;
    CudaArray* velm;
-//    CudaArray<mm_float4>* force;
-//    CudaArray<mm_float4>* forceBuffers;
-//    CudaArray<cl_long>* longForceBuffer;
-//    CudaArray<cl_float>* energyBuffer;
-//    CudaArray<cl_int>* atomIndex;
-//    std::vector<cl::Memory*> autoclearBuffers;
-//    std::vector<int> autoclearBufferSizes;
+    CudaArray* force;
+    CudaArray* energyBuffer;
+    CudaArray* atomIndexDevice;
+    std::vector<int> atomIndex;
+    std::vector<CUdeviceptr> autoclearBuffers;
+    std::vector<int> autoclearBufferSizes;
    std::vector<ReorderListener*> reorderListeners;
 //    CudaIntegrationUtilities* integration;
 //    CudaBondedUtilities* bonded;

--- a/platforms/cuda2/src/CudaPlatform.cpp
+++ b/platforms/cuda2/src/CudaPlatform.cpp
@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
        device << contexts[i]->getDeviceIndex();
    }
    propertyValues[CudaPlatform::CudaDeviceIndex()] = device.str();
+    propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
    propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
    propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
    propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() {
 }

 void CudaPlatform::PlatformData::initializeContexts(const System& system) {
-//    for (int i = 0; i < (int) contexts.size(); i++)
-//        contexts[i]->initialize();
+    for (int i = 0; i < (int) contexts.size(); i++)
+        contexts[i]->initialize();
 }

 void CudaPlatform::PlatformData::syncContexts() {
-//    for (int i = 0; i < (int) contexts.size(); i++)
-//        contexts[i]->getWorkThread().flush();
+    for (int i = 0; i < (int) contexts.size(); i++)
+        contexts[i]->getWorkThread().flush();
 }
--- a/platforms/cuda2/src/CudaSort.cpp
+++ b/platforms/cuda2/src/CudaSort.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaSort.h"
+#include "CudaKernelSources.h"
+#include <map>
+
+using namespace OpenMM;
+using namespace std;
+
+CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
+        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
+    // Create kernels.
+
+    map<string, string> replacements;
+    replacements["DATA_TYPE"] = trait->getDataType();
+    replacements["KEY_TYPE"] =  trait->getKeyType();
+    replacements["SORT_KEY"] = trait->getSortKey();
+    replacements["MIN_KEY"] = trait->getMinKey();
+    replacements["MAX_KEY"] = trait->getMaxKey();
+    replacements["MAX_VALUE"] = trait->getMaxValue();
+    CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
+    computeRangeKernel = context.getKernel(module, "computeRange");
+    assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
+    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
+    copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
+    sortBucketsKernel = context.getKernel(module, "sortBuckets");
+
+    // Work out the work group sizes for various kernels.
+
+    int maxBlockSize;
+    cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
+    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
+        ;
+    positionsKernelSize = rangeKernelSize;
+    sortKernelSize = rangeKernelSize/2;
+    if (rangeKernelSize > length)
+        rangeKernelSize = length;
+    int maxSharedMem;
+    cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
+    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
+    if (sortKernelSize > maxLocalBuffer)
+        sortKernelSize = maxLocalBuffer;
+    unsigned int targetBucketSize = sortKernelSize/2;
+    unsigned int numBuckets = length/targetBucketSize;
+    if (numBuckets < 1)
+        numBuckets = 1;
+    if (positionsKernelSize > numBuckets)
+        positionsKernelSize = numBuckets;
+
+    // Create workspace arrays.
+
+    dataRange = new CudaArray(2, trait->getKeySize(), "sortDataRange");
+    bucketOffset = CudaArray::create<uint1>(numBuckets, "bucketOffset");
+    bucketOfElement = CudaArray::create<uint1>(length, "bucketOfElement");
+    offsetInBucket = CudaArray::create<uint1>(length, "offsetInBucket");
+    buckets = new CudaArray(length, trait->getDataSize(), "buckets");
+}
+
+CudaSort::~CudaSort() {
+    delete trait;
+    if (dataRange != NULL)
+        delete dataRange;
+    if (bucketOfElement != NULL)
+        delete bucketOfElement;
+    if (offsetInBucket != NULL)
+        delete offsetInBucket;
+    if (bucketOffset != NULL)
+        delete bucketOffset;
+    if (buckets != NULL)
+        delete buckets;
+}
+
+void CudaSort::sort(CudaArray& data) {
+    if (data.getSize() != bucketOfElement->getSize() || data.getElementSize() != trait->getDataSize())
+        throw OpenMMException("CudaSort called with different data size");
+    if (data.getSize() == 0)
+        return;
+
+    // Compute the range of data values.
+
+    unsigned int dataSize = data.getSize();
+    void* rangeArgs[] = {&data.getDevicePointer(), &dataSize, &dataRange->getDevicePointer()};
+    context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());
+
+    // Assign array elements to buckets.
+
+    unsigned int numBuckets = bucketOffset->getSize();
+    context.clearBuffer(*bucketOffset);
+    void* elementsArgs[] = {&data.getDevicePointer(), &dataSize, &numBuckets, &dataRange->getDevicePointer(),
+            &bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
+    context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());
+
+    // Compute the position of each bucket.
+
+    void* computeArgs[] = {&numBuckets, &bucketOffset->getDevicePointer()};
+    context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
+
+    // Copy the data into the buckets.
+
+    void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataSize, &bucketOffset->getDevicePointer(),
+            &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
+    context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
+
+    // Sort each bucket.
+
+    void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
+    context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
+}
--- a/platforms/cuda2/src/CudaSort.h
+++ b/platforms/cuda2/src/CudaSort.h
+#ifndef __OPENMM_CUDASORT_H__
+#define __OPENMM_CUDASORT_H__
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaArray.h"
+#include "openmm/internal/windowsExport.h"
+#include "CudaContext.h"
+
+namespace OpenMM {
+
+/**
+ * This class sorts arrays of values.  It supports any type of values, not just scalars,
+ * so long as an appropriate sorting key can be defined by which to sort them.
+ * 
+ * The sorting behavior is specified by a "trait" class that defines the type of data to
+ * sort and the key for sorting it.  Here is an example of a trait class for
+ * sorting floats:
+ * 
+ * class SortTrait : public CudaSort::SortTrait {
+ *     int getDataSize() const {return 4;}
+ *     int getKeySize() const {return 4;}
+ *     const char* getDataType() const {return "float";}
+ *     const char* getKeyType() const {return "float";}
+ *     const char* getMinKey() const {return "-MAXFLOAT";}
+ *     const char* getMaxKey() const {return "MAXFLOAT";}
+ *     const char* getMaxValue() const {return "MAXFLOAT";}
+ *     const char* getSortKey() const {return "value";}
+ * };
+ *
+ * The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
+ * (in local memory when possible, in global memory otherwise).  This is similar to
+ * the algorithm described in
+ *
+ * Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng.  "An Efficient
+ * Sorting Algorithm with CUDA"  Journal of the Chinese Institute of Engineers, 32(7),
+ * pp. 915-921 (2009)
+ *
+ * but with many modifications and simplifications.  In particular, this algorithm
+ * involves much less communication between host and device, which is critical to get
+ * good performance with the array sizes we typically work with (10,000 to 100,000
+ * elements).
+ */
+    
+class OPENMM_EXPORT CudaSort {
+public:
+    class SortTrait;
+    /**
+     * Create a CudaSort object for sorting data of a particular type.
+     *
+     * @param context    the context in which to perform calculations
+     * @param trait      a SortTrait defining the type of data to sort.  It should have been allocated
+     *                   on the heap with the "new" operator.  This object takes over ownership of it,
+     *                   and deletes it when the CudaSort is deleted.
+     * @param length     the length of the arrays this object will be used to sort
+     */
+    CudaSort(CudaContext& context, SortTrait* trait, unsigned int length);
+    ~CudaSort();
+    /**
+     * Sort an array.
+     */
+    void sort(CudaArray& data);
+private:
+    CudaContext& context;
+    SortTrait* trait;
+    CudaArray* dataRange;
+    CudaArray* bucketOfElement;
+    CudaArray* offsetInBucket;
+    CudaArray* bucketOffset;
+    CudaArray* buckets;
+    CUfunction computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
+    unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
+};
+
+/**
+ * A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
+ */
+class CudaSort::SortTrait {
+public:
+    /**
+     * Get the size of each data value in bytes.
+     */
+    virtual int getDataSize() const = 0;
+    /**
+     * Get the size of each key value in bytes.
+     */
+    virtual int getKeySize() const = 0;
+    /**
+     * Get the data type of the values to sort.
+     */
+    virtual const char* getDataType() const = 0;
+    /**
+     * Get the data type of the sorting key.
+     */
+    virtual const char* getKeyType() const = 0;
+    /**
+     * Get the minimum value a key can take.
+     */
+    virtual const char* getMinKey() const = 0;
+    /**
+     * Get the maximum value a key can take.
+     */
+    virtual const char* getMaxKey() const = 0;
+    /**
+     * Get a value whose key is guaranteed to equal getMaxKey().
+     */
+    virtual const char* getMaxValue() const = 0;
+    /**
+     * Get the CUDA code to select the key from the data value.
+     */
+    virtual const char* getSortKey() const = 0;
+};
+
+
+} // namespace OpenMM
+
+#endif // __OPENMM_CUDASORT_H__
--- a/platforms/cuda2/src/kernels/sort.cu
+++ b/platforms/cuda2/src/kernels/sort.cu
+__device__ KEY_TYPE getValue(DATA_TYPE value) {
+    return SORT_KEY;
+}
+
+extern "C" {
+    
+/**
+ * Calculate the minimum and maximum value in the array to be sorted.  This kernel
+ * is executed as a single work group.
+ */
+__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range) {
+    extern __shared__ KEY_TYPE rangeBuffer[];
+    KEY_TYPE minimum = MAX_KEY;
+    KEY_TYPE maximum = MIN_KEY;
+
+    // Each thread calculates the range of a subset of values.
+
+    for (unsigned int index = threadIdx.x; index < length; index += blockDim.x) {
+        KEY_TYPE value = getValue(data[index]);
+        minimum = min(minimum, value);
+        maximum = max(maximum, value);
+    }
+
+    // Now reduce them.
+
+    rangeBuffer[threadIdx.x] = minimum;
+    __syncthreads();
+    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+        if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
+            rangeBuffer[threadIdx.x] = min(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
+        __syncthreads();
+    }
+    minimum = rangeBuffer[0];
+    rangeBuffer[threadIdx.x] = maximum;
+    __syncthreads();
+    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+        if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
+            rangeBuffer[threadIdx.x] = max(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
+        __syncthreads();
+    }
+    maximum = rangeBuffer[0];
+    if (threadIdx.x == 0) {
+        range[0] = minimum;
+        range[1] = maximum;
+    }
+}
+
+/**
+ * Assign elements to buckets.
+ */
+__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
+        unsigned int* bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
+    float minValue = (float) (range[0]);
+    float maxValue = (float) (range[1]);
+    float bucketWidth = (maxValue-minValue)/numBuckets;
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
+        float key = (float) getValue(data[index]);
+        unsigned int bucketIndex = min((unsigned int) ((key-minValue)/bucketWidth), numBuckets-1);
+        offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
+        bucketOfElement[index] = bucketIndex;
+    }
+}
+
+/**
+ * Sum the bucket sizes to compute the start position of each bucket.  This kernel
+ * is executed as a single work group.
+ */
+__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
+    extern __shared__ unsigned int posBuffer[];
+    unsigned int globalOffset = 0;
+    for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
+        // Load the bucket sizes into local memory.
+
+        unsigned int globalIndex = startBucket+threadIdx.x;
+        posBuffer[threadIdx.x] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
+        __syncthreads();
+
+        // Perform a parallel prefix sum.
+
+        for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+            unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
+            __syncthreads();
+            posBuffer[threadIdx.x] += add;
+            __syncthreads();
+        }
+
+        // Write the results back to global memory.
+
+        if (globalIndex < numBuckets)
+            bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;
+        globalOffset += posBuffer[blockDim.x-1];
+    }
+}
+
+/**
+ * Copy the input data into the buckets for sorting.
+ */
+__global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* __restrict__ buckets, unsigned int length, const unsigned int* __restrict__ bucketOffset, const unsigned int* __restrict__ bucketOfElement, const unsigned int* __restrict__ offsetInBucket) {
+    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
+        DATA_TYPE element = data[index];
+        unsigned int bucketIndex = bucketOfElement[index];
+        unsigned int offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]);
+        buckets[offset+offsetInBucket[index]] = element;
+    }
+}
+
+/**
+ * Sort the data in each bucket.
+ */
+__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) {
+    extern __shared__ DATA_TYPE dataBuffer[];
+    for (unsigned int index = blockIdx.x; index < numBuckets; index += gridDim.x) {
+        unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
+        unsigned int endIndex = bucketOffset[index];
+        unsigned int length = endIndex-startIndex;
+        if (length <= blockDim.x) {
+            // Load the data into local memory.
+
+            if (threadIdx.x < length)
+                dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
+            else
+                dataBuffer[threadIdx.x] = MAX_VALUE;
+            __syncthreads();
+
+            // Perform a bitonic sort in local memory.
+
+            for (unsigned int k = 2; k <= blockDim.x; k *= 2) {
+                for (unsigned int j = k/2; j > 0; j /= 2) {
+                    int ixj = threadIdx.x^j;
+                    if (ixj > threadIdx.x) {
+                        DATA_TYPE value1 = dataBuffer[threadIdx.x];
+                        DATA_TYPE value2 = dataBuffer[ixj];
+                        bool ascending = (threadIdx.x&k) == 0;
+                        KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
+                        KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                        if (lowKey > highKey) {
+                            dataBuffer[threadIdx.x] = value2;
+                            dataBuffer[ixj] = value1;
+                        }
+                    }
+                    __syncthreads();
+                }
+            }
+
+            // Write the data to the sorted array.
+
+            if (threadIdx.x < length)
+                data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
+        }
+        else {
+            // Copy the bucket data over to the output array.
+
+            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
+                data[startIndex+i] = buckets[startIndex+i];
+            __threadfence_block();
+            __syncthreads();
+
+            // Perform a bitonic sort in global memory.
+
+            for (unsigned int k = 2; k < 2*length; k *= 2) {
+                for (unsigned int j = k/2; j > 0; j /= 2) {
+                    for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
+                        int ixj = i^j;
+                        if (ixj > i && ixj < length) {
+                            DATA_TYPE value1 = data[startIndex+i];
+                            DATA_TYPE value2 = data[startIndex+ixj];
+                            bool ascending = ((i&k) == 0);
+                            for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
+                                ascending = ((i&mask) == 0 ? !ascending : ascending);
+                            KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
+                            KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                            if (lowKey > highKey) {
+                                data[startIndex+i] = value2;
+                                data[startIndex+ixj] = value1;
+                            }
+                        }
+                    }
+                    __threadfence_block();
+                    __syncthreads();
+                }
+            }
+        }
+    }
+}
+
+}
\ No newline at end of file
--- a/platforms/cuda2/src/kernels/utilities.cu
+++ b/platforms/cuda2/src/kernels/utilities.cu
+extern "C" {
+
 /**
 * This is called by the various functions below to clear a buffer.
 */
@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest
        buffer[index] = sum;
    }
 }
+
+}
\ No newline at end of file
--- a/platforms/cuda2/tests/TestCudaSort.cpp
+++ b/platforms/cuda2/tests/TestCudaSort.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of sorting.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "../src/CudaArray.h"
+#include "../src/CudaContext.h"
+#include "../src/CudaSort.h"
+#include "sfmt/SFMT.h"
+#include "openmm/System.h"
+#include <iostream>
+#include <cmath>
+#include <set>
+
+using namespace OpenMM;
+using namespace std;
+
+class SortTrait : public CudaSort::SortTrait {
+    int getDataSize() const {return 4;}
+    int getKeySize() const {return 4;}
+    const char* getDataType() const {return "float";}
+    const char* getKeyType() const {return "float";}
+    const char* getMinKey() const {return "-MAXFLOAT";}
+    const char* getMaxKey() const {return "MAXFLOAT";}
+    const char* getMaxValue() const {return "MAXFLOAT";}
+    const char* getSortKey() const {return "value";}
+};
+
+void verifySorting(vector<float> array) {
+    // Sort the array.
+
+    System system;
+    system.addParticle(0.0);
+    CudaPlatform platform;
+    CudaPlatform::PlatformData platformData(system, "", "true", "single",
+            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
+    CudaContext& context = *platformData.contexts[0];
+    context.initialize();
+    CudaArray data(array.size(), 4, "sortData");
+    data.upload(array);
+    CudaSort sort(context, new SortTrait(), array.size());
+    sort.sort(data);
+    vector<float> sorted;
+    data.download(sorted);
+
+    // Verify that it is in sorted order.
+
+    for (int i = 1; i < (int) sorted.size(); i++)
+        ASSERT(sorted[i-1] <= sorted[i]);
+
+    // Make sure the sorted array contains the same values as the original one.
+
+    multiset<float> elements1(array.begin(), array.end());
+    multiset<float> elements2(sorted.begin(), sorted.end());
+    ASSERT(elements1 == elements2);
+}
+
+void testUniformValues()
+{
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    vector<float> array(10000);
+    for (int i = 0; i < (int) array.size(); i++)
+        array[i] = (float) genrand_real2(sfmt);
+    verifySorting(array);
+}
+
+void testLogValues()
+{
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    vector<float> array(10000);
+    for (int i = 0; i < (int) array.size(); i++)
+        array[i] = (float) log(genrand_real2(sfmt));
+    verifySorting(array);
+}
+
+int main() {
+    try {
+        testUniformValues();
+        testLogValues();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        return 1;
+    }
+    cout << "Done" << endl;
+    return 0;
+}