Continuing to implement new CUDA platform

3e16cab9 · Peter Eastman · abb8cb4b · 3e16cab9 · 3e16cab9 · 3e16cab9
Commit 3e16cab9 authored Jun 05, 2012 by Peter Eastman
9 changed files
--- a/platforms/cuda2/src/CudaArray.h
+++ b/platforms/cuda2/src/CudaArray.h
@@ -83,7 +83,7 @@ public:
    /**
     * Get a pointer to the device memory.
     */
-    CUdeviceptr getDevicePointer() {
+    CUdeviceptr& getDevicePointer() { // hands back `pointer` by reference rather than by value
        return pointer;
    }
    /**

--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -31,7 +31,6 @@
 #include "CudaContext.h"
 #include "CudaArray.h"
 //#include "CudaBondedUtilities.h"
-#include "CudaExpressionUtilities.h"
 #include "CudaForceInfo.h"
 //#include "CudaIntegrationUtilities.h"
 #include "CudaKernelSources.h"
@@ -53,7 +52,7 @@
 #define CHECK_RESULT2(result, prefix) \
    if (result != CUDA_SUCCESS) { \
        std::stringstream m; \
-        m<<prefix<<": "<<result<<" ("<<__FILE__<<": "<<__LINE__<<")"; \
+        m<<prefix<<": "<<getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
        throw OpenMMException(m.str());\
    }

@@ -66,7 +65,7 @@ bool CudaContext::hasInitializedCuda = false;

 CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
        const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
-        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), posq(NULL),
+        time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
        velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
        bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
    if (!hasInitializedCuda) {
@@ -88,7 +87,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    else
        throw OpenMMException("Illegal value for CudaPrecision: "+precision);
 #ifdef WIN32
-    this->tempDir = tempDir+"\";
+    this->tempDir = tempDir+"\\";
 #else
    this->tempDir = tempDir+"/";
 #endif
@@ -114,6 +113,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
                deviceIndex = i;
                bestSpeed = speed;
                bestCompute = major;
+                gpuArchitecture = intToString(major)+intToString(minor);
            }
        }
    }
@@ -121,37 +121,47 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
        throw OpenMMException("No compatible CUDA device is available");
    CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
    this->deviceIndex = deviceIndex;
-    int major, minor;
-    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
-    gpuArchitecture = CudaExpressionUtilities::intToString(major)+CudaExpressionUtilities::intToString(minor);
-    compilationDefines["WORK_GROUP_SIZE"] = CudaExpressionUtilities::intToString(ThreadBlockSize);
+    compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
    defaultOptimizationOptions = "--use_fast_math";
-    int numThreadBlocksPerComputeUnit = 6;
-    CHECK_RESULT(cuCtxCreate(&context, 0, device));
+    unsigned int flags = CU_CTX_MAP_HOST;
+    if (useBlockingSync)
+        flags += CU_CTX_SCHED_BLOCKING_SYNC;
+    else
+        flags += CU_CTX_SCHED_SPIN;
+    CHECK_RESULT(cuCtxCreate(&context, flags, device));
    contextIsValid = true;
    numAtoms = system.getNumParticles();
    paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+    int numThreadBlocksPerComputeUnit = 6;
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
 //    bonded = new CudaBondedUtilities(*this);
 //    nonbonded = new CudaNonbondedUtilities(*this);
-    posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
-    velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
+    if (useDoublePrecision) {
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(double4), 0));
+        posq = CudaArray::create<double4>(paddedNumAtoms, "posq");
+        velm = CudaArray::create<double4>(paddedNumAtoms, "velm");
+    }
+    else {
+        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(float4), 0));
+        posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
+        velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
+    }
    posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));

    // Create utility kernels that are used in multiple places.

    CUmodule utilities = createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
-    cuModuleGetFunction(&clearBufferKernel, utilities, "clearBuffer");
-    cuModuleGetFunction(&clearTwoBuffersKernel, utilities, "clearTwoBuffers");
-    cuModuleGetFunction(&clearThreeBuffersKernel, utilities, "clearThreeBuffers");
-    cuModuleGetFunction(&clearFourBuffersKernel, utilities, "clearFourBuffers");
-    cuModuleGetFunction(&clearFiveBuffersKernel, utilities, "clearFiveBuffers");
-    cuModuleGetFunction(&clearSixBuffersKernel, utilities, "clearSixBuffers");
-    cuModuleGetFunction(&reduceFloat4Kernel, utilities, "reduceFloat4Buffer");
-    cuModuleGetFunction(&reduceForcesKernel, utilities, "reduceForces");
+    clearBufferKernel = getKernel(utilities, "clearBuffer");
+    clearTwoBuffersKernel = getKernel(utilities, "clearTwoBuffers");
+    clearThreeBuffersKernel = getKernel(utilities, "clearThreeBuffers");
+    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
+    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
+    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
+    reduceFloat4Kernel = getKernel(utilities, "reduceFloat4Buffer");
+    reduceForcesKernel = getKernel(utilities, "reduceForces");

    // Set defines based on the requested precision.

@@ -175,6 +185,8 @@ CudaContext::~CudaContext() {
        delete forces[i];
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        delete reorderListeners[i];
+    if (pinnedBuffer != NULL)
+        cuMemFreeHost(pinnedBuffer);
    if (posq != NULL)
        delete posq;
    if (velm != NULL)
@@ -202,38 +214,29 @@ CudaContext::~CudaContext() {
        CHECK_RESULT(cuCtxDestroy(context));
 }

-//void CudaContext::initialize() {
-//    for (int i = 0; i < numAtoms; i++) {
-//        double mass = system.getParticleMass(i);
-//        (*velm)[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
-//    }
-//    velm->upload();
+void CudaContext::initialize() { // Uploads inverse masses into velm, then creates the force, energy and atom index arrays.
+    for (int i = 0; i < numAtoms; i++) {
+        double mass = system.getParticleMass(i);
+        if (useDoublePrecision)
+            ((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass); // w = 1/mass, or 0 when the mass is 0
+        else
+            ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass)); // same layout in single precision
+    }
+    velm->upload(pinnedBuffer); // NOTE(review): only the first numAtoms entries were written above; confirm upload() does not copy unwritten padding entries
 //    bonded->initialize(system);
-//    numForceBuffers = platformData.contexts.size();
-//    numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
-//    for (int i = 0; i < (int) forces.size(); i++)
-//        numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
-//    forceBuffers = new CudaArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);
-//    if (supports64BitGlobalAtomics) {
-//        longForceBuffer = new CudaArray<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer", false);
-//        reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
-//        reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
-//        reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
-//        reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
-//        addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
-//    }
-//    addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
-//    force = new CudaArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true);
-//    energyBuffer = new CudaArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer", true);
-//    addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
-//    atomIndex = new CudaArray<cl_int>(*this, paddedNumAtoms, "atomIndex", true);
-//    for (int i = 0; i < paddedNumAtoms; ++i)
-//        (*atomIndex)[i] = i;
-//    atomIndex->upload();
-//    findMoleculeGroups();
-//    moleculesInvalid = false;
+    force = CudaArray::create<long3>(paddedNumAtoms, "force"); // one long3 per padded atom
+    addAutoclearBuffer(force->getDevicePointer(), force->getSize()*6); // NOTE(review): *6 presumably counts 4-byte words and assumes sizeof(long3) == 24; long is 4 bytes on 64-bit Windows - confirm
+    energyBuffer = CudaArray::create<float>(numThreadBlocks*ThreadBlockSize, "energyBuffer"); // one float per launched thread
+    addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize());
+    atomIndexDevice = CudaArray::create<int>(paddedNumAtoms, "atomIndex");
+    atomIndex.resize(paddedNumAtoms);
+    for (int i = 0; i < paddedNumAtoms; ++i)
+        atomIndex[i] = i; // start from the identity ordering
+    atomIndexDevice->upload(atomIndex);
+    findMoleculeGroups();
+    moleculesInvalid = false;
 //    nonbonded->initialize(system);
-//}
+}

 void CudaContext::addForce(CudaForceInfo* force) {
    forces.push_back(force);
@@ -315,7 +318,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        CUresult result = cuModuleLoad(&module, outputFile.c_str());
        if (result != CUDA_SUCCESS) {
            std::stringstream m;
-            m<<"Error loading CUDA module: "<<result;
+            m<<"Error loading CUDA module: "<<getErrorString(result)<<" ("<<result<<")";
            throw OpenMMException(m.str());
        }
        remove(inputFile.c_str());
@@ -329,52 +332,109 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
        remove(logFile.c_str());
        throw;
    }
-//    
-//    // Get length before using c_str() to avoid length() call invalidating the c_str() value.
-//    string src_string = src.str();
-//    ::size_t src_length = src_string.length();
-//    cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
-//    cl::Program program(context, sources);
-//    try {
-//        program.build(vector<cl::Device>(1, device), options.c_str());
-//    } catch (cl::Error err) {
-//        throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
-//    }
 }
-//
-//void CudaContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
-//    if (blockSize == -1)
-//        blockSize = ThreadBlockSize;
-//    int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
-//    try {
-//        queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
-//    }
-//    catch (cl::Error err) {
-//        stringstream str;
-//        str<<"Error invoking kernel "<<kernel.getInfo<CL_KERNEL_FUNCTION_NAME>()<<": "<<err.what()<<" ("<<err.err()<<")";
-//        throw OpenMMException(str.str());
-//    }
-//}
-//
-//void CudaContext::clearBuffer(CudaArray<float>& array) {
-//    clearBuffer(array.getDeviceBuffer(), array.getSize());
-//}
-//
-//void CudaContext::clearBuffer(CudaArray<mm_float4>& array) {
-//    clearBuffer(array.getDeviceBuffer(), array.getSize()*4);
-//}
-//
-//void CudaContext::clearBuffer(cl::Memory& memory, int size) {
-//    clearBufferKernel.setArg<cl::Memory>(0, memory);
-//    clearBufferKernel.setArg<cl_int>(1, size);
-//    executeKernel(clearBufferKernel, size, 128);
-//}
-//
-//void CudaContext::addAutoclearBuffer(cl::Memory& memory, int size) {
-//    autoclearBuffers.push_back(&memory);
-//    autoclearBufferSizes.push_back(size);
-//}
-//
+
+CUfunction CudaContext::getKernel(CUmodule& module, const string& name) { // Looks up the function called `name` in `module`.
+    CUfunction function;
+    CUresult result = cuModuleGetFunction(&function, module, name.c_str());
+    if (result != CUDA_SUCCESS) { // report the failure as an OpenMMException instead of returning an unset handle
+        std::stringstream m;
+        m<<"Error creating kernel "<<name<<": "<<getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(m.str());
+    }
+    return function;
+}
+
+string CudaContext::doubleToString(double value) { // Formats value in scientific notation, with an "f" suffix when not in double precision mode.
+    stringstream s;
+    s.precision(useDoublePrecision ? 16 : 8); // digits after the decimal point under std::scientific
+    s << scientific << value;
+    if (!useDoublePrecision)
+        s << "f"; // suffix the text so it reads as a float literal
+    return s.str();
+}
+
+string CudaContext::intToString(int value) { // Returns the text produced by streaming value into a stringstream.
+    stringstream s;
+    s << value;
+    return s.str();
+}
+
+std::string CudaContext::getErrorString(CUresult result) { // Maps a CUresult value to the spelling of its enumerator.
+    switch (result) {
+        case CUDA_SUCCESS: return "CUDA_SUCCESS";
+        case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE";
+        case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY";
+        case CUDA_ERROR_NOT_INITIALIZED: return "CUDA_ERROR_NOT_INITIALIZED";
+        case CUDA_ERROR_DEINITIALIZED: return "CUDA_ERROR_DEINITIALIZED";
+        case CUDA_ERROR_PROFILER_DISABLED: return "CUDA_ERROR_PROFILER_DISABLED";
+        case CUDA_ERROR_PROFILER_NOT_INITIALIZED: return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
+        case CUDA_ERROR_PROFILER_ALREADY_STARTED: return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
+        case CUDA_ERROR_PROFILER_ALREADY_STOPPED: return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
+        case CUDA_ERROR_NO_DEVICE: return "CUDA_ERROR_NO_DEVICE";
+        case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE";
+        case CUDA_ERROR_INVALID_IMAGE: return "CUDA_ERROR_INVALID_IMAGE";
+        case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT";
+        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
+        case CUDA_ERROR_MAP_FAILED: return "CUDA_ERROR_MAP_FAILED";
+        case CUDA_ERROR_UNMAP_FAILED: return "CUDA_ERROR_UNMAP_FAILED";
+        case CUDA_ERROR_ARRAY_IS_MAPPED: return "CUDA_ERROR_ARRAY_IS_MAPPED";
+        case CUDA_ERROR_ALREADY_MAPPED: return "CUDA_ERROR_ALREADY_MAPPED";
+        case CUDA_ERROR_NO_BINARY_FOR_GPU: return "CUDA_ERROR_NO_BINARY_FOR_GPU";
+        case CUDA_ERROR_ALREADY_ACQUIRED: return "CUDA_ERROR_ALREADY_ACQUIRED";
+        case CUDA_ERROR_NOT_MAPPED: return "CUDA_ERROR_NOT_MAPPED";
+        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
+        case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
+        case CUDA_ERROR_ECC_UNCORRECTABLE: return "CUDA_ERROR_ECC_UNCORRECTABLE";
+        case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUDA_ERROR_UNSUPPORTED_LIMIT";
+        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
+        case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE";
+        case CUDA_ERROR_FILE_NOT_FOUND: return "CUDA_ERROR_FILE_NOT_FOUND";
+        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
+        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
+        case CUDA_ERROR_OPERATING_SYSTEM: return "CUDA_ERROR_OPERATING_SYSTEM";
+        case CUDA_ERROR_INVALID_HANDLE: return "CUDA_ERROR_INVALID_HANDLE";
+        case CUDA_ERROR_NOT_FOUND: return "CUDA_ERROR_NOT_FOUND";
+        case CUDA_ERROR_NOT_READY: return "CUDA_ERROR_NOT_READY";
+        case CUDA_ERROR_LAUNCH_FAILED: return "CUDA_ERROR_LAUNCH_FAILED";
+        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
+        case CUDA_ERROR_LAUNCH_TIMEOUT: return "CUDA_ERROR_LAUNCH_TIMEOUT";
+        case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
+        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
+        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
+        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
+        case CUDA_ERROR_CONTEXT_IS_DESTROYED: return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
+        case CUDA_ERROR_UNKNOWN: return "CUDA_ERROR_UNKNOWN";
+    }
+    return "Invalid error code"; // reached for any value without a case above
+}
+
+void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) { // Launches kernel on a 1-D grid; throws OpenMMException if the launch call fails.
+    if (blockSize == -1)
+        blockSize = ThreadBlockSize; // -1 selects the default block size
+    int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks); // blocks needed to cover `threads`, capped at numThreadBlocks; presumably kernels loop over any remainder - confirm
+    CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, 0, arguments, NULL); // stream argument 0, no `extra` options
+    if (result != CUDA_SUCCESS) {
+        stringstream str;
+        str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
+        throw OpenMMException(str.str());
+    }
+}
+
+void CudaContext::clearBuffer(CudaArray& array) { // Clears a whole array via the (pointer, size) overload.
+    clearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize()/4); // size = element count * element size / 4, i.e. expressed in 4-byte units
+}
+
+void CudaContext::clearBuffer(CUdeviceptr memory, int size) { // Runs clearBufferKernel over `size` entries starting at `memory`.
+    void* args[] = {&memory, &size}; // addresses of the two kernel parameters, in order
+    executeKernel(clearBufferKernel, args, size, 128); // requests `size` threads with a block size of 128
+}
+
+void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) { // Records a buffer and its size in the autoclear lists.
+    autoclearBuffers.push_back(memory); // entry i here pairs with entry i of autoclearBufferSizes
+    autoclearBufferSizes.push_back(size);
+}
+
 //void CudaContext::clearAutoclearBuffers() {
 //    int base = 0;
 //    int total = autoclearBufferSizes.size();
@@ -454,219 +514,217 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
 //    executeKernel(reduceFloat4Kernel, bufferSize, 128);
 //}
 //
-//void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
-//    // Recursively tag atoms as belonging to a particular molecule.
-//
-//    atomMolecule[atom] = molecule;
-//    for (int i = 0; i < (int) atomBonds[atom].size(); i++)
-//        if (atomMolecule[atomBonds[atom][i]] == -1)
-//            tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
-//}
-//
-///**
-// * This class ensures that atom reordering doesn't break virtual sites.
-// */
-//class CudaContext::VirtualSiteInfo : public CudaForceInfo {
-//public:
-//    VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
-//        for (int i = 0; i < system.getNumParticles(); i++) {
-//            if (system.isVirtualSite(i)) {
-//                siteTypes.push_back(&typeid(system.getVirtualSite(i)));
-//                vector<int> particles;
-//                particles.push_back(i);
-//                for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
-//                    particles.push_back(system.getVirtualSite(i).getParticle(j));
-//                siteParticles.push_back(particles);
-//                vector<double> weights;
-//                if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
-//                    // A two particle average.
-//
-//                    const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
-//                    weights.push_back(site.getWeight(0));
-//                    weights.push_back(site.getWeight(1));
-//                }
-//                else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
-//                    // A three particle average.
-//
-//                    const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
-//                    weights.push_back(site.getWeight(0));
-//                    weights.push_back(site.getWeight(1));
-//                    weights.push_back(site.getWeight(2));
-//                }
-//                else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
-//                    // An out of plane site.
-//
-//                    const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
-//                    weights.push_back(site.getWeight12());
-//                    weights.push_back(site.getWeight13());
-//                    weights.push_back(site.getWeightCross());
-//                }
-//                siteWeights.push_back(weights);
-//            }
-//        }
-//    }
-//    int getNumParticleGroups() {
-//        return siteTypes.size();
-//    }
-//    void getParticlesInGroup(int index, std::vector<int>& particles) {
-//        particles = siteParticles[index];
-//    }
-//    bool areGroupsIdentical(int group1, int group2) {
-//        if (siteTypes[group1] != siteTypes[group2])
-//            return false;
-//        int numParticles = siteWeights[group1].size();
-//        if (siteWeights[group2].size() != numParticles)
-//            return false;
-//        for (int i = 0; i < numParticles; i++)
-//            if (siteWeights[group1][i] != siteWeights[group2][i])
-//                return false;
-//        return true;
-//    }
-//private:
-//    vector<const type_info*> siteTypes;
-//    vector<vector<int> > siteParticles;
-//    vector<vector<double> > siteWeights;
-//};
-//
-//
-//void CudaContext::findMoleculeGroups() {
-//    // The first time this is called, we need to identify all the molecules in the system.
-//    
-//    if (moleculeGroups.size() == 0) {
-//        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
-//
-//        addForce(new VirtualSiteInfo(system));
-//
-//        // First make a list of every other atom to which each atom is connect by a constraint or force group.
-//
-//        vector<vector<int> > atomBonds(system.getNumParticles());
-//        for (int i = 0; i < system.getNumConstraints(); i++) {
-//            int particle1, particle2;
-//            double distance;
-//            system.getConstraintParameters(i, particle1, particle2, distance);
-//            atomBonds[particle1].push_back(particle2);
-//            atomBonds[particle2].push_back(particle1);
-//        }
-//        for (int i = 0; i < (int) forces.size(); i++) {
-//            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
-//                vector<int> particles;
-//                forces[i]->getParticlesInGroup(j, particles);
-//                for (int k = 0; k < (int) particles.size(); k++)
-//                    for (int m = 0; m < (int) particles.size(); m++)
-//                        if (k != m)
-//                            atomBonds[particles[k]].push_back(particles[m]);
-//            }
-//        }
-//
-//        // Now tag atoms by which molecule they belong to.
-//
-//        vector<int> atomMolecule(numAtoms, -1);
-//        int numMolecules = 0;
-//        for (int i = 0; i < numAtoms; i++)
-//            if (atomMolecule[i] == -1)
-//                tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
-//        vector<vector<int> > atomIndices(numMolecules);
-//        for (int i = 0; i < numAtoms; i++)
-//            atomIndices[atomMolecule[i]].push_back(i);
-//
-//        // Construct a description of each molecule.
-//
-//        molecules.resize(numMolecules);
-//        for (int i = 0; i < numMolecules; i++) {
-//            molecules[i].atoms = atomIndices[i];
-//            molecules[i].groups.resize(forces.size());
-//        }
-//        for (int i = 0; i < system.getNumConstraints(); i++) {
-//            int particle1, particle2;
-//            double distance;
-//            system.getConstraintParameters(i, particle1, particle2, distance);
-//            molecules[atomMolecule[particle1]].constraints.push_back(i);
-//        }
-//        for (int i = 0; i < (int) forces.size(); i++)
-//            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
-//                vector<int> particles;
-//                forces[i]->getParticlesInGroup(j, particles);
-//                molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
-//            }
-//    }
-//
-//    // Sort them into groups of identical molecules.
-//
-//    vector<Molecule> uniqueMolecules;
-//    vector<vector<int> > moleculeInstances;
-//    vector<vector<int> > moleculeOffsets;
-//    for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
-//        Molecule& mol = molecules[molIndex];
-//
-//        // See if it is identical to another molecule.
-//
-//        bool isNew = true;
-//        for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
-//            Molecule& mol2 = uniqueMolecules[j];
-//            bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
-//
-//            // See if the atoms are identical.
-//
-//            int atomOffset = mol2.atoms[0]-mol.atoms[0];
-//            for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
-//                if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
-//                    identical = false;
-//                for (int k = 0; k < (int) forces.size(); k++)
-//                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
-//                        identical = false;
-//            }
-//            
-//            // See if the constraints are identical.
-//
-//            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
-//                int c1particle1, c1particle2, c2particle1, c2particle2;
-//                double distance1, distance2;
-//                system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
-//                system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
-//                if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
-//                    identical = false;
-//            }
-//
-//            // See if the force groups are identical.
-//
-//            for (int i = 0; i < (int) forces.size() && identical; i++) {
-//                if (mol.groups[i].size() != mol2.groups[i].size())
-//                    identical = false;
-//                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
-//                    if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
-//                        identical = false;
-//            }
-//            if (identical) {
-//                moleculeInstances[j].push_back(molIndex);
-//                moleculeOffsets[j].push_back(mol.atoms[0]);
-//                isNew = false;
-//            }
-//        }
-//        if (isNew) {
-//            uniqueMolecules.push_back(mol);
-//            moleculeInstances.push_back(vector<int>());
-//            moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
-//            moleculeOffsets.push_back(vector<int>());
-//            moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
-//        }
-//    }
-//    moleculeGroups.resize(moleculeInstances.size());
-//    for (int i = 0; i < (int) moleculeInstances.size(); i++)
-//    {
-//        moleculeGroups[i].instances = moleculeInstances[i];
-//        moleculeGroups[i].offsets = moleculeOffsets[i];
-//        vector<int>& atoms = uniqueMolecules[i].atoms;
-//        moleculeGroups[i].atoms.resize(atoms.size());
-//        for (int j = 0; j < (int) atoms.size(); j++)
-//            moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
-//    }
-//}
-//
-//void CudaContext::invalidateMolecules() {
-//    moleculesInvalid = true;
-//}
-//
-//
+void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
+    // Recursively tag atoms as belonging to a particular molecule: label this atom, then visit every unlabelled neighbour listed in atomBonds.
+
+    atomMolecule[atom] = molecule;
+    for (int i = 0; i < (int) atomBonds[atom].size(); i++)
+        if (atomMolecule[atomBonds[atom][i]] == -1) // -1 means the neighbour has not been assigned yet
+            tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds); // NOTE(review): recursion depth can grow with molecule size; a very large molecule could exhaust the stack
+}
+
+/**
+ * This class ensures that atom reordering doesn't break virtual sites, by reporting each site plus the particles it depends on as one particle group.
+ */
+class CudaContext::VirtualSiteInfo : public CudaForceInfo {
+public:
+    VirtualSiteInfo(const System& system) : CudaForceInfo(0) { // collects type, particles and weights for every virtual site in system
+        for (int i = 0; i < system.getNumParticles(); i++) {
+            if (system.isVirtualSite(i)) {
+                siteTypes.push_back(&typeid(system.getVirtualSite(i))); // remember the site's dynamic type for areGroupsIdentical()
+                vector<int> particles;
+                particles.push_back(i); // the site itself comes first, followed by the particles it depends on
+                for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
+                    particles.push_back(system.getVirtualSite(i).getParticle(j));
+                siteParticles.push_back(particles);
+                vector<double> weights; // stays empty for any site type not handled below
+                if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+                    // A two particle average.
+
+                    const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
+                    weights.push_back(site.getWeight(0));
+                    weights.push_back(site.getWeight(1));
+                }
+                else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
+                    // A three particle average.
+
+                    const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
+                    weights.push_back(site.getWeight(0));
+                    weights.push_back(site.getWeight(1));
+                    weights.push_back(site.getWeight(2));
+                }
+                else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
+                    // An out of plane site.
+
+                    const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
+                    weights.push_back(site.getWeight12());
+                    weights.push_back(site.getWeight13());
+                    weights.push_back(site.getWeightCross());
+                }
+                siteWeights.push_back(weights);
+            }
+        }
+    }
+    int getNumParticleGroups() { // one group per virtual site
+        return siteTypes.size();
+    }
+    void getParticlesInGroup(int index, std::vector<int>& particles) { // copies the stored particle list for site `index` into particles
+        particles = siteParticles[index];
+    }
+    bool areGroupsIdentical(int group1, int group2) { // true when both sites have the same type and exactly equal weights
+        if (siteTypes[group1] != siteTypes[group2]) // NOTE(review): compares type_info addresses; the standard only guarantees equality through type_info::operator==
+            return false;
+        int numParticles = siteWeights[group1].size();
+        if (siteWeights[group2].size() != numParticles) // NOTE(review): compares size_t against int (signed/unsigned comparison)
+            return false;
+        for (int i = 0; i < numParticles; i++)
+            if (siteWeights[group1][i] != siteWeights[group2][i]) // exact floating point comparison, no tolerance
+                return false;
+        return true;
+    }
+private:
+    vector<const type_info*> siteTypes; // entry k of each vector describes the k-th virtual site found
+    vector<vector<int> > siteParticles;
+    vector<vector<double> > siteWeights;
+};
+
+void CudaContext::findMoleculeGroups() {
+    // The first time this is called, we need to identify all the molecules in the system.
+    
+    if (moleculeGroups.size() == 0) {
+        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
+
+        addForce(new VirtualSiteInfo(system));
+
+        // First make a list of every other atom to which each atom is connected by a constraint or force group.
+
+        vector<vector<int> > atomBonds(system.getNumParticles());
+        for (int i = 0; i < system.getNumConstraints(); i++) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(i, particle1, particle2, distance);
+            atomBonds[particle1].push_back(particle2);
+            atomBonds[particle2].push_back(particle1);
+        }
+        for (int i = 0; i < (int) forces.size(); i++) {
+            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
+                vector<int> particles;
+                forces[i]->getParticlesInGroup(j, particles);
+                for (int k = 0; k < (int) particles.size(); k++)
+                    for (int m = 0; m < (int) particles.size(); m++)
+                        if (k != m)
+                            atomBonds[particles[k]].push_back(particles[m]);
+            }
+        }
+
+        // Now tag atoms by which molecule they belong to.
+
+        vector<int> atomMolecule(numAtoms, -1);
+        int numMolecules = 0;
+        for (int i = 0; i < numAtoms; i++)
+            if (atomMolecule[i] == -1)
+                tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
+        vector<vector<int> > atomIndices(numMolecules);
+        for (int i = 0; i < numAtoms; i++)
+            atomIndices[atomMolecule[i]].push_back(i);
+
+        // Construct a description of each molecule.
+
+        molecules.resize(numMolecules);
+        for (int i = 0; i < numMolecules; i++) {
+            molecules[i].atoms = atomIndices[i];
+            molecules[i].groups.resize(forces.size());
+        }
+        for (int i = 0; i < system.getNumConstraints(); i++) {
+            int particle1, particle2;
+            double distance;
+            system.getConstraintParameters(i, particle1, particle2, distance);
+            molecules[atomMolecule[particle1]].constraints.push_back(i);
+        }
+        for (int i = 0; i < (int) forces.size(); i++)
+            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
+                vector<int> particles;
+                forces[i]->getParticlesInGroup(j, particles);
+                molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
+            }
+    }
+
+    // Sort them into groups of identical molecules.
+
+    vector<Molecule> uniqueMolecules;
+    vector<vector<int> > moleculeInstances;
+    vector<vector<int> > moleculeOffsets;
+    for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
+        Molecule& mol = molecules[molIndex];
+
+        // See if it is identical to another molecule.
+
+        bool isNew = true;
+        for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
+            Molecule& mol2 = uniqueMolecules[j];
+            bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
+
+            // See if the atoms are identical.
+
+            int atomOffset = mol2.atoms[0]-mol.atoms[0];
+            for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
+                if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
+                    identical = false;
+                for (int k = 0; k < (int) forces.size(); k++)
+                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
+                        identical = false;
+            }
+            
+            // See if the constraints are identical.
+
+            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
+                int c1particle1, c1particle2, c2particle1, c2particle2;
+                double distance1, distance2;
+                system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
+                system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
+                if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
+                    identical = false;
+            }
+
+            // See if the force groups are identical.
+
+            for (int i = 0; i < (int) forces.size() && identical; i++) {
+                if (mol.groups[i].size() != mol2.groups[i].size())
+                    identical = false;
+                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
+                    if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
+                        identical = false;
+            }
+            if (identical) {
+                moleculeInstances[j].push_back(molIndex);
+                moleculeOffsets[j].push_back(mol.atoms[0]);
+                isNew = false;
+            }
+        }
+        if (isNew) {
+            uniqueMolecules.push_back(mol);
+            moleculeInstances.push_back(vector<int>());
+            moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
+            moleculeOffsets.push_back(vector<int>());
+            moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
+        }
+    }
+    moleculeGroups.resize(moleculeInstances.size());
+    for (int i = 0; i < (int) moleculeInstances.size(); i++)
+    {
+        moleculeGroups[i].instances = moleculeInstances[i];
+        moleculeGroups[i].offsets = moleculeOffsets[i];
+        vector<int>& atoms = uniqueMolecules[i].atoms;
+        moleculeGroups[i].atoms.resize(atoms.size());
+        for (int j = 0; j < (int) atoms.size(); j++)
+            moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
+    }
+}
+
+// Flag the current molecule definitions as possibly stale by setting moleculesInvalid.
+void CudaContext::invalidateMolecules() {
+    moleculesInvalid = true;
+}
+
 //void OpenCLContext::validateMolecules() {
 //    moleculesInvalid = false;
 //    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())

--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -72,11 +72,11 @@ public:
    CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
            const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData);
    ~CudaContext();
-//    /**
-//     * This is called to initialize internal data structures after all Forces in the system
-//     * have been initialized.
-//     */
-//    void initialize();
+    /**
+     * This is called to initialize internal data structures after all Forces in the system
+     * have been initialized.
+     */
+    void initialize();
    /**
     * Add a CudaForce to this context.
     */
@@ -123,12 +123,12 @@ public:
    CudaArray& getVelm() {
        return *velm;
    }
-//    /**
-//     * Get the array which contains the force on each atom.
-//     */
-//    CudaArray<mm_float4>& getForce() {
-//        return *force;
-//    }
+    /**
+     * Get the array which contains the force on each atom (represented as a long3 in 64 bit fixed point).
+     */
+    CudaArray& getForce() {
+        return *force;
+    }
 //    /**
 //     * Get the array which contains the buffers in which forces are computed.
 //     */
@@ -184,36 +184,41 @@ public:
     *                           omitted, a default set of options will be used
     */
    CUmodule createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL);
-//    /**
-//     * Execute a kernel.
-//     *
-//     * @param kernel       the kernel to execute
-//     * @param workUnits    the maximum number of work units that should be used
-//     * @param blockSize    the size of each thread block to use
-//     */
-//    void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
-//    /**
-//     * Set all elements of an array to 0.
-//     */
-//    void clearBuffer(CudaArray<float>& array);
-//    /**
-//     * Set all elements of an array to 0.
-//     */
-//    void clearBuffer(CudaArray<mm_float4>& array);
-//    /**
-//     * Set all elements of an array to 0.
-//     *
-//     * @param memory     the Memory to clear
-//     * @param size       the number of float elements in the buffer
-//     */
-//    void clearBuffer(cl::Memory& memory, int size);
-//    /**
-//     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
-//     *
-//     * @param memory     the Memory to clear
-//     * @param size       the number of float elements in the buffer
-//     */
-//    void addAutoclearBuffer(cl::Memory& memory, int size);
+    /**
+     * Get a kernel from a CUDA module.
+     *
+     * @param module    the module to get the kernel from
+     * @param name      the name of the kernel to get
+     */
+    CUfunction getKernel(CUmodule& module, const std::string& name);
+    /**
+     * Execute a kernel.
+     *
+     * @param kernel       the kernel to execute
+     * @param arguments    an array of pointers to the kernel arguments
+     * @param workUnits    the maximum number of threads that should be used
+     * @param blockSize    the size of each thread block to use
+     * @param sharedSize   the amount of dynamic shared memory to allocate for the kernel, in bytes
+     */
+    void executeKernel(CUfunction kernel, void** arguments, int workUnits, int blockSize = -1, unsigned int sharedSize = 0);
+    /**
+     * Set all elements of an array to 0.
+     */
+    void clearBuffer(CudaArray& array);
+    /**
+     * Set all elements of an array to 0.
+     *
+     * @param memory     the memory to clear
+     * @param size       the number of 4-byte elements in the buffer
+     */
+    void clearBuffer(CUdeviceptr memory, int size);
+    /**
+     * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
+     *
+     * @param memory     the memory to clear
+     * @param size       the number of float/double elements in the buffer
+     */
+    void addAutoclearBuffer(CUdeviceptr memory, int size);
 //    /**
 //     * Clear all buffers that have been registered with addAutoclearBuffer().
 //     */
@@ -230,108 +235,110 @@ public:
 //     * Sum the buffers containing forces.
 //     */
 //    void reduceForces();
-//    /**
-//     * Get the current simulation time.
-//     */
-//    double getTime() {
-//        return time;
-//    }
-//    /**
-//     * Set the current simulation time.
-//     */
-//    void setTime(double t) {
-//        time = t;
-//    }
-//    /**
-//     * Get the number of integration steps that have been taken.
-//     */
-//    int getStepCount() {
-//        return stepCount;
-//    }
-//    /**
-//     * Set the number of integration steps that have been taken.
-//     */
-//    void setStepCount(int steps) {
-//        stepCount = steps;
-//    }
-//    /**
-//     * Get the number of times forces or energy has been computed.
-//     */
-//    int getComputeForceCount() {
-//        return computeForceCount;
-//    }
-//    /**
-//     * Set the number of times forces or energy has been computed.
-//     */
-//    void setComputeForceCount(int count) {
-//        computeForceCount = count;
-//    }
-//    /**
-//     * Get the number of atoms.
-//     */
-//    int getNumAtoms() const {
-//        return numAtoms;
-//    }
-//    /**
-//     * Get the number of atoms, rounded up to a multiple of TileSize.  This is the actual size of
-//     * most arrays with one element per atom.
-//     */
-//    int getPaddedNumAtoms() const {
-//        return paddedNumAtoms;
-//    }
-//    /**
-//     * Get the number of blocks of TileSize atoms.
-//     */
-//    int getNumAtomBlocks() const {
-//        return numAtomBlocks;
-//    }
-//    /**
-//     * Get the standard number of thread blocks to use when executing kernels.
-//     */
-//    int getNumThreadBlocks() const {
-//        return numThreadBlocks;
-//    }
-//    /**
-//     * Get the number of force buffers.
-//     */
-//    int getNumForceBuffers() const {
-//        return numForceBuffers;
-//    }
-//    /**
-//     * Get the SIMD width of the device being used.
-//     */
-//    int getSIMDWidth() const {
-//        return simdWidth;
-//    }
-//    /**
-//     * Get whether the device being used supports 64 bit atomic operations on global memory.
-//     */
-//    bool getSupports64BitGlobalAtomics() {
-//        return supports64BitGlobalAtomics;
-//    }
-//    /**
-//     * Get whether the device being used supports double precision math.
-//     */
-//    bool getSupportsDoublePrecision() {
-//        return supportsDoublePrecision;
-//    }
+    /**
+     * Get the current simulation time.
+     */
+    double getTime() const {
+        return time;
+    }
+    /**
+     * Set the current simulation time.
+     */
+    void setTime(double t) {
+        time = t;
+    }
+    /**
+     * Get the number of integration steps that have been taken.
+     */
+    int getStepCount() const {
+        return stepCount;
+    }
+    /**
+     * Set the number of integration steps that have been taken.
+     */
+    void setStepCount(int steps) {
+        stepCount = steps;
+    }
+    /**
+     * Get the number of times forces or energy has been computed.
+     */
+    int getComputeForceCount() const {
+        return computeForceCount;
+    }
+    /**
+     * Set the number of times forces or energy has been computed.
+     */
+    void setComputeForceCount(int count) {
+        computeForceCount = count;
+    }
+    /**
+     * Get the number of atoms.
+     */
+    int getNumAtoms() const {
+        return numAtoms;
+    }
+    /**
+     * Get the number of atoms, rounded up to a multiple of TileSize.  This is the actual size of
+     * most arrays with one element per atom.
+     */
+    int getPaddedNumAtoms() const {
+        return paddedNumAtoms;
+    }
+    /**
+     * Get the number of blocks of TileSize atoms.
+     */
+    int getNumAtomBlocks() const {
+        return numAtomBlocks;
+    }
+    /**
+     * Get the standard number of thread blocks to use when executing kernels.
+     */
+    int getNumThreadBlocks() const {
+        return numThreadBlocks;
+    }
+    /**
+     * Get whether double precision is being used.
+     */
+    bool getUseDoublePrecision() const {
+        return useDoublePrecision;
+    }
+    /**
+     * Get whether accumulation is being done in double precision.
+     */
+    bool getAccumulateInDouble() const {
+        return accumulateInDouble;
+    }
+    /**
+     * Convert a number to a string in a format suitable for including in a kernel.
+     * This takes into account whether the context uses single or double precision.
+     */
+    std::string doubleToString(double value);
+    /**
+     * Convert a number to a string in a format suitable for including in a kernel.
+     */
+    std::string intToString(int value);
+    /**
+     * Convert a CUDA result code to the corresponding string description.
+     */
+    std::string getErrorString(CUresult result);
+    
 //    /**
 //     * Get the size of the periodic box.
 //     */
-//    mm_float4 getPeriodicBoxSize() const {
+//    float4 getPeriodicBoxSize() const {
 //        return periodicBoxSize;
 //    }
 //    /**
 //     * Set the size of the periodic box.
 //     */
 //    void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
-//        periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
-//        invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
+//        periodicBoxSize = make_float4((float) xsize, (float) ysize, (float) zsize, 0);
+//        invPeriodicBoxSize = make_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
 //    }
 //    /**
 //     * Get the inverse of the size of the periodic box.
 //     */
-//    mm_float4 getInvPeriodicBoxSize() const {
+//    float4 getInvPeriodicBoxSize() const {
 //        return invPeriodicBoxSize;
 //    }
 //    /**
@@ -352,66 +359,66 @@ public:
 //    CudaNonbondedUtilities& getNonbondedUtilities() {
 //        return *nonbonded;
 //    }
-//    /**
-//     * Get the thread used by this context for executing parallel computations.
-//     */
-//    WorkThread& getWorkThread() {
-//        return *thread;
-//    }
-//    /**
-//     * Get whether atoms were reordered during the most recent force/energy computation.
-//     */
-//    bool getAtomsWereReordered() const {
-//        return atomsWereReordered;
-//    }
-//    /**
-//     * Set whether atoms were reordered during the most recent force/energy computation.
-//     */
-//    void setAtomsWereReordered(bool wereReordered) {
-//        atomsWereReordered = wereReordered;
-//    }
-//    /**
-//     * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
-//     * together in the arrays.
-//     * 
-//     * @param enforcePeriodic    if true, the atom positions may be altered to enforce periodic boundary conditions
-//     */
-//    void reorderAtoms(bool enforcePeriodic);
-//    /**
-//     * Add a listener that should be called whenever atoms get reordered.  The CudaContext
-//     * assumes ownership of the object, and deletes it when the context itself is deleted.
-//     */
-//    void addReorderListener(ReorderListener* listener);
-//    /**
-//     * Get the list of ReorderListeners.
-//     */
-//    std::vector<ReorderListener*>& getReorderListeners() {
-//        return reorderListeners;
-//    }
-//    /**
-//     * Mark that the current molecule definitions (and hence the atom order) may be invalid.
-//     * This should be called whenever force field parameters change.  It will cause the definitions
-//     * and order to be revalidated the next to reorderAtoms() is called.
-//     */
-//    void invalidateMolecules();
-//    /**
-//     * Get whether the current molecule definitions are valid.
-//     */
-//    bool getMoleculesAreInvalid() {
-//        return moleculesInvalid;
-//    }
+    /**
+     * Get the thread used by this context for executing parallel computations.
+     */
+    WorkThread& getWorkThread() {
+        return *thread;
+    }
+    /**
+     * Get whether atoms were reordered during the most recent force/energy computation.
+     */
+    bool getAtomsWereReordered() const {
+        return atomsWereReordered;
+    }
+    /**
+     * Set whether atoms were reordered during the most recent force/energy computation.
+     */
+    void setAtomsWereReordered(bool wereReordered) {
+        atomsWereReordered = wereReordered;
+    }
+    /**
+     * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
+     * together in the arrays.
+     * 
+     * @param enforcePeriodic    if true, the atom positions may be altered to enforce periodic boundary conditions
+     */
+    void reorderAtoms(bool enforcePeriodic);
+    /**
+     * Add a listener that should be called whenever atoms get reordered.  The CudaContext
+     * assumes ownership of the object, and deletes it when the context itself is deleted.
+     */
+    void addReorderListener(ReorderListener* listener);
+    /**
+     * Get the list of ReorderListeners.
+     */
+    std::vector<ReorderListener*>& getReorderListeners() {
+        return reorderListeners;
+    }
+    /**
+     * Mark that the current molecule definitions (and hence the atom order) may be invalid.
+     * This should be called whenever force field parameters change.  It will cause the definitions
+     * and order to be revalidated the next time reorderAtoms() is called.
+     */
+    void invalidateMolecules();
+    /**
+     * Get whether the current molecule definitions have been marked as possibly invalid
+     * (true means they are flagged invalid, as set by invalidateMolecules()).
+     */
+    bool getMoleculesAreInvalid() const {
+        return moleculesInvalid;
+    }
 private:
    struct Molecule;
    struct MoleculeGroup;
    class VirtualSiteInfo;
-//    void findMoleculeGroups();
-//    static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
-//    /**
-//     * Ensure that all molecules marked as "identical" really are identical.  This should be
-//     * called whenever force field parameters change.  If necessary, it will rebuild the list
-//     * of molecules and resort the atoms.
-//     */
-//    void validateMolecules();
+    void findMoleculeGroups();
+    static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
+    /**
+     * Ensure that all molecules marked as "identical" really are identical.  This should be
+     * called whenever force field parameters change.  If necessary, it will rebuild the list
+     * of molecules and resort the atoms.
+     */
+    void validateMolecules();
    static bool hasInitializedCuda;
    const System& system;
    double time;
@@ -424,8 +431,6 @@ private:
    int paddedNumAtoms;
    int numAtomBlocks;
    int numThreadBlocks;
-//    int numForceBuffers;
-//    int simdWidth;
    bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
    std::string compiler, tempDir, gpuArchitecture;
    float4 periodicBoxSize;
@@ -446,15 +451,15 @@ private:
    std::vector<Molecule> molecules;
    std::vector<MoleculeGroup> moleculeGroups;
    std::vector<int4> posCellOffsets;
+    void* pinnedBuffer;
    CudaArray* posq;
    CudaArray* velm;
-//    CudaArray<mm_float4>* force;
-//    CudaArray<mm_float4>* forceBuffers;
-//    CudaArray<cl_long>* longForceBuffer;
-//    CudaArray<cl_float>* energyBuffer;
-//    CudaArray<cl_int>* atomIndex;
-//    std::vector<cl::Memory*> autoclearBuffers;
-//    std::vector<int> autoclearBufferSizes;
+    CudaArray* force;
+    CudaArray* energyBuffer;
+    CudaArray* atomIndexDevice;
+    std::vector<int> atomIndex;
+    std::vector<CUdeviceptr> autoclearBuffers;
+    std::vector<int> autoclearBufferSizes;
    std::vector<ReorderListener*> reorderListeners;
 //    CudaIntegrationUtilities* integration;
 //    CudaBondedUtilities* bonded;

--- a/platforms/cuda2/src/CudaPlatform.cpp
+++ b/platforms/cuda2/src/CudaPlatform.cpp
@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
        device << contexts[i]->getDeviceIndex();
    }
    propertyValues[CudaPlatform::CudaDeviceIndex()] = device.str();
+    propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
    propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
    propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
    propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() {
 }

 void CudaPlatform::PlatformData::initializeContexts(const System& system) {
-//    for (int i = 0; i < (int) contexts.size(); i++)
-//        contexts[i]->initialize();
+    // Call initialize() on every context held by this PlatformData.  The system argument is not used here.
+    for (int i = 0; i < (int) contexts.size(); i++)
+        contexts[i]->initialize();
 }

 void CudaPlatform::PlatformData::syncContexts() {
-//    for (int i = 0; i < (int) contexts.size(); i++)
-//        contexts[i]->getWorkThread().flush();
+    // Flush the work thread of every context held by this PlatformData.
+    for (int i = 0; i < (int) contexts.size(); i++)
+        contexts[i]->getWorkThread().flush();
 }
--- a/platforms/cuda2/src/CudaSort.cpp
+++ b/platforms/cuda2/src/CudaSort.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaSort.h"
+#include "CudaKernelSources.h"
+#include <map>
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * Create a CudaSort for sorting arrays of a fixed length.
+ *
+ * @param context    the context in which the sort kernels are compiled and executed
+ * @param trait      describes the data type, key type and sort key.  The destructor deletes
+ *                   this object, so it must have been allocated with new.
+ * @param length     the number of elements in the arrays that will be sorted
+ * @throws OpenMMException if the trait is unusable, a device attribute cannot be queried,
+ *                   or the data type is too large for the available shared memory
+ */
+CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
+        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
+    if (trait == NULL)
+        throw OpenMMException("CudaSort: the SortTrait must not be NULL");
+    if (trait->getDataSize() < 1)
+        throw OpenMMException("CudaSort: the SortTrait reports a non-positive data size");
+
+    // Create kernels.
+
+    map<string, string> replacements;
+    replacements["DATA_TYPE"] = trait->getDataType();
+    replacements["KEY_TYPE"] =  trait->getKeyType();
+    replacements["SORT_KEY"] = trait->getSortKey();
+    replacements["MIN_KEY"] = trait->getMinKey();
+    replacements["MAX_KEY"] = trait->getMaxKey();
+    replacements["MAX_VALUE"] = trait->getMaxValue();
+    CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
+    computeRangeKernel = context.getKernel(module, "computeRange");
+    assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
+    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
+    copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
+    sortBucketsKernel = context.getKernel(module, "sortBuckets");
+
+    // Work out the work group sizes for various kernels.  The attribute queries are checked
+    // so that a failure cannot leave the sizes computed from uninitialized values.
+
+    int maxBlockSize = 0;
+    CUresult result = cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
+    if (result != CUDA_SUCCESS)
+        throw OpenMMException("Error querying the maximum block size for CudaSort: "+context.getErrorString(result));
+
+    // Use the largest power of two that does not exceed the maximum block size.
+
+    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
+        ;
+    positionsKernelSize = rangeKernelSize;
+    sortKernelSize = rangeKernelSize/2;
+    if (rangeKernelSize > length)
+        rangeKernelSize = length;
+    int maxSharedMem = 0;
+    result = cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
+    if (result != CUDA_SUCCESS)
+        throw OpenMMException("Error querying the shared memory size for CudaSort: "+context.getErrorString(result));
+
+    // Limit the sort block to half the number of elements that fit in shared memory.
+
+    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
+    if (sortKernelSize > maxLocalBuffer)
+        sortKernelSize = maxLocalBuffer;
+    if (sortKernelSize < 1)
+        throw OpenMMException("CudaSort: the data type is too large to sort in shared memory");
+
+    // Keep the target bucket size at least 1 so it can safely be used as a divisor.
+
+    unsigned int targetBucketSize = sortKernelSize/2;
+    if (targetBucketSize < 1)
+        targetBucketSize = 1;
+    unsigned int numBuckets = length/targetBucketSize;
+    if (numBuckets < 1)
+        numBuckets = 1;
+    if (positionsKernelSize > numBuckets)
+        positionsKernelSize = numBuckets;
+
+    // Create workspace arrays.  If one of the allocations throws, the destructor will not
+    // run, so release whatever was already allocated before propagating the exception.
+    // NOTE(review): trait is deliberately left untouched on this path; confirm with callers
+    // whether ownership should also transfer when construction fails.
+
+    try {
+        dataRange = new CudaArray(2, trait->getKeySize(), "sortDataRange");
+        bucketOffset = CudaArray::create<uint1>(numBuckets, "bucketOffset");
+        bucketOfElement = CudaArray::create<uint1>(length, "bucketOfElement");
+        offsetInBucket = CudaArray::create<uint1>(length, "offsetInBucket");
+        buckets = new CudaArray(length, trait->getDataSize(), "buckets");
+    }
+    catch (...) {
+        delete dataRange;
+        delete bucketOffset;
+        delete bucketOfElement;
+        delete offsetInBucket;
+        delete buckets;
+        throw;
+    }
+}
+
+// Release the trait object and every workspace array owned by this sorter.
+CudaSort::~CudaSort() {
+    // delete does nothing for a NULL pointer, so arrays that were never allocated need no special case.
+
+    delete trait;
+    delete dataRange;
+    delete bucketOfElement;
+    delete offsetInBucket;
+    delete bucketOffset;
+    delete buckets;
+}
+
+/**
+ * Sort an array by launching the five sort kernels in sequence.
+ *
+ * @param data    the array to sort.  Its length and element size must match the values
+ *                this object was created with; an empty array is left untouched.
+ * @throws OpenMMException if the array's length or element size does not match
+ */
+void CudaSort::sort(CudaArray& data) {
+    if (!(data.getSize() == bucketOfElement->getSize() && data.getElementSize() == trait->getDataSize()))
+        throw OpenMMException("CudaSort called with different data size");
+    if (data.getSize() == 0)
+        return;
+
+    // Kernel arguments are passed as pointers to the values, so each value needs an
+    // addressable home for the duration of the launches.
+
+    unsigned int numElements = data.getSize();
+    unsigned int bucketCount = bucketOffset->getSize();
+    CUdeviceptr& dataPtr = data.getDevicePointer();
+    CUdeviceptr& rangePtr = dataRange->getDevicePointer();
+    CUdeviceptr& bucketOffsetPtr = bucketOffset->getDevicePointer();
+    CUdeviceptr& bucketOfElementPtr = bucketOfElement->getDevicePointer();
+    CUdeviceptr& offsetInBucketPtr = offsetInBucket->getDevicePointer();
+    CUdeviceptr& bucketsPtr = buckets->getDevicePointer();
+
+    // Step 1: find the range of key values present in the data.
+
+    void* rangeArgs[] = {&dataPtr, &numElements, &rangePtr};
+    context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());
+
+    // Step 2: zero the bucket counters, then assign every element to a bucket.
+
+    context.clearBuffer(*bucketOffset);
+    void* elementsArgs[] = {&dataPtr, &numElements, &bucketCount, &rangePtr,
+            &bucketOffsetPtr, &bucketOfElementPtr, &offsetInBucketPtr};
+    context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());
+
+    // Step 3: work out where each bucket starts.
+
+    void* computeArgs[] = {&bucketCount, &bucketOffsetPtr};
+    context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
+
+    // Step 4: scatter the elements into their buckets.
+
+    void* copyArgs[] = {&dataPtr, &bucketsPtr, &numElements, &bucketOffsetPtr,
+            &bucketOfElementPtr, &offsetInBucketPtr};
+    context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
+
+    // Step 5: sort within each bucket.  The thread count is rounded up to a whole number of blocks.
+
+    void* sortArgs[] = {&dataPtr, &bucketsPtr, &bucketCount, &bucketOffsetPtr};
+    context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
+}
--- a/platforms/cuda2/src/CudaSort.h
+++ b/platforms/cuda2/src/CudaSort.h
+#ifndef __OPENMM_CUDASORT_H__
+#define __OPENMM_CUDASORT_H__
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2010-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * This program is free software: you can redistribute it and/or modify       *
+ * it under the terms of the GNU Lesser General Public License as published   *
+ * by the Free Software Foundation, either version 3 of the License, or       *
+ * (at your option) any later version.                                        *
+ *                                                                            *
+ * This program is distributed in the hope that it will be useful,            *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
+ * GNU Lesser General Public License for more details.                        *
+ *                                                                            *
+ * You should have received a copy of the GNU Lesser General Public License   *
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
+ * -------------------------------------------------------------------------- */
+
+#include "CudaArray.h"
+#include "openmm/internal/windowsExport.h"
+#include "CudaContext.h"
+
+namespace OpenMM {
+
+/**
+ * This class sorts arrays of values.  It supports any type of values, not just scalars,
+ * so long as an appropriate sorting key can be defined by which to sort them.
+ * 
+ * The sorting behavior is specified by a "trait" class that defines the type of data to
+ * sort and the key for sorting it.  Here is an example of a trait class for
+ * sorting floats:
+ * 
+ * class SortTrait : public CudaSort::SortTrait {
+ *     int getDataSize() const {return 4;}
+ *     int getKeySize() const {return 4;}
+ *     const char* getDataType() const {return "float";}
+ *     const char* getKeyType() const {return "float";}
+ *     const char* getMinKey() const {return "-MAXFLOAT";}
+ *     const char* getMaxKey() const {return "MAXFLOAT";}
+ *     const char* getMaxValue() const {return "MAXFLOAT";}
+ *     const char* getSortKey() const {return "value";}
+ * };
+ *
+ * The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
+ * (in local memory when possible, in global memory otherwise).  This is similar to
+ * the algorithm described in
+ *
+ * Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng.  "An Efficient
+ * Sorting Algorithm with CUDA"  Journal of the Chinese Institute of Engineers, 32(7),
+ * pp. 915-921 (2009)
+ *
+ * but with many modifications and simplifications.  In particular, this algorithm
+ * involves much less communication between host and device, which is critical to get
+ * good performance with the array sizes we typically work with (10,000 to 100,000
+ * elements).
+ */
+    
+class OPENMM_EXPORT CudaSort {
+public:
+    class SortTrait;
+    /**
+     * Create a CudaSort object for sorting data of a particular type.
+     *
+     * @param context    the context in which to perform calculations
+     * @param trait      a SortTrait defining the type of data to sort.  It should have been allocated
+     *                   on the heap with the "new" operator.  This object takes over ownership of it,
+     *                   and deletes it when the CudaSort is deleted.
+     * @param length     the length of the arrays this object will be used to sort
+     */
+    CudaSort(CudaContext& context, SortTrait* trait, unsigned int length);
+    /**
+     * Destroy this object.  As stated for the constructor, the SortTrait that was
+     * passed in is deleted along with it.
+     */
+    ~CudaSort();
+    /**
+     * Sort an array.  The sorted values are written back into the same array.
+     *
+     * @param data       the array to sort
+     */
+    void sort(CudaArray& data);
+private:
+    // This class owns the objects behind its raw pointers and releases them in its
+    // destructor, so a copy would lead to the same objects being deleted twice.
+    // Copying is therefore disabled: these two members are declared but deliberately
+    // never defined.
+    CudaSort(const CudaSort&);
+    CudaSort& operator=(const CudaSort&);
+    CudaContext& context;
+    SortTrait* trait;
+    // Work arrays used by the sorting kernels.
+    CudaArray* dataRange;
+    CudaArray* bucketOfElement;
+    CudaArray* offsetInBucket;
+    CudaArray* bucketOffset;
+    CudaArray* buckets;
+    // One kernel handle per stage of the sort.
+    CUfunction computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
+    // Thread block sizes used when launching the corresponding kernels.
+    unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
+};
+
+/**
+ * A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
+ * Every method that returns a string returns a fragment of CUDA source code.
+ */
+class CudaSort::SortTrait {
+public:
+    /**
+     * Instances are deleted through a pointer to this base class (CudaSort takes
+     * ownership of the trait it is given), so the destructor must be virtual for
+     * the subclass to be destroyed correctly.
+     */
+    virtual ~SortTrait() {
+    }
+    /**
+     * Get the size of each data value in bytes.
+     */
+    virtual int getDataSize() const = 0;
+    /**
+     * Get the size of each key value in bytes.
+     */
+    virtual int getKeySize() const = 0;
+    /**
+     * Get the data type of the values to sort.
+     */
+    virtual const char* getDataType() const = 0;
+    /**
+     * Get the data type of the sorting key.
+     */
+    virtual const char* getKeyType() const = 0;
+    /**
+     * Get the minimum value a key can take.
+     */
+    virtual const char* getMinKey() const = 0;
+    /**
+     * Get the maximum value a key can take.
+     */
+    virtual const char* getMaxKey() const = 0;
+    /**
+     * Get a value whose key is guaranteed to equal getMaxKey().
+     */
+    virtual const char* getMaxValue() const = 0;
+    /**
+     * Get the CUDA code to select the key from the data value.  The data value is
+     * available to this code under the name "value".
+     */
+    virtual const char* getSortKey() const = 0;
+};
+
+
+} // namespace OpenMM
+
+#endif // __OPENMM_CUDASORT_H__
--- a/platforms/cuda2/src/kernels/sort.cu
+++ b/platforms/cuda2/src/kernels/sort.cu
+/**
+ * Extract the sorting key from a data element.  SORT_KEY is an expression written
+ * in terms of the parameter name "value", so that name must not be changed.
+ * (Presumably KEY_TYPE, DATA_TYPE and SORT_KEY are substituted from the SortTrait
+ * when this source is compiled; that code is not part of this file.)
+ */
+__device__ KEY_TYPE getValue(DATA_TYPE value) {
+    KEY_TYPE key = SORT_KEY;
+    return key;
+}
+
+extern "C" {
+    
+/**
+ * Calculate the minimum and maximum key in the array to be sorted.  This kernel
+ * is executed as a single work group, and needs dynamic shared memory for
+ * blockDim.x values of KEY_TYPE.
+ *
+ * @param data     the elements whose keys are examined
+ * @param length   the number of elements in data
+ * @param range    on exit, range[0] holds the smallest key and range[1] the largest.
+ *                 If length is 0 they are MAX_KEY and MIN_KEY respectively.
+ */
+__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range) {
+    extern __shared__ KEY_TYPE rangeBuffer[];
+    KEY_TYPE minimum = MAX_KEY;
+    KEY_TYPE maximum = MIN_KEY;
+
+    // Each thread calculates the range of a subset of values.
+
+    for (unsigned int index = threadIdx.x; index < length; index += blockDim.x) {
+        KEY_TYPE value = getValue(data[index]);
+        minimum = min(minimum, value);
+        maximum = max(maximum, value);
+    }
+
+    // Reduce the per-thread minimums.  After the loop, element 0 holds the result.
+
+    rangeBuffer[threadIdx.x] = minimum;
+    __syncthreads();
+    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+        if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
+            rangeBuffer[threadIdx.x] = min(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
+        __syncthreads();
+    }
+
+    // Only thread 0 reads the result.  It is also the only thread that overwrites
+    // element 0 below, so the buffer can be reused without an extra barrier and
+    // without any thread reading a slot another thread is about to overwrite.
+
+    if (threadIdx.x == 0)
+        range[0] = rangeBuffer[0];
+
+    // Reduce the per-thread maximums in the same way.
+
+    rangeBuffer[threadIdx.x] = maximum;
+    __syncthreads();
+    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+        if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
+            rangeBuffer[threadIdx.x] = max(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+        range[1] = rangeBuffer[0];
+}
+
+/**
+ * Decide which bucket each element belongs to.  The interval between range[0] and
+ * range[1] is divided into numBuckets equal slices, and an element goes into the
+ * slice containing its key (clamped to the last bucket).
+ *
+ * For every element this records the bucket it was assigned to (bucketOfElement)
+ * and the value returned by atomically incrementing that bucket's counter in
+ * bucketOffset (offsetInBucket).  Presumably the caller clears bucketOffset before
+ * launching this kernel, so that the counters end up holding the bucket sizes;
+ * that is not visible here.
+ */
+__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
+        unsigned int* bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
+    const float lowest = (float) (range[0]);
+    const float highest = (float) (range[1]);
+    const float width = (highest-lowest)/numBuckets;
+    const unsigned int lastBucket = numBuckets-1;
+    const unsigned int stride = blockDim.x*gridDim.x;
+    unsigned int element = blockDim.x*blockIdx.x+threadIdx.x;
+    while (element < length) {
+        const float key = (float) getValue(data[element]);
+        const unsigned int bucket = min((unsigned int) ((key-lowest)/width), lastBucket);
+        bucketOfElement[element] = bucket;
+
+        // Claim the next free slot in the chosen bucket.
+
+        offsetInBucket[element] = atomicAdd(&bucketOffset[bucket], 1);
+        element += stride;
+    }
+}
+
+/**
+ * Replace each entry of bucketOffset with the sum of itself and all entries before
+ * it (an inclusive running total).  When the input holds the size of each bucket,
+ * the output entry for a bucket is therefore the position just past its last
+ * element, which is also the start position of the following bucket.
+ *
+ * This kernel is executed as a single work group, and needs dynamic shared memory
+ * for blockDim.x unsigned ints.
+ *
+ * @param numBuckets     the number of entries in bucketOffset
+ * @param bucketOffset   the values to sum; overwritten with the running totals
+ */
+__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
+    extern __shared__ unsigned int posBuffer[];
+    unsigned int globalOffset = 0;
+    for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
+        // Every thread read posBuffer[blockDim.x-1] at the end of the previous pass.
+        // Wait until all of them have done so before the buffer is overwritten,
+        // otherwise a fast thread could replace that value too early and the slower
+        // threads would accumulate a wrong globalOffset.
+
+        __syncthreads();
+
+        // Load the bucket sizes into local memory.
+
+        unsigned int globalIndex = startBucket+threadIdx.x;
+        posBuffer[threadIdx.x] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
+        __syncthreads();
+
+        // Perform a parallel prefix sum.  The read and the update are separated by a
+        // barrier so that no thread sees a neighbor's value from the current step.
+
+        for (unsigned int step = 1; step < blockDim.x; step *= 2) {
+            unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
+            __syncthreads();
+            posBuffer[threadIdx.x] += add;
+            __syncthreads();
+        }
+
+        // Write the results back to global memory.
+
+        if (globalIndex < numBuckets)
+            bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;
+
+        // The last entry is the total for this pass; carry it over to the next one.
+
+        globalOffset += posBuffer[blockDim.x-1];
+    }
+}
+
+/**
+ * Scatter the input elements into the bucket array.  An element is written at the
+ * start of its bucket plus its offset within that bucket.  The start of bucket 0
+ * is 0, and the start of any other bucket is read from the bucketOffset entry of
+ * the bucket before it.
+ */
+__global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* __restrict__ buckets, unsigned int length, const unsigned int* __restrict__ bucketOffset, const unsigned int* __restrict__ bucketOfElement, const unsigned int* __restrict__ offsetInBucket) {
+    const unsigned int stride = blockDim.x*gridDim.x;
+    for (unsigned int i = blockDim.x*blockIdx.x+threadIdx.x; i < length; i += stride) {
+        const unsigned int bucket = bucketOfElement[i];
+        unsigned int destination = offsetInBucket[i];
+        if (bucket > 0)
+            destination += bucketOffset[bucket-1];
+        buckets[destination] = data[i];
+    }
+}
+
+/**
+ * Sort the data in each bucket, writing the sorted elements to the output array.
+ *
+ * Each thread block handles the buckets blockIdx.x, blockIdx.x+gridDim.x, ...
+ * Bucket i occupies positions bucketOffset[i-1] (or 0 for the first bucket) up to
+ * but not including bucketOffset[i].  A bucket that fits in one thread block is
+ * sorted in shared memory; a larger one is copied to the output array and sorted
+ * there in place.
+ *
+ * This kernel needs dynamic shared memory for blockDim.x values of DATA_TYPE.
+ *
+ * @param data           the output array that receives the sorted elements
+ * @param buckets        the elements grouped by bucket
+ * @param numBuckets     the number of buckets
+ * @param bucketOffset   the end position of each bucket, as described above
+ */
+__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) {
+    extern __shared__ DATA_TYPE dataBuffer[];
+    for (unsigned int index = blockIdx.x; index < numBuckets; index += gridDim.x) {
+        unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
+        unsigned int endIndex = bucketOffset[index];
+        unsigned int length = endIndex-startIndex;
+        // The branch taken depends only on the bucket, so all threads of the block
+        // take the same one and reach the same barriers.
+        if (length <= blockDim.x) {
+            // Load the data into local memory.  Unused slots are filled with
+            // MAX_VALUE so that they end up behind the real elements.
+
+            if (threadIdx.x < length)
+                dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
+            else
+                dataBuffer[threadIdx.x] = MAX_VALUE;
+            __syncthreads();
+
+            // Perform a bitonic sort in local memory.
+            // NOTE(review): the network covers all blockDim.x slots, which presumably
+            // relies on the block size being a power of two -- confirm against the host code.
+
+            for (unsigned int k = 2; k <= blockDim.x; k *= 2) {
+                for (unsigned int j = k/2; j > 0; j /= 2) {
+                    // Each pair (threadIdx.x, ixj) is handled by its lower-indexed thread.
+                    int ixj = threadIdx.x^j;
+                    if (ixj > threadIdx.x) {
+                        DATA_TYPE value1 = dataBuffer[threadIdx.x];
+                        DATA_TYPE value2 = dataBuffer[ixj];
+                        bool ascending = (threadIdx.x&k) == 0;
+                        KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
+                        KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                        // Swap the two elements if they are in the wrong order for this direction.
+                        if (lowKey > highKey) {
+                            dataBuffer[threadIdx.x] = value2;
+                            dataBuffer[ixj] = value1;
+                        }
+                    }
+                    __syncthreads();
+                }
+            }
+
+            // Write the data to the sorted array.  The padding slots are not written.
+
+            if (threadIdx.x < length)
+                data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
+        }
+        else {
+            // Copy the bucket data over to the output array.
+
+            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
+                data[startIndex+i] = buckets[startIndex+i];
+            __threadfence_block();
+            __syncthreads();
+
+            // Perform a bitonic sort in global memory.  Each thread handles several
+            // positions, and pairs whose partner lies beyond the end of the bucket
+            // are skipped.
+
+            for (unsigned int k = 2; k < 2*length; k *= 2) {
+                for (unsigned int j = k/2; j > 0; j /= 2) {
+                    for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
+                        int ixj = i^j;
+                        if (ixj > i && ixj < length) {
+                            DATA_TYPE value1 = data[startIndex+i];
+                            DATA_TYPE value2 = data[startIndex+ixj];
+                            bool ascending = ((i&k) == 0);
+                            // NOTE(review): this flips the direction once for every higher
+                            // bit of i that is clear; it looks like the adjustment that lets
+                            // the network handle lengths that are not a power of two -- confirm.
+                            for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
+                                ascending = ((i&mask) == 0 ? !ascending : ascending);
+                            KEY_TYPE lowKey  = (ascending ? getValue(value1) : getValue(value2));
+                            KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
+                            if (lowKey > highKey) {
+                                data[startIndex+i] = value2;
+                                data[startIndex+ixj] = value1;
+                            }
+                        }
+                    }
+                    // Make this step's writes visible to the whole block before the next step reads them.
+                    __threadfence_block();
+                    __syncthreads();
+                }
+            }
+        }
+    }
+}
+
+}
\ No newline at end of file
--- a/platforms/cuda2/src/kernels/utilities.cu
+++ b/platforms/cuda2/src/kernels/utilities.cu
+extern "C" {
+
 /**
 * This is called by the various functions below to clear a buffer.
 */
@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest
        buffer[index] = sum;
    }
 }
+
+}
\ No newline at end of file
--- a/platforms/cuda2/tests/TestCudaSort.cpp
+++ b/platforms/cuda2/tests/TestCudaSort.cpp
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+/**
+ * This tests the CUDA implementation of sorting.
+ */
+
+#include "openmm/internal/AssertionUtilities.h"
+#include "../src/CudaArray.h"
+#include "../src/CudaContext.h"
+#include "../src/CudaSort.h"
+#include "sfmt/SFMT.h"
+#include "openmm/System.h"
+#include <iostream>
+#include <cmath>
+#include <set>
+
+using namespace OpenMM;
+using namespace std;
+
+/**
+ * Trait used by this test: each element is a 4 byte float that is its own key.
+ */
+class SortTrait : public CudaSort::SortTrait {
+    // Sizes in bytes.
+    int getDataSize() const {
+        return 4;
+    }
+    int getKeySize() const {
+        return 4;
+    }
+    // Type names, as they appear in the generated CUDA code.
+    const char* getDataType() const {
+        return "float";
+    }
+    const char* getKeyType() const {
+        return "float";
+    }
+    // Limits of the key, and a value whose key is the upper limit.
+    const char* getMinKey() const {
+        return "-MAXFLOAT";
+    }
+    const char* getMaxKey() const {
+        return "MAXFLOAT";
+    }
+    const char* getMaxValue() const {
+        return "MAXFLOAT";
+    }
+    // The element itself is the key.
+    const char* getSortKey() const {
+        return "value";
+    }
+};
+
+/**
+ * Sort the given values with CudaSort and check the result: it must be in
+ * non-decreasing order and must contain exactly the same values as the input.
+ */
+void verifySorting(vector<float> array) {
+    // Set up a context.  A System with one particle is enough to create it.
+
+    System system;
+    system.addParticle(0.0);
+    CudaPlatform platform;
+    CudaPlatform::PlatformData pdata(system, "", "true", "single",
+            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
+    CudaContext& cu = *pdata.contexts[0];
+    cu.initialize();
+
+    // Upload the values, sort them, and fetch the result.
+
+    CudaArray deviceData(array.size(), 4, "sortData");
+    deviceData.upload(array);
+    CudaSort sorter(cu, new SortTrait(), array.size());
+    sorter.sort(deviceData);
+    vector<float> result;
+    deviceData.download(result);
+
+    // Every value must be at least as large as the one before it.
+
+    for (int i = 0; i+1 < (int) result.size(); i++) {
+        ASSERT(result[i] <= result[i+1]);
+    }
+
+    // Nothing may have been lost, added or changed.
+
+    multiset<float> expected(array.begin(), array.end());
+    multiset<float> actual(result.begin(), result.end());
+    ASSERT(expected == actual);
+}
+
+/**
+ * Sort 10000 values produced by genrand_real2(), using a generator seeded with 0.
+ */
+void testUniformValues() {
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    const int numValues = 10000;
+    vector<float> values;
+    values.reserve(numValues);
+    for (int n = 0; n < numValues; n++)
+        values.push_back((float) genrand_real2(sfmt));
+    verifySorting(values);
+}
+
+/**
+ * Sort 10000 values, each the logarithm of a number produced by genrand_real2(),
+ * using a generator seeded with 0.
+ */
+void testLogValues() {
+    OpenMM_SFMT::SFMT sfmt;
+    init_gen_rand(0, sfmt);
+
+    const int numValues = 10000;
+    vector<float> values;
+    values.reserve(numValues);
+    for (int n = 0; n < numValues; n++)
+        values.push_back((float) log(genrand_real2(sfmt)));
+    verifySorting(values);
+}
+
+/**
+ * Run all tests.  Prints "Done" and returns 0 if they pass; prints the message of
+ * the exception and returns 1 if one of them throws.
+ */
+int main() {
+    bool passed = true;
+    try {
+        testUniformValues();
+        testLogValues();
+    }
+    catch(const exception& e) {
+        cout << "exception: " << e.what() << endl;
+        passed = false;
+    }
+    if (passed)
+        cout << "Done" << endl;
+    return (passed ? 0 : 1);
+}