Continuing to implement new CUDA platform: CustomGBForce

f6346776 · Peter Eastman · 5feaa943 · f6346776 · f6346776 · f6346776
Commit f6346776 authored Jun 27, 2012 by Peter Eastman
13 changed files
--- a/platforms/cuda2/src/CudaContext.cpp
+++ b/platforms/cuda2/src/CudaContext.cpp
@@ -123,6 +123,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
    int major, minor;
    CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
    gpuArchitecture = intToString(major)+intToString(minor);
+    computeCapability = major+0.1*minor;
    defaultOptimizationOptions = "--use_fast_math";
    unsigned int flags = CU_CTX_MAP_HOST;
    if (useBlockingSync)

--- a/platforms/cuda2/src/CudaContext.h
+++ b/platforms/cuda2/src/CudaContext.h
@@ -105,10 +105,16 @@ public:
    CUdevice getDevice() {
        return device;
    }
+    /**
+     * Get the compute capability of the device associated with this object, encoded as the single number major+0.1*minor (for example 2.1 for compute capability 2.1).
+     */
+    double getComputeCapability() const {
+        return computeCapability; // Cached by the constructor from the result of cuDeviceComputeCapability().
+    }
    /**
     * Get the index of the CUdevice associated with this object.
     */
-    int getDeviceIndex() {
+    int getDeviceIndex() const {
        return deviceIndex;
    }
    /**
@@ -444,7 +450,7 @@ private:
    void validateMolecules();
    static bool hasInitializedCuda;
    const System& system;
-    double time;
+    double time, computeCapability;
    CudaPlatform::PlatformData& platformData;
    int deviceIndex;
    int contextIndex;

--- a/platforms/cuda2/src/CudaKernelFactory.cpp
+++ b/platforms/cuda2/src/CudaKernelFactory.cpp
@@ -98,8 +98,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
        return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcGBSAOBCForceKernel::Name())
        return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
-//    if (name == CalcCustomGBForceKernel::Name())
-//        return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
+    if (name == CalcCustomGBForceKernel::Name())
+        return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomExternalForceKernel::Name())
        return new CudaCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
    if (name == CalcCustomHbondForceKernel::Name())

--- a/platforms/cuda2/src/CudaKernels.cpp
+++ b/platforms/cuda2/src/CudaKernels.cpp
--- a/platforms/cuda2/src/CudaKernels.h
+++ b/platforms/cuda2/src/CudaKernels.h
@@ -715,58 +715,58 @@ private:
    std::vector<void*> computeSumArgs, force1Args;
 };

-///**
-// * This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
-// */
-//class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
-//public:
-//    CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomGBForceKernel(name, platform),
-//            hasInitializedKernels(false), cu(cu), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
-//            valueBuffers(NULL), longValueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
-//    }
-//    ~CudaCalcCustomGBForceKernel();
-//    /**
-//     * Initialize the kernel.
-//     *
-//     * @param system     the System this kernel will be applied to
-//     * @param force      the CustomGBForce this kernel will be used for
-//     */
-//    void initialize(const System& system, const CustomGBForce& force);
-//    /**
-//     * Execute the kernel to calculate the forces and/or energy.
-//     *
-//     * @param context        the context in which to execute this kernel
-//     * @param includeForces  true if forces should be calculated
-//     * @param includeEnergy  true if the energy should be calculated
-//     * @return the potential energy due to the force
-//     */
-//    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
-//    /**
-//     * Copy changed parameters over to a context.
-//     *
-//     * @param context    the context to copy parameters to
-//     * @param force      the CustomGBForce to copy the parameters from
-//     */
-//    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
-//private:
-//    bool hasInitializedKernels, needParameterGradient;
-//    int maxTiles, numComputedValues;
-//    CudaContext& cu;
-//    CudaParameterSet* params;
-//    CudaParameterSet* computedValues;
-//    CudaParameterSet* energyDerivs;
-//    CudaArray<cl_long>* longEnergyDerivs;
-//    CudaArray<cl_float>* globals;
-//    CudaArray<cl_float>* valueBuffers;
-//    CudaArray<cl_long>* longValueBuffers;
-//    CudaArray<mm_float4>* tabulatedFunctionParams;
-//    std::vector<std::string> globalParamNames;
-//    std::vector<cl_float> globalParamValues;
-//    std::vector<CudaArray<mm_float4>*> tabulatedFunctions;
-//    std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
-//    System& system;
-//    CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
-//};
+/**
+ * This kernel is invoked by CustomGBForce to calculate the forces acting on the system. The constructor only records its arguments, sets the resource pointers to NULL and sets the scalar state to false/0.
+ */
+class CudaCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
+public:
+    CudaCalcCustomGBForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcCustomGBForceKernel(name, platform),
+            hasInitializedKernels(false), needParameterGradient(false), maxTiles(0), numComputedValues(0), cu(cu), params(NULL), computedValues(NULL),
+            energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL), valueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
+    }
+    ~CudaCalcCustomGBForceKernel();
+    /**
+     * Initialize the kernel.
+     *
+     * @param system     the System this kernel will be applied to
+     * @param force      the CustomGBForce this kernel will be used for
+     */
+    void initialize(const System& system, const CustomGBForce& force);
+    /**
+     * Execute the kernel to calculate the forces and/or energy.
+     *
+     * @param context        the context in which to execute this kernel
+     * @param includeForces  true if forces should be calculated
+     * @param includeEnergy  true if the energy should be calculated
+     * @return the potential energy due to the force
+     */
+    double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
+    /**
+     * Copy changed parameters over to a context.
+     *
+     * @param context    the context to copy parameters to
+     * @param force      the CustomGBForce to copy the parameters from
+     */
+    void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
+private:
+    bool hasInitializedKernels, needParameterGradient; // Both start out false.
+    int maxTiles, numComputedValues; // Both start out 0 so they never hold indeterminate values.
+    CudaContext& cu; // The context passed to the constructor.
+    CudaParameterSet* params; // This and the pointers below are NULL until allocated; allocation is not shown in this header.
+    CudaParameterSet* computedValues;
+    CudaParameterSet* energyDerivs;
+    CudaArray* longEnergyDerivs;
+    CudaArray* globals;
+    CudaArray* valueBuffers;
+    CudaArray* tabulatedFunctionParams;
+    std::vector<std::string> globalParamNames;
+    std::vector<float> globalParamValues;
+    std::vector<CudaArray*> tabulatedFunctions;
+    std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
+    System& system; // The System passed to the constructor.
+    CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel; // Not set by the constructor.
+    std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs; // Presumably the launch arguments for the kernels above - confirm in the .cpp file.
+};

 /**
 * This kernel is invoked by CustomExternalForce to calculate the forces acting on the system and the energy of the system.

--- a/platforms/cuda2/src/CudaNonbondedUtilities.cpp
+++ b/platforms/cuda2/src/CudaNonbondedUtilities.cpp
@@ -52,7 +52,7 @@ CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(c
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
    numForceThreadBlocks = 2*multiprocessors;
-    forceThreadBlockSize = 256;
+    forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
 }

 CudaNonbondedUtilities::~CudaNonbondedUtilities() {
@@ -441,8 +441,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
    if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
-    string file;
-    CUmodule program = context.createModule(context.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::nonbonded, replacements), defines);
+    CUmodule program = context.createModule(CudaKernelSources::vectorOps+context.replaceStrings(CudaKernelSources::nonbonded, replacements), defines);
    CUfunction kernel = context.getKernel(program, "computeNonbonded");

    // Set arguments to the Kernel.

--- a/platforms/cuda2/src/kernels/customGBChainRule.cu
+++ b/platforms/cuda2/src/kernels/customGBChainRule.cu
+#ifdef USE_CUTOFF
+if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED) { // Skip out-of-range atom indices, self pairs, and pairs beyond the cutoff.
+#else
+if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { // Skip out-of-range atom indices and self pairs.
+#endif
+#ifdef USE_SYMMETRIC
+    real tempForce = 0; // Scalar accumulator; presumably filled in by the substituted code below.
+#else
+    real3 tempForce1 = make_real3(0); // Vector accumulators; presumably filled in by the substituted code below.
+    real3 tempForce2 = make_real3(0);
+#endif
+    COMPUTE_FORCE
+#ifdef USE_SYMMETRIC
+    dEdR += tempForce*invR; // Scaled by invR before being added; dEdR and invR are declared outside this fragment.
+#else
+    dEdR1 += tempForce1; // dEdR1 and dEdR2 are declared outside this fragment.
+    dEdR2 += tempForce2;
+#endif
+}
--- a/platforms/cuda2/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda2/src/kernels/customGBEnergyN2.cu
+#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0xFFFFFFFF))); // Atomically add this thread's own derivative to derivBuffers as 64-bit fixed point (value scaled by 0xFFFFFFFF).
+#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0xFFFFFFFF))); // Same, but for the derivative held in this thread's localData slot.
+#define TILE_SIZE 32 // Number of atoms along each edge of a tile; also the group width used to divide tiles among warps.
+
+
+typedef struct {
+    real4 posq; // Copied from the posq array; only x, y and z are read in this file.
+    real3 force; // Force accumulated for the atom held in this slot.
+    ATOM_PARAMETER_DATA
+#ifdef NEED_PADDING
+    float padding;
+#endif
+} AtomData; // The kernel below keeps one of these per thread in shared memory.
+
+/**
+ * Compute a force based on pair interactions.
+ *
+ * Threads are grouped into warps of TILE_SIZE.  Each warp walks a contiguous range of tiles (pos up to end)
+ * and handles one tile per loop iteration.  A tile pairs the TILE_SIZE atoms of block x with those of block y.
+ * When a cutoff is used, tile coordinates are read from the tiles array unless interactionCount[0] exceeds
+ * maxTiles, in which case all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are processed instead.
+ *
+ * Forces are added to forceBuffers with atomicAdd as 64-bit fixed point values (scaled by 0xFFFFFFFF), laid
+ * out as three consecutive runs of PADDED_NUM_ATOMS entries for x, y and z.  Each thread adds the energy it
+ * accumulated to energyBuffer at its global thread index.
+ *
+ * NOTE(review): the threads of a warp exchange data through localData, exclusionRange and exclusionIndex
+ * without any explicit barrier.  This appears to assume lockstep execution within a warp - confirm that this
+ * holds on the target devices.  Several upper-case placeholders used in the body are not defined in this
+ * file and are presumably supplied by the host code before compilation.
+ */
+extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
+        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices,
+        const unsigned int* __restrict__ exclusionRowIndices,
+#ifdef USE_CUTOFF
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
+#else
+        unsigned int numTiles
+#endif
+        PARAMETER_ARGUMENTS) {
+    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // Global index of the warp this thread belongs to.
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps; // First tile for this warp.
+    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps; // One past the last tile for this warp.
+#else
+    unsigned int pos = warp*numTiles/totalWarps; // First tile for this warp.
+    unsigned int end = (warp+1)*numTiles/totalWarps; // One past the last tile for this warp.
+#endif
+    real energy = 0; // Per-thread energy total, written out after the loop.
+    unsigned int lasty = 0xFFFFFFFF; // Block y of the previous tile; this initial value matches no real block.
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
+    __shared__ int exclusionIndex[WARPS_PER_GROUP];
+    
+    do {
+        // Extract the coordinates of this tile
+        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // Lane index within the warp.
+        const unsigned int tbx = threadIdx.x - tgx; // Index of the warp's first thread within the block.
+        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; // Index of the warp within the block.
+        unsigned int x, y;
+        real3 force = make_real3(0);
+        DECLARE_ATOM1_DERIVATIVES
+        if (pos < end) {
+#ifdef USE_CUTOFF
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
+#endif
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); // Recover the block coordinates from the linear tile index.
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
+            }
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+
+            // Locate the exclusion data for this tile.
+
+#ifdef USE_EXCLUSIONS
+            if (tgx < 2)
+                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx]; // Lanes 0 and 1 fetch the start and end of row x in the exclusion index list.
+            if (tgx == 0)
+                exclusionIndex[localGroupIndex] = -1; // -1 means no exclusion entry was found for this tile.
+            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+                if (exclusionIndices[i] == y)
+                    exclusionIndex[localGroupIndex] = i*TILE_SIZE; // Offset of this tile's per-thread masks in the exclusions array.
+            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+#else
+            bool hasExclusions = false;
+#endif
+            if (pos >= end) // Cannot be true here, because the enclosing block already requires pos < end.
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.
+
+                const unsigned int localAtomIndex = threadIdx.x;
+                localData[localAtomIndex].posq = posq1;
+                LOAD_LOCAL_PARAMETERS_FROM_1
+#ifdef USE_EXCLUSIONS
+                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx]; // One 32-bit mask per thread; a cleared bit marks an excluded partner.
+#endif
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
+                    int atom2 = tbx+j; // Shared-memory slot of the partner atom.
+                    real4 posq2 = localData[atom2].posq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j; // From here on atom2 is the global atom index.
+                    real dEdR = 0;
+                    real tempEnergy = 0;
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+                        COMPUTE_INTERACTION
+                        dEdR /= -r;
+                    }
+                    energy += 0.5f*tempEnergy; // Each pair in a diagonal tile is visited from both of its atoms, so count half each time.
+                    delta *= dEdR;
+                    force -= delta;
+#ifdef USE_CUTOFF
+                    }
+#endif
+#ifdef USE_EXCLUSIONS
+                    excl >>= 1;
+#endif
+                }
+            }
+            else {
+                // This is an off-diagonal tile.
+
+                const unsigned int localAtomIndex = threadIdx.x;
+                if (lasty != y) { // Reload the block y data only when y differs from the previous tile.
+                    unsigned int j = y*TILE_SIZE + tgx;
+                    localData[localAtomIndex].posq = posq[j];
+                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                }
+                localData[localAtomIndex].force = make_real3(0);
+                CLEAR_LOCAL_DERIVATIVES
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (!hasExclusions && flags == 0) {
+                    // No interactions in this tile.
+                }
+                else
+#endif
+                {
+                    // Compute the full set of interactions in this tile.
+
+#ifdef USE_EXCLUSIONS
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
+                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); // Rotate so that bit 0 lines up with the first partner index, tj = tgx.
+#endif
+                    unsigned int tj = tgx; // Each lane starts at a different partner.
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = !(excl & 0x1);
+#endif
+                        int atom2 = tbx+tj; // Shared-memory slot of the partner atom.
+                        real4 posq2 = localData[atom2].posq;
+                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                        if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = y*TILE_SIZE+tj; // From here on atom2 is the global atom index.
+                        real dEdR = 0;
+                        real tempEnergy = 0;
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                            COMPUTE_INTERACTION
+                            dEdR /= -r;
+                        }
+                        energy += tempEnergy;
+                        delta *= dEdR;
+                        force -= delta;
+                        atom2 = tbx+tj; // Back to the shared-memory slot index.
+                        localData[atom2].force += delta; // Equal and opposite contribution for the partner atom.
+                        RECORD_DERIVATIVE_2
+#ifdef USE_CUTOFF
+                        }
+#endif
+#ifdef USE_EXCLUSIONS
+                        excl >>= 1;
+#endif
+                        tj = (tj + 1) & (TILE_SIZE - 1); // Advance to the next partner, wrapping around within the tile.
+                    }
+                }
+            }
+        }
+        lasty = y;
+        
+        // Write results.
+        
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx; // Global index of this thread's own atom.
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
+            STORE_DERIVATIVES_1
+        }
+        if (pos < end && x != y) {
+            const unsigned int offset = y*TILE_SIZE + tgx; // Global index of the block y atom held in this thread's slot.
+            atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0xFFFFFFFF)));
+            atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0xFFFFFFFF)));
+            STORE_DERIVATIVES_2
+        }
+        pos++;
+    } while (pos < end);
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
--- a/platforms/cuda2/src/kernels/customGBEnergyPerParticle.cu
+++ b/platforms/cuda2/src/kernels/customGBEnergyPerParticle.cu
+/**
+ * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
+ *
+ * Atoms are processed in a grid-stride loop, so any launch size covers every atom.  Each thread adds the
+ * energy it accumulated to energyBuffer at its global thread index.  The tail of the parameter list and the
+ * upper-case statements in the loop body are placeholders that are not defined in this file.
+ */
+
+extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq
+        PARAMETER_ARGUMENTS) {
+    real energy = 0; // Per-thread energy total, written out after the loop.
+    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        // Load the derivatives
+
+        LOAD_DERIVATIVES
+
+        // Now calculate the per-particle energy terms.
+
+        real4 pos = posq[index];
+        real3 force = make_real3(0, 0, 0); // Starts at zero; presumably updated and stored by the substituted code below.
+        COMPUTE_ENERGY
+    }
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
+}
--- a/platforms/cuda2/src/kernels/customGBGradientChainRule.cu
+++ b/platforms/cuda2/src/kernels/customGBGradientChainRule.cu
+/**
+ * Compute chain rule terms for computed values that depend explicitly on particle coordinates.
+ *
+ * For each atom (grid-stride loop) the force is read from forceBuffers, where it is stored as 64-bit fixed
+ * point scaled by 0xFFFFFFFF in three runs of PADDED_NUM_ATOMS entries (x, y, z).  It is converted to a real
+ * value, passed through the substituted code, and then written back in place in the same fixed point format.
+ * The write back is a plain store, not an atomic add.
+ */
+
+extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__ forceBuffers, const real4* __restrict__ posq
+        PARAMETER_ARGUMENTS) {
+    const real scale = RECIP((real) 0xFFFFFFFF); // Factor that converts the fixed point representation back to real.
+    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        real4 pos = posq[index];
+        real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]);
+        COMPUTE_FORCES
+        forceBuffers[index] = (long long) (force.x*0xFFFFFFFF); // Overwrites the stored value with the updated force.
+        forceBuffers[index+PADDED_NUM_ATOMS] = (long long) (force.y*0xFFFFFFFF);
+        forceBuffers[index+PADDED_NUM_ATOMS*2] = (long long) (force.z*0xFFFFFFFF);
+    }
+}
--- a/platforms/cuda2/src/kernels/customGBValueN2.cu
+++ b/platforms/cuda2/src/kernels/customGBValueN2.cu
+#define TILE_SIZE 32 // Number of atoms along each edge of a tile; also the group width used to divide tiles among warps.
+
+typedef struct {
+    real4 posq; // Copied from the posq array; only x, y and z are read in this file.
+    real value, temp; // value: running total for the atom held in this slot; temp: scratch used when summing across the warp.
+    ATOM_PARAMETER_DATA
+#ifdef NEED_PADDING
+    float padding;
+#endif
+} AtomData; // The kernel below keeps one of these per thread in shared memory.
+
+/**
+ * Compute a value based on pair interactions.
+ *
+ * Threads are grouped into warps of TILE_SIZE.  Each warp walks a contiguous range of tiles (pos up to end)
+ * and handles one tile per loop iteration.  A tile pairs the TILE_SIZE atoms of block x with those of block y.
+ * When a cutoff is used, tile coordinates are read from the tiles array unless interactionCount[0] exceeds
+ * maxTiles, in which case all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are processed instead.  For an off-diagonal
+ * tile without exclusions, the interactionFlags word selects which atoms of block y are visited at all.
+ *
+ * Results are added to global_value with atomicAdd as 64-bit fixed point values (scaled by 0xFFFFFFFF), one
+ * entry per atom.
+ *
+ * NOTE(review): the threads of a warp exchange data through localData, exclusionRange and exclusionIndex
+ * without any explicit barrier.  This appears to assume lockstep execution within a warp - confirm that this
+ * holds on the target devices.  Several upper-case placeholders used in the body are not defined in this
+ * file and are presumably supplied by the host code before compilation.
+ */
+extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
+        const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, unsigned long long* __restrict__ global_value,
+#ifdef USE_CUTOFF
+        const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
+#else
+        unsigned int numTiles
+#endif
+        PARAMETER_ARGUMENTS) {
+    unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
+    unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // Global index of the warp this thread belongs to.
+#ifdef USE_CUTOFF
+    unsigned int numTiles = interactionCount[0];
+    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps; // First tile for this warp.
+    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps; // One past the last tile for this warp.
+#else
+    unsigned int pos = warp*numTiles/totalWarps; // First tile for this warp.
+    unsigned int end = (warp+1)*numTiles/totalWarps; // One past the last tile for this warp.
+#endif
+    real energy = 0; // NOTE(review): not referenced anywhere else in the visible kernel text - confirm whether substituted code uses it.
+    unsigned int lasty = 0xFFFFFFFF; // Block y of the previous tile; this initial value matches no real block.
+    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
+    __shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
+    __shared__ int exclusionIndex[WARPS_PER_GROUP];
+    
+    do {
+        // Extract the coordinates of this tile
+        const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // Lane index within the warp.
+        const unsigned int tbx = threadIdx.x - tgx; // Index of the warp's first thread within the block.
+        const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; // Index of the warp within the block.
+        unsigned int x, y;
+        real value = 0; // Total for this thread's own atom in the current tile.
+        if (pos < end) {
+#ifdef USE_CUTOFF
+            if (numTiles <= maxTiles) {
+                ushort2 tileIndices = tiles[pos];
+                x = tileIndices.x;
+                y = tileIndices.y;
+            }
+            else
+#endif
+            {
+                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); // Recover the block coordinates from the linear tile index.
+                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
+                    y += (x < y ? -1 : 1);
+                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
+                }
+            }
+            unsigned int atom1 = x*TILE_SIZE + tgx;
+            real4 posq1 = posq[atom1];
+            LOAD_ATOM1_PARAMETERS
+
+            // Locate the exclusion data for this tile.
+
+#ifdef USE_EXCLUSIONS
+            if (tgx < 2)
+                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx]; // Lanes 0 and 1 fetch the start and end of row x in the exclusion index list.
+            if (tgx == 0)
+                exclusionIndex[localGroupIndex] = -1; // -1 means no exclusion entry was found for this tile.
+            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
+                if (exclusionIndices[i] == y)
+                    exclusionIndex[localGroupIndex] = i*TILE_SIZE; // Offset of this tile's per-thread masks in the exclusions array.
+            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
+#else
+            bool hasExclusions = false;
+#endif
+            if (pos >= end) // Cannot be true here, because the enclosing block already requires pos < end.
+                ; // This warp is done.
+            else if (x == y) {
+                // This tile is on the diagonal.
+
+                const unsigned int localAtomIndex = threadIdx.x;
+                localData[localAtomIndex].posq = posq1;
+                LOAD_LOCAL_PARAMETERS_FROM_1
+#ifdef USE_EXCLUSIONS
+                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx]; // One 32-bit mask per thread; a cleared bit marks an excluded partner.
+#endif
+                for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                    bool isExcluded = !(excl & 0x1);
+#endif
+                    int atom2 = tbx+j; // Shared-memory slot of the partner atom.
+                    real4 posq2 = localData[atom2].posq;
+                    real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                    if (r2 < CUTOFF_SQUARED) {
+#endif
+                    real invR = RSQRT(r2);
+                    real r = RECIP(invR);
+                    LOAD_ATOM2_PARAMETERS
+                    atom2 = y*TILE_SIZE+j; // From here on atom2 is the global atom index.
+                    real tempValue1 = 0;
+                    real tempValue2 = 0;
+#ifdef USE_EXCLUSIONS
+                    if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+#else
+                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
+#endif
+                        COMPUTE_VALUE
+                    }
+                    value += tempValue1; // Only the first result is kept on the diagonal; the second is discarded here.
+#ifdef USE_CUTOFF
+                    }
+#endif
+#ifdef USE_EXCLUSIONS
+                    excl >>= 1;
+#endif
+                }
+            }
+            else {
+                // This is an off-diagonal tile.
+
+                if (lasty != y) { // Reload the block y data only when y differs from the previous tile.
+                    unsigned int j = y*TILE_SIZE + tgx;
+                    localData[threadIdx.x].posq = posq[j];
+                    const unsigned int localAtomIndex = threadIdx.x;
+                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
+                }
+                localData[threadIdx.x].value = 0;
+#ifdef USE_CUTOFF
+                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
+                if (!hasExclusions && flags != 0xFFFFFFFF) {
+                    if (flags == 0) {
+                        // No interactions in this tile.
+                    }
+                    else {
+                        // Compute only a subset of the interactions in this tile.
+
+                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
+                            if ((flags&(1<<j)) != 0) { // Bit j set: every lane interacts with partner j in this pass.
+                                int atom2 = tbx+j; // Shared-memory slot of the partner atom.
+                                real4 posq2 = localData[atom2].posq;
+                                real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+                                real tempValue1 = 0;
+                                real tempValue2 = 0;
+                                if (r2 < CUTOFF_SQUARED) {
+                                    real invR = RSQRT(r2);
+                                    real r = RECIP(invR);
+                                    LOAD_ATOM2_PARAMETERS
+                                    atom2 = y*TILE_SIZE+j; // From here on atom2 is the global atom index.
+                                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+                                        COMPUTE_VALUE
+                                    }
+                                    value += tempValue1;
+                                }
+                                localData[threadIdx.x].temp = tempValue2; // Written by every lane, even beyond the cutoff, so the sum below sees a defined value.
+
+                                // Sum the values contributed to atom2 by all threads of the warp.
+
+                                if (tgx % 4 == 0) // First stage: every fourth lane adds up its group of four.
+                                    localData[threadIdx.x].temp += localData[threadIdx.x+1].temp+localData[threadIdx.x+2].temp+localData[threadIdx.x+3].temp;
+                                if (tgx == 0) // Second stage: lane 0 adds the eight partial sums into the partner's total.
+                                    localData[tbx+j].value += localData[threadIdx.x].temp+localData[threadIdx.x+4].temp+localData[threadIdx.x+8].temp+localData[threadIdx.x+12].temp+localData[threadIdx.x+16].temp+localData[threadIdx.x+20].temp+localData[threadIdx.x+24].temp+localData[threadIdx.x+28].temp;
+                            }
+                        }
+                    }
+                }
+                else
+#endif
+                {
+                    // Compute the full set of interactions in this tile.
+
+#ifdef USE_EXCLUSIONS
+                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
+                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); // Rotate so that bit 0 lines up with the first partner index, tj = tgx.
+#endif
+                    unsigned int tj = tgx; // Each lane starts at a different partner.
+                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
+#ifdef USE_EXCLUSIONS
+                        bool isExcluded = !(excl & 0x1);
+#endif
+                        int atom2 = tbx+tj; // Shared-memory slot of the partner atom.
+                        real4 posq2 = localData[atom2].posq;
+                        real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
+#ifdef USE_PERIODIC
+                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
+                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
+                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
+#endif
+                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
+#ifdef USE_CUTOFF
+                        if (r2 < CUTOFF_SQUARED) {
+#endif
+                        real invR = RSQRT(r2);
+                        real r = RECIP(invR);
+                        LOAD_ATOM2_PARAMETERS
+                        atom2 = y*TILE_SIZE+tj; // From here on atom2 is the global atom index.
+                        real tempValue1 = 0;
+                        real tempValue2 = 0;
+#ifdef USE_EXCLUSIONS
+                        if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#else
+                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
+#endif
+                            COMPUTE_VALUE
+                        }
+                        value += tempValue1;
+                        localData[tbx+tj].value += tempValue2; // Contribution to the partner atom's total.
+#ifdef USE_CUTOFF
+                        }
+#endif
+#ifdef USE_EXCLUSIONS
+                        excl >>= 1;
+#endif
+                        tj = (tj + 1) & (TILE_SIZE - 1); // Advance to the next partner, wrapping around within the tile.
+                    }
+                }
+            }
+        }
+        
+        // Write results.
+        
+        if (pos < end) {
+            const unsigned int offset = x*TILE_SIZE + tgx; // Global index of this thread's own atom.
+            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0xFFFFFFFF)));
+        }
+        if (pos < end && x != y) {
+            const unsigned int offset = y*TILE_SIZE + tgx; // Global index of the block y atom held in this thread's slot.
+            atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0xFFFFFFFF)));
+        }
+        lasty = y;
+        pos++;
+    } while (pos < end);
+}
--- a/platforms/cuda2/src/kernels/customGBValuePerParticle.cu
+++ b/platforms/cuda2/src/kernels/customGBValuePerParticle.cu
+/**
+ * Reduce a pairwise computed value, and compute per-particle values.
+ *
+ * Atoms are processed in a grid-stride loop.  For each atom the entry of valueBuffers is converted from
+ * 64-bit fixed point (scaled by 0xFFFFFFFF) to a real value named sum, which is then available to the
+ * substituted code along with the atom's position.  The tail of the parameter list and the final statement
+ * of the loop body are placeholders that are not defined in this file.
+ */
+
+extern "C" __global__ void computePerParticleValues(real4* posq, long long* valueBuffers
+        PARAMETER_ARGUMENTS) {
+    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
+        // Load the pairwise value
+
+        real sum = valueBuffers[index]/(real) 0xFFFFFFFF; // Convert from fixed point back to real.
+        
+        // Now calculate other values
+
+        real4 pos = posq[index];
+        COMPUTE_VALUES
+    }
+}
--- a/platforms/cuda2/tests/TestCudaCustomGBForce.cpp
+++ b/platforms/cuda2/tests/TestCudaCustomGBForce.cpp