Merge pull request #1165 from peastman/mixedenergy

Further improved energy accuracy in mixed precision mode

Merge pull request #1165 from peastman/mixedenergy
Further improved energy accuracy in mixed precision mode
59bd8d19 · peastman · a20944f6 · 1f2b65da · 59bd8d19 · 59bd8d19
Commit 59bd8d19 authored Oct 02, 2015 by peastman
20 changed files
--- a/platforms/cuda/src/CudaBondedUtilities.cpp
+++ b/platforms/cuda/src/CudaBondedUtilities.cpp
@@ -99,7 +99,7 @@ void CudaBondedUtilities::initialize(const System& system) {
    s<<CudaKernelSources::vectorOps;
    for (int i = 0; i < (int) prefixCode.size(); i++)
        s<<prefixCode[i];
-    s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
+    s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
    for (int force = 0; force < numForces; force++) {
        for (int i = 0; i < (int) atomIndices[force].size(); i++) {
            int indexWidth = atomIndices[force][i]->getElementSize()/4;
@@ -110,7 +110,7 @@ void CudaBondedUtilities::initialize(const System& system) {
    for (int i = 0; i < (int) arguments.size(); i++)
        s<<", "<<argTypes[i]<<"* customArg"<<(i+1);
    s<<") {\n";
-    s<<"real energy = 0;\n";
+    s<<"mixed energy = 0;\n";
    for (int force = 0; force < numForces; force++)
        s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
    s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n";

--- a/platforms/cuda/src/CudaContext.cpp
+++ b/platforms/cuda/src/CudaContext.cpp
@@ -368,7 +368,7 @@ void CudaContext::initialize() {
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else if (useMixedPrecision) {
-        energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
+        energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }

--- a/platforms/cuda/src/CudaKernels.cpp
+++ b/platforms/cuda/src/CudaKernels.cpp
@@ -112,7 +112,7 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
    if (includeEnergy) {
        CudaArray& energyArray = cu.getEnergyBuffer();
-        if (cu.getUseDoublePrecision()) {
+        if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
            double* energy = (double*) cu.getPinnedBuffer();
            energyArray.download(energy);
            for (int i = 0; i < energyArray.getSize(); i++)

--- a/platforms/cuda/src/kernels/customCentroidBond.cu
+++ b/platforms/cuda/src/kernels/customCentroidBond.cu
@@ -104,10 +104,10 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
 /**
 * Compute the forces on groups based on the bonds.
 */
-extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, real* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
+extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, mixed* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
        const int* __restrict__ bondGroups
        EXTRA_ARGS) {
-    real energy = 0;
+    mixed energy = 0;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) {
        COMPUTE_FORCE
    }

--- a/platforms/cuda/src/kernels/customGBEnergyN2.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyN2.cu
@@ -13,7 +13,7 @@ typedef struct {
 /**
 * Compute a force based on pair interactions.
 */
-extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
+extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer,
        const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
 #ifdef USE_CUTOFF
        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
@@ -27,7 +27,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
-    real energy = 0;
+    mixed energy = 0;
    __shared__ AtomData localData[THREAD_BLOCK_SIZE];
    // First loop: process tiles that contain exclusions.

--- a/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
+++ b/platforms/cuda/src/kernels/customGBEnergyPerParticle.cu
@@ -2,9 +2,9 @@
 * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
 */
-extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq
+extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq
        PARAMETER_ARGUMENTS) {
-    real energy = 0;
+    mixed energy = 0;
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        // Load the derivatives

--- a/platforms/cuda/src/kernels/customHbondForce.cu
+++ b/platforms/cuda/src/kernels/customHbondForce.cu
@@ -66,12 +66,12 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
 /**
 * Compute forces on donors.
 */
-extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    extern __shared__ real4 posBuffer[];
-    real energy = 0;
+    mixed energy = 0;
    real3 f1 = make_real3(0);
    real3 f2 = make_real3(0);
    real3 f3 = make_real3(0);
@@ -155,7 +155,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
 /**
 * Compute forces on acceptors.
 */
-extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {

--- a/platforms/cuda/src/kernels/customManyParticle.cu
+++ b/platforms/cuda/src/kernels/customManyParticle.cu
@@ -78,7 +78,7 @@ __constant__ float globals[NUM_GLOBALS];
 * Compute the interaction.
 */
 extern "C" __global__ void computeInteraction(
-        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
+        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
 #ifdef USE_CUTOFF
        , const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
@@ -90,7 +90,7 @@ extern "C" __global__ void computeInteraction(
        , int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
 #endif
        PARAMETER_ARGUMENTS) {
-    real energy = 0.0f;
+    mixed energy = 0;
    // Loop over particles to be the first one in the set.

--- a/platforms/cuda/src/kernels/customNonbondedGroups.cu
+++ b/platforms/cuda/src/kernels/customNonbondedGroups.cu
@@ -9,14 +9,14 @@ typedef struct {
 } AtomData;
 extern "C" __global__ void computeInteractionGroups(
-        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
+        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
    const unsigned int tbx = threadIdx.x - tgx;           // block warpIndex
-    real energy = 0.0f;
+    mixed energy = 0;
    __shared__ AtomData localData[LOCAL_MEMORY_SIZE];
    const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;

--- a/platforms/cuda/src/kernels/ewald.cu
+++ b/platforms/cuda/src/kernels/ewald.cu
@@ -6,7 +6,7 @@ __device__ real2 multofReal2(real2 a, real2 b) {
 * Precompute the cosine and sine sums which appear in each force term.
 */
-extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuffer, const real4* __restrict__ posq, real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
+extern "C" __global__ void calculateEwaldCosSinSums(mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
    const unsigned int ksizex = 2*KMAX_X-1;
    const unsigned int ksizey = 2*KMAX_Y-1;
    const unsigned int ksizez = 2*KMAX_Z-1;
@@ -14,7 +14,7 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
    real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z);
    real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
    unsigned int index = blockIdx.x*blockDim.x+threadIdx.x;
-    real energy = 0;
+    mixed energy = 0;
    while (index < (KMAX_Y-1)*ksizez+KMAX_Z)
        index += blockDim.x*gridDim.x;
    while (index < totalK) {

--- a/platforms/cuda/src/kernels/gbsaObc1.cu
+++ b/platforms/cuda/src/kernels/gbsaObc1.cu
@@ -33,9 +33,9 @@ extern "C" __global__ void reduceBornSum(float alpha, float beta, float gamma, c
 * Reduce the Born force.
 */
-extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, real* __restrict__ energyBuffer,
+extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, mixed* __restrict__ energyBuffer,
        const float2* __restrict__ params, const real* __restrict__ bornRadii, const real* __restrict__ obcChain) {
-    real energy = 0;
+    mixed energy = 0;
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
        // Get summed Born force
@@ -402,7 +402,7 @@ typedef struct {
 */
 extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce,
-        real* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii,
+        mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii,
 #ifdef USE_CUTOFF
        const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
@@ -415,7 +415,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
    const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
    const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
    const unsigned int tbx = threadIdx.x - tgx;
-    real energy = 0;
+    mixed energy = 0;
    __shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE];
    // First loop: process tiles that contain exclusions.

--- a/platforms/cuda/src/kernels/nonbonded.cu
+++ b/platforms/cuda/src/kernels/nonbonded.cu
@@ -100,7 +100,7 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
 *
 */
 extern "C" __global__ void computeNonbonded(
-        unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
+        unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
        const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
 #ifdef USE_CUTOFF
        , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize, 
@@ -583,6 +583,6 @@ extern "C" __global__ void computeNonbonded(
        pos++;
    }
 #ifdef INCLUDE_ENERGY
-    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += (real) energy;
+    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
 #endif
 }
\ No newline at end of file
--- a/platforms/cuda/src/kernels/pme.cu
+++ b/platforms/cuda/src/kernels/pme.cu
@@ -115,7 +115,7 @@ extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPm
 // convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
 extern "C" __global__ void 
-reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ energyBuffer, 
+reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer, 
                      const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, 
                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
@@ -150,14 +150,14 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict_
 extern "C" __global__ void
-gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ energyBuffer,
+gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
                      const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
                      real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
    // R2C stores into a half complex matrix where the last dimension is cut by half
    const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
-    real energy = 0;
+    mixed energy = 0;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
        // real indices
        int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));

--- a/platforms/opencl/src/OpenCLBondedUtilities.cpp
+++ b/platforms/opencl/src/OpenCLBondedUtilities.cpp
@@ -181,7 +181,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
        for (int i = 0; i < (int) prefixCode.size(); i++)
            s<<prefixCode[i];
        string bufferType = (context.getSupports64BitGlobalAtomics() ? "long" : "real4");
-        s<<"__kernel void computeBondedForces(__global "<<bufferType<<"* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
+        s<<"__kernel void computeBondedForces(__global "<<bufferType<<"* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
        for (int i = 0; i < setSize; i++) {
            int force = set[i];
            string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
@@ -191,7 +191,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
        for (int i = 0; i < (int) arguments.size(); i++)
            s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
        s<<") {\n";
-        s<<"real energy = 0.0f;\n";
+        s<<"mixed energy = 0;\n";
        for (int i = 0; i < setSize; i++) {
            int force = set[i];
            s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);

--- a/platforms/opencl/src/OpenCLContext.cpp
+++ b/platforms/opencl/src/OpenCLContext.cpp
@@ -460,7 +460,7 @@ void OpenCLContext::initialize() {
    else {
        forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
        force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
-        energyBuffer = OpenCLArray::create<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
+        energyBuffer = OpenCLArray::create<cl_double>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
    }
    if (supports64BitGlobalAtomics) {
        longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer");

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -136,7 +136,7 @@ double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context,
    cl.getIntegrationUtilities().distributeForcesFromVirtualSites();
    if (includeEnergy) {
        OpenCLArray& energyArray = cl.getEnergyBuffer();
-        if (cl.getUseDoublePrecision()) {
+        if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
            double* energy = (double*) cl.getPinnedBuffer();
            energyArray.download(energy);
            for (int i = 0; i < energyArray.getSize(); i++)

--- a/platforms/opencl/src/kernels/customCentroidBond.cl
+++ b/platforms/opencl/src/kernels/customCentroidBond.cl
@@ -105,10 +105,10 @@ real4 computeCross(real4 vec1, real4 vec2) {
 /**
 * Compute the forces on groups based on the bonds.
 */
-__kernel void computeGroupForces(__global long* restrict groupForce, __global real* restrict energyBuffer, __global const real4* restrict centerPositions,
+__kernel void computeGroupForces(__global long* restrict groupForce, __global mixed* restrict energyBuffer, __global const real4* restrict centerPositions,
        __global const int* restrict bondGroups
        EXTRA_ARGS) {
-    real energy = 0;
+    mixed energy = 0;
    for (int index = get_global_id(0); index < NUM_BONDS; index += get_global_size(0)) {
        COMPUTE_FORCE
    }

--- a/platforms/opencl/src/kernels/customGBEnergyN2.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2.cl
@@ -16,7 +16,7 @@ __kernel void computeN2Energy(
 #else
        __global real4* restrict forceBuffers,
 #endif
-        __global real* restrict energyBuffer, __local real4* restrict local_force,
+        __global mixed* restrict energyBuffer, __local real4* restrict local_force,
        __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
        __global const ushort2* exclusionTiles,
 #ifdef USE_CUTOFF
@@ -31,7 +31,7 @@ __kernel void computeN2Energy(
    const unsigned int warp = get_global_id(0)/TILE_SIZE;
    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
    const unsigned int tbx = get_local_id(0) - tgx;
-    real energy = 0;
+    mixed energy = 0;
    // First loop: process tiles that contain exclusions.

--- a/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
@@ -16,7 +16,7 @@ __kernel void computeN2Energy(
 #else
        __global real4* restrict forceBuffers,
 #endif
-        __global real* restrict energyBuffer, __local real4* restrict local_force,
+        __global mixed* restrict energyBuffer, __local real4* restrict local_force,
        __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
        __global const ushort2* exclusionTiles,
 #ifdef USE_CUTOFF
@@ -27,7 +27,7 @@ __kernel void computeN2Energy(
        unsigned int numTiles
 #endif
        PARAMETER_ARGUMENTS) {
-    real energy = 0;
+    mixed energy = 0;
    // First loop: process tiles that contain exclusions.

--- a/platforms/opencl/src/kernels/customGBEnergyPerParticle.cl
+++ b/platforms/opencl/src/kernels/customGBEnergyPerParticle.cl
@@ -9,9 +9,9 @@
 * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
 */
-__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq
+__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq
        PARAMETER_ARGUMENTS) {
-    real energy = 0;
+    mixed energy = 0;
    unsigned int index = get_global_id(0);
    while (index < NUM_ATOMS) {
        // Reduce the derivatives