Commit 59bd8d19 authored by peastman's avatar peastman
Browse files

Merge pull request #1165 from peastman/mixedenergy

Further improved energy accuracy in mixed precision mode
parents a20944f6 1f2b65da
......@@ -53,11 +53,11 @@ real4 computeCross(real4 vec1, real4 vec2) {
/**
* Compute forces on donors.
*/
__kernel void computeDonorForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__kernel void computeDonorForces(__global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local real4* posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
real energy = 0;
mixed energy = 0;
real4 f1 = (real4) 0;
real4 f2 = (real4) 0;
real4 f3 = (real4) 0;
......@@ -142,7 +142,7 @@ __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global
/**
* Compute forces on acceptors.
*/
__kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local real4* restrict posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
......
......@@ -72,7 +72,7 @@ inline bool isInteractionExcluded(int atom1, int atom2, __global int* restrict e
* Compute the interaction.
*/
__kernel void computeInteraction(
__global long* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq,
__global long* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#ifdef USE_CUTOFF
, __global const int* restrict neighbors, __global const int* restrict neighborStartIndex
......@@ -84,7 +84,7 @@ __kernel void computeInteraction(
, __global int* restrict exclusions, __global int* restrict exclusionStartIndex
#endif
PARAMETER_ARGUMENTS) {
real energy = 0.0f;
mixed energy = 0;
// Loop over particles to be the first one in the set.
......
......@@ -42,14 +42,14 @@ __kernel void computeInteractionGroups(
#else
__global real4* restrict forceBuffers,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict groupData,
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict groupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = get_local_id(0) - tgx; // block warpIndex
real energy = 0.0f;
mixed energy = 0;
__local AtomData localData[LOCAL_MEMORY_SIZE];
const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
......
......@@ -6,13 +6,13 @@ real2 multofReal2(real2 a, real2 b) {
* Precompute the cosine and sine sums which appear in each force term.
*/
__kernel void calculateEwaldCosSinSums(__global real* restrict energyBuffer, __global const real4* restrict posq, __global real2* restrict cosSinSum, real4 reciprocalPeriodicBoxSize, real reciprocalCoefficient) {
__kernel void calculateEwaldCosSinSums(__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global real2* restrict cosSinSum, real4 reciprocalPeriodicBoxSize, real reciprocalCoefficient) {
const unsigned int ksizex = 2*KMAX_X-1;
const unsigned int ksizey = 2*KMAX_Y-1;
const unsigned int ksizez = 2*KMAX_Z-1;
const unsigned int totalK = ksizex*ksizey*ksizez;
unsigned int index = get_global_id(0);
real energy = 0.0f;
mixed energy = 0;
while (index < (KMAX_Y-1)*ksizez+KMAX_Z)
index += get_global_size(0);
while (index < totalK) {
......
......@@ -387,7 +387,7 @@ __kernel void computeGBSAForce1(
#else
__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
......@@ -400,7 +400,7 @@ __kernel void computeGBSAForce1(
const unsigned int warp = get_global_id(0)/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
real energy = 0.0f;
mixed energy = 0;
__local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions.
......
......@@ -50,8 +50,8 @@ __kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bor
#ifdef SUPPORTS_64_BIT_ATOMICS
__global const long* restrict bornForceIn,
#endif
__global real* restrict energyBuffer, __global const float2* restrict params, __global const real* restrict bornRadii, __global const real* restrict obcChain) {
real energy = 0.0f;
__global mixed* restrict energyBuffer, __global const float2* restrict params, __global const real* restrict bornRadii, __global const real* restrict obcChain) {
mixed energy = 0;
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Sum the Born force
......
......@@ -409,7 +409,7 @@ __kernel void computeGBSAForce1(
#else
__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter,
......@@ -418,7 +418,7 @@ __kernel void computeGBSAForce1(
unsigned int numTiles,
#endif
__global const ushort2* exclusionTiles) {
real energy = 0.0f;
mixed energy = 0;
__local AtomData2 localData[TILE_SIZE];
// First loop: process tiles that contain exclusions.
......
......@@ -22,7 +22,7 @@ __kernel void computeNonbonded(
#else
__global real4* restrict forceBuffers,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF
, __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
......@@ -429,6 +429,6 @@ __kernel void computeNonbonded(
pos++;
}
#ifdef INCLUDE_ENERGY
energyBuffer[get_global_id(0)] += (real) energy;
energyBuffer[get_global_id(0)] += energy;
#endif
}
......@@ -19,7 +19,7 @@ __kernel void computeNonbonded(
#else
__global real4* restrict forceBuffers,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF
, __global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
......@@ -27,7 +27,7 @@ __kernel void computeNonbonded(
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms
#endif
PARAMETER_ARGUMENTS) {
real energy = 0;
mixed energy = 0;
__local AtomData localData[TILE_SIZE];
// First loop: process tiles that contain exclusions.
......
......@@ -325,14 +325,14 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global c
}
}
__kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global real* restrict energyBuffer,
__kernel void gridEvaluateEnergy(__global real2* restrict pmeGrid, __global mixed* restrict energyBuffer,
__global const real* restrict pmeBsplineModuliX, __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
const real recipScaleFactor = (1.0f/M_PI)*recipBoxVecX.x*recipBoxVecY.y*recipBoxVecZ.z;
real energy = 0;
mixed energy = 0;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
......
......@@ -23,8 +23,8 @@ extern "C" __global__ void reduceBornSum(const long long* __restrict__ bornSum,
/**
* Apply the surface area term to the force and energy.
*/
extern "C" __global__ void computeSurfaceAreaForce(long long* __restrict__ bornForce, real* __restrict__ energyBuffer, const float2* __restrict__ params, const real* __restrict__ bornRadii) {
real energy = 0;
extern "C" __global__ void computeSurfaceAreaForce(long long* __restrict__ bornForce, mixed* __restrict__ energyBuffer, const float2* __restrict__ params, const real* __restrict__ bornRadii) {
mixed energy = 0;
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
real bornRadius = bornRadii[index];
float radius = params[index].x;
......@@ -216,7 +216,7 @@ inline __device__ void zeroAtomData(AtomData2& data) {
* Compute electrostatic interactions.
*/
extern "C" __global__ void computeGKForces(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, mixed* __restrict__ energyBuffer,
const real4* __restrict__ posq, unsigned int startTileIndex, unsigned int numTileIndices, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar,
const real* __restrict__ bornRadii, unsigned long long* __restrict__ bornForce) {
......@@ -225,7 +225,7 @@ extern "C" __global__ void computeGKForces(
const unsigned int numTiles = numTileIndices;
unsigned int pos = (unsigned int) (startTileIndex+warp*(long long)numTiles/totalWarps);
unsigned int end = (unsigned int) (startTileIndex+(warp+1)*(long long)numTiles/totalWarps);
real energy = 0;
mixed energy = 0;
__shared__ AtomData2 localData[GK_FORCE_THREAD_BLOCK_SIZE];
do {
......@@ -605,7 +605,7 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
* Compute electrostatic interactions.
*/
extern "C" __global__ void computeEDiffForce(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, mixed* __restrict__ energyBuffer,
const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
......@@ -615,7 +615,7 @@ extern "C" __global__ void computeEDiffForce(
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0;
mixed energy = 0;
__shared__ AtomData4 localData[EDIFF_THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
......
......@@ -191,14 +191,14 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real rmi
/**
* Compute WCA interaction.
*/
extern "C" __global__ void computeWCAForce(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
extern "C" __global__ void computeWCAForce(unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer,
const real4* __restrict__ posq, unsigned int startTileIndex, unsigned int numTileIndices, const float2* __restrict__ radiusEpsilon) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int numTiles = numTileIndices;
unsigned int pos = (unsigned int) (startTileIndex+warp*(long long)numTiles/totalWarps);
unsigned int end = (unsigned int) (startTileIndex+(warp+1)*(long long)numTiles/totalWarps);
real energy = 0;
mixed energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
do {
......
......@@ -54,7 +54,7 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
return (x && y ? 0.0f : (x && p ? 0.5f : 1.0f));
}
__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor, real& energy) {
__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor, mixed& energy) {
// Compute the displacement.
real3 delta;
......@@ -374,7 +374,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
* Compute electrostatic interactions.
*/
extern "C" __global__ void computeElectrostatics(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, mixed* __restrict__ energyBuffer,
const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF
......@@ -388,7 +388,7 @@ extern "C" __global__ void computeElectrostatics(
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0;
mixed energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
......
......@@ -874,14 +874,14 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
}
extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers,
long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole,
long long* __restrict__ torqueBuffers, mixed* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ fracDipole, const real* __restrict__ fracQuadrupole,
const real* __restrict__ phi, const real* __restrict__ cphi_global, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
real multipole[10];
const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18};
real energy = 0;
mixed energy = 0;
__shared__ real fracToCart[3][3];
if (threadIdx.x == 0) {
fracToCart[0][0] = GRID_SIZE_X*recipBoxVecX.x;
......@@ -956,7 +956,7 @@ extern "C" __global__ void computeFixedMultipoleForceAndEnergy(real4* __restrict
}
extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers,
long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole,
long long* __restrict__ torqueBuffers, mixed* __restrict__ energyBuffer, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, const real* __restrict__ fracDipole, const real* __restrict__ fracQuadrupole,
const real* __restrict__ inducedDipole_global, const real* __restrict__ inducedDipolePolar_global,
const real* __restrict__ phi, const real* __restrict__ phid, const real* __restrict__ phip,
......@@ -967,7 +967,7 @@ extern "C" __global__ void computeInducedDipoleForceAndEnergy(real4* __restrict_
const int deriv1[] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19};
const int deriv2[] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16};
const int deriv3[] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18};
real energy = 0;
mixed energy = 0;
__shared__ real fracToCart[3][3];
if (threadIdx.x == 0) {
fracToCart[0][0] = GRID_SIZE_X*recipBoxVecX.x;
......
......@@ -56,7 +56,7 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
}
__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool hasExclusions, float dScale, float pScale, float mScale, float forceFactor,
real& energy, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
mixed& energy, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
// Compute the displacement.
real3 delta;
......@@ -411,7 +411,7 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, bool has
/**
* Compute the self energy and self torque.
*/
__device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
__device__ void computeSelfEnergyAndTorque(AtomData& atom1, mixed& energy) {
real cii = atom1.q*atom1.q;
real3 dipole = make_real3(atom1.sphericalDipole.y, atom1.sphericalDipole.z, atom1.sphericalDipole.x);
real dii = dot(dipole, dipole+atom1.inducedDipole);
......@@ -439,7 +439,7 @@ __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
* Compute electrostatic interactions.
*/
extern "C" __global__ void computeElectrostatics(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, mixed* __restrict__ energyBuffer,
const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF
......@@ -453,7 +453,7 @@ extern "C" __global__ void computeElectrostatics(
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0;
mixed energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment