Commit 59bd8d19 authored by peastman's avatar peastman
Browse files

Merge pull request #1165 from peastman/mixedenergy

Further improved energy accuracy in mixed precision mode
parents a20944f6 1f2b65da
...@@ -99,7 +99,7 @@ void CudaBondedUtilities::initialize(const System& system) { ...@@ -99,7 +99,7 @@ void CudaBondedUtilities::initialize(const System& system) {
s<<CudaKernelSources::vectorOps; s<<CudaKernelSources::vectorOps;
for (int i = 0; i < (int) prefixCode.size(); i++) for (int i = 0; i < (int) prefixCode.size(); i++)
s<<prefixCode[i]; s<<prefixCode[i];
s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ"; s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
for (int force = 0; force < numForces; force++) { for (int force = 0; force < numForces; force++) {
for (int i = 0; i < (int) atomIndices[force].size(); i++) { for (int i = 0; i < (int) atomIndices[force].size(); i++) {
int indexWidth = atomIndices[force][i]->getElementSize()/4; int indexWidth = atomIndices[force][i]->getElementSize()/4;
...@@ -110,7 +110,7 @@ void CudaBondedUtilities::initialize(const System& system) { ...@@ -110,7 +110,7 @@ void CudaBondedUtilities::initialize(const System& system) {
for (int i = 0; i < (int) arguments.size(); i++) for (int i = 0; i < (int) arguments.size(); i++)
s<<", "<<argTypes[i]<<"* customArg"<<(i+1); s<<", "<<argTypes[i]<<"* customArg"<<(i+1);
s<<") {\n"; s<<") {\n";
s<<"real energy = 0;\n"; s<<"mixed energy = 0;\n";
for (int force = 0; force < numForces; force++) for (int force = 0; force < numForces; force++)
s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]); s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n"; s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n";
......
...@@ -368,7 +368,7 @@ void CudaContext::initialize() { ...@@ -368,7 +368,7 @@ void CudaContext::initialize() {
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else if (useMixedPrecision) { else if (useMixedPrecision) {
energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
......
...@@ -112,7 +112,7 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo ...@@ -112,7 +112,7 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
cu.getIntegrationUtilities().distributeForcesFromVirtualSites(); cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
if (includeEnergy) { if (includeEnergy) {
CudaArray& energyArray = cu.getEnergyBuffer(); CudaArray& energyArray = cu.getEnergyBuffer();
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double* energy = (double*) cu.getPinnedBuffer(); double* energy = (double*) cu.getPinnedBuffer();
energyArray.download(energy); energyArray.download(energy);
for (int i = 0; i < energyArray.getSize(); i++) for (int i = 0; i < energyArray.getSize(); i++)
......
...@@ -104,10 +104,10 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) { ...@@ -104,10 +104,10 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
/** /**
* Compute the forces on groups based on the bonds. * Compute the forces on groups based on the bonds.
*/ */
extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, real* __restrict__ energyBuffer, const real4* __restrict__ centerPositions, extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, mixed* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
const int* __restrict__ bondGroups const int* __restrict__ bondGroups
EXTRA_ARGS) { EXTRA_ARGS) {
real energy = 0; mixed energy = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) {
COMPUTE_FORCE COMPUTE_FORCE
} }
......
...@@ -13,7 +13,7 @@ typedef struct { ...@@ -13,7 +13,7 @@ typedef struct {
/** /**
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles, const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
...@@ -27,7 +27,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -27,7 +27,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx; const unsigned int tbx = threadIdx.x - tgx;
real energy = 0; mixed energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE]; __shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
......
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms. * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/ */
extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
real energy = 0; mixed energy = 0;
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Load the derivatives // Load the derivatives
......
...@@ -66,12 +66,12 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) { ...@@ -66,12 +66,12 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
/** /**
* Compute forces on donors. * Compute forces on donors.
*/ */
extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq, extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
extern __shared__ real4 posBuffer[]; extern __shared__ real4 posBuffer[];
real energy = 0; mixed energy = 0;
real3 f1 = make_real3(0); real3 f1 = make_real3(0);
real3 f2 = make_real3(0); real3 f2 = make_real3(0);
real3 f3 = make_real3(0); real3 f3 = make_real3(0);
...@@ -155,7 +155,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f ...@@ -155,7 +155,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
/** /**
* Compute forces on acceptors. * Compute forces on acceptors.
*/ */
extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq, extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
......
...@@ -78,7 +78,7 @@ __constant__ float globals[NUM_GLOBALS]; ...@@ -78,7 +78,7 @@ __constant__ float globals[NUM_GLOBALS];
* Compute the interaction. * Compute the interaction.
*/ */
extern "C" __global__ void computeInteraction( extern "C" __global__ void computeInteraction(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex , const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
...@@ -90,7 +90,7 @@ extern "C" __global__ void computeInteraction( ...@@ -90,7 +90,7 @@ extern "C" __global__ void computeInteraction(
, int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex , int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
real energy = 0.0f; mixed energy = 0;
// Loop over particles to be the first one in the set. // Loop over particles to be the first one in the set.
......
...@@ -9,14 +9,14 @@ typedef struct { ...@@ -9,14 +9,14 @@ typedef struct {
} AtomData; } AtomData;
extern "C" __global__ void computeInteractionGroups( extern "C" __global__ void computeInteractionGroups(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData, unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
real energy = 0.0f; mixed energy = 0;
__shared__ AtomData localData[LOCAL_MEMORY_SIZE]; __shared__ AtomData localData[LOCAL_MEMORY_SIZE];
const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps; const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
......
...@@ -6,7 +6,7 @@ __device__ real2 multofReal2(real2 a, real2 b) { ...@@ -6,7 +6,7 @@ __device__ real2 multofReal2(real2 a, real2 b) {
* Precompute the cosine and sine sums which appear in each force term. * Precompute the cosine and sine sums which appear in each force term.
*/ */
extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuffer, const real4* __restrict__ posq, real2* __restrict__ cosSinSum, real4 periodicBoxSize) { extern "C" __global__ void calculateEwaldCosSinSums(mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, real2* __restrict__ cosSinSum, real4 periodicBoxSize) {
const unsigned int ksizex = 2*KMAX_X-1; const unsigned int ksizex = 2*KMAX_X-1;
const unsigned int ksizey = 2*KMAX_Y-1; const unsigned int ksizey = 2*KMAX_Y-1;
const unsigned int ksizez = 2*KMAX_Z-1; const unsigned int ksizez = 2*KMAX_Z-1;
...@@ -14,7 +14,7 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf ...@@ -14,7 +14,7 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z); real3 reciprocalBoxSize = make_real3(2*M_PI/periodicBoxSize.x, 2*M_PI/periodicBoxSize.y, 2*M_PI/periodicBoxSize.z);
real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z); real reciprocalCoefficient = ONE_4PI_EPS0*4*M_PI/(periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; unsigned int index = blockIdx.x*blockDim.x+threadIdx.x;
real energy = 0; mixed energy = 0;
while (index < (KMAX_Y-1)*ksizez+KMAX_Z) while (index < (KMAX_Y-1)*ksizez+KMAX_Z)
index += blockDim.x*gridDim.x; index += blockDim.x*gridDim.x;
while (index < totalK) { while (index < totalK) {
......
...@@ -33,9 +33,9 @@ extern "C" __global__ void reduceBornSum(float alpha, float beta, float gamma, c ...@@ -33,9 +33,9 @@ extern "C" __global__ void reduceBornSum(float alpha, float beta, float gamma, c
* Reduce the Born force. * Reduce the Born force.
*/ */
extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, real* __restrict__ energyBuffer, extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, mixed* __restrict__ energyBuffer,
const float2* __restrict__ params, const real* __restrict__ bornRadii, const real* __restrict__ obcChain) { const float2* __restrict__ params, const real* __restrict__ bornRadii, const real* __restrict__ obcChain) {
real energy = 0; mixed energy = 0;
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Get summed Born force // Get summed Born force
...@@ -402,7 +402,7 @@ typedef struct { ...@@ -402,7 +402,7 @@ typedef struct {
*/ */
extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce, extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce,
real* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
...@@ -415,7 +415,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -415,7 +415,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx; const unsigned int tbx = threadIdx.x - tgx;
real energy = 0; mixed energy = 0;
__shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE]; __shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
......
...@@ -100,7 +100,7 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) { ...@@ -100,7 +100,7 @@ static __inline__ __device__ long long real_shfl(long long var, int srcLane) {
* *
*/ */
extern "C" __global__ void computeNonbonded( extern "C" __global__ void computeNonbonded(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions, unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize, , const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize,
...@@ -583,6 +583,6 @@ extern "C" __global__ void computeNonbonded( ...@@ -583,6 +583,6 @@ extern "C" __global__ void computeNonbonded(
pos++; pos++;
} }
#ifdef INCLUDE_ENERGY #ifdef INCLUDE_ENERGY
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += (real) energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
#endif #endif
} }
\ No newline at end of file
...@@ -115,7 +115,7 @@ extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPm ...@@ -115,7 +115,7 @@ extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPm
// convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric // convolutes on the halfcomplex_pmeGrid, which is of size NX*NY*(NZ/2+1) as F(Q) is conjugate symmetric
extern "C" __global__ void extern "C" __global__ void
reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ energyBuffer, reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half // R2C stores into a half complex matrix where the last dimension is cut by half
...@@ -150,14 +150,14 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict_ ...@@ -150,14 +150,14 @@ reciprocalConvolution(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict_
extern "C" __global__ void extern "C" __global__ void
gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ energyBuffer, gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, mixed* __restrict__ energyBuffer,
const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, const real* __restrict__ pmeBsplineModuliX, const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ,
real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) { real4 periodicBoxSize, real3 recipBoxVecX, real3 recipBoxVecY, real3 recipBoxVecZ) {
// R2C stores into a half complex matrix where the last dimension is cut by half // R2C stores into a half complex matrix where the last dimension is cut by half
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z); const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
real energy = 0; mixed energy = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
// real indices // real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z)); int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
......
...@@ -181,7 +181,7 @@ void OpenCLBondedUtilities::initialize(const System& system) { ...@@ -181,7 +181,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
for (int i = 0; i < (int) prefixCode.size(); i++) for (int i = 0; i < (int) prefixCode.size(); i++)
s<<prefixCode[i]; s<<prefixCode[i];
string bufferType = (context.getSupports64BitGlobalAtomics() ? "long" : "real4"); string bufferType = (context.getSupports64BitGlobalAtomics() ? "long" : "real4");
s<<"__kernel void computeBondedForces(__global "<<bufferType<<"* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ"; s<<"__kernel void computeBondedForces(__global "<<bufferType<<"* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
for (int i = 0; i < setSize; i++) { for (int i = 0; i < setSize; i++) {
int force = set[i]; int force = set[i];
string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force])); string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
...@@ -191,7 +191,7 @@ void OpenCLBondedUtilities::initialize(const System& system) { ...@@ -191,7 +191,7 @@ void OpenCLBondedUtilities::initialize(const System& system) {
for (int i = 0; i < (int) arguments.size(); i++) for (int i = 0; i < (int) arguments.size(); i++)
s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1); s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
s<<") {\n"; s<<") {\n";
s<<"real energy = 0.0f;\n"; s<<"mixed energy = 0;\n";
for (int i = 0; i < setSize; i++) { for (int i = 0; i < setSize; i++) {
int force = set[i]; int force = set[i];
s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]); s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
......
...@@ -460,7 +460,7 @@ void OpenCLContext::initialize() { ...@@ -460,7 +460,7 @@ void OpenCLContext::initialize() {
else { else {
forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers"); forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force"); force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer"); energyBuffer = OpenCLArray::create<cl_double>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
} }
if (supports64BitGlobalAtomics) { if (supports64BitGlobalAtomics) {
longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer"); longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer");
......
...@@ -136,7 +136,7 @@ double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, ...@@ -136,7 +136,7 @@ double OpenCLCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context,
cl.getIntegrationUtilities().distributeForcesFromVirtualSites(); cl.getIntegrationUtilities().distributeForcesFromVirtualSites();
if (includeEnergy) { if (includeEnergy) {
OpenCLArray& energyArray = cl.getEnergyBuffer(); OpenCLArray& energyArray = cl.getEnergyBuffer();
if (cl.getUseDoublePrecision()) { if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
double* energy = (double*) cl.getPinnedBuffer(); double* energy = (double*) cl.getPinnedBuffer();
energyArray.download(energy); energyArray.download(energy);
for (int i = 0; i < energyArray.getSize(); i++) for (int i = 0; i < energyArray.getSize(); i++)
......
...@@ -105,10 +105,10 @@ real4 computeCross(real4 vec1, real4 vec2) { ...@@ -105,10 +105,10 @@ real4 computeCross(real4 vec1, real4 vec2) {
/** /**
* Compute the forces on groups based on the bonds. * Compute the forces on groups based on the bonds.
*/ */
__kernel void computeGroupForces(__global long* restrict groupForce, __global real* restrict energyBuffer, __global const real4* restrict centerPositions, __kernel void computeGroupForces(__global long* restrict groupForce, __global mixed* restrict energyBuffer, __global const real4* restrict centerPositions,
__global const int* restrict bondGroups __global const int* restrict bondGroups
EXTRA_ARGS) { EXTRA_ARGS) {
real energy = 0; mixed energy = 0;
for (int index = get_global_id(0); index < NUM_BONDS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_BONDS; index += get_global_size(0)) {
COMPUTE_FORCE COMPUTE_FORCE
} }
......
...@@ -16,7 +16,7 @@ __kernel void computeN2Energy( ...@@ -16,7 +16,7 @@ __kernel void computeN2Energy(
#else #else
__global real4* restrict forceBuffers, __global real4* restrict forceBuffers,
#endif #endif
__global real* restrict energyBuffer, __local real4* restrict local_force, __global mixed* restrict energyBuffer, __local real4* restrict local_force,
__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const ushort2* exclusionTiles, __global const ushort2* exclusionTiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -31,7 +31,7 @@ __kernel void computeN2Energy( ...@@ -31,7 +31,7 @@ __kernel void computeN2Energy(
const unsigned int warp = get_global_id(0)/TILE_SIZE; const unsigned int warp = get_global_id(0)/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = get_local_id(0) - tgx;
real energy = 0; mixed energy = 0;
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
......
...@@ -16,7 +16,7 @@ __kernel void computeN2Energy( ...@@ -16,7 +16,7 @@ __kernel void computeN2Energy(
#else #else
__global real4* restrict forceBuffers, __global real4* restrict forceBuffers,
#endif #endif
__global real* restrict energyBuffer, __local real4* restrict local_force, __global mixed* restrict energyBuffer, __local real4* restrict local_force,
__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions, __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
__global const ushort2* exclusionTiles, __global const ushort2* exclusionTiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -27,7 +27,7 @@ __kernel void computeN2Energy( ...@@ -27,7 +27,7 @@ __kernel void computeN2Energy(
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
real energy = 0; mixed energy = 0;
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
......
...@@ -9,9 +9,9 @@ ...@@ -9,9 +9,9 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms. * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/ */
__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq __kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
real energy = 0; mixed energy = 0;
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
// Reduce the derivatives // Reduce the derivatives
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment