Commit 18355094 authored by Peter Eastman's avatar Peter Eastman
Browse files

When converting to fixed point, multiply by 0x100000000 instead of 0xFFFFFFFF....

When converting to fixed point, multiply by 0x100000000 instead of 0xFFFFFFFF.  This should be (very very slightly) more accurate, since its reciprocal can be exactly represented in floating point.
parent a6bb39a3
...@@ -142,9 +142,9 @@ string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int ...@@ -142,9 +142,9 @@ string CudaBondedUtilities::createForceSource(int forceIndex, int numBonds, int
} }
s<<computeForce<<"\n"; s<<computeForce<<"\n";
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".x*0xFFFFFFFF)));\n"; s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".x*0x100000000)));\n";
s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".y*0xFFFFFFFF)));\n"; s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".y*0x100000000)));\n";
s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".z*0xFFFFFFFF)));\n"; s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".z*0x100000000)));\n";
s<<" __threadfence_block();\n"; s<<" __threadfence_block();\n";
} }
s<<"}\n"; s<<"}\n";
......
...@@ -857,7 +857,7 @@ double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) { ...@@ -857,7 +857,7 @@ double CudaIntegrationUtilities::computeKineticEnergy(double timeShift) {
int paddedNumParticles = context.getPaddedNumAtoms(); int paddedNumParticles = context.getPaddedNumAtoms();
long long* force = (long long*) context.getPinnedBuffer(); long long* force = (long long*) context.getPinnedBuffer();
context.getForce().download(force); context.getForce().download(force);
double forceScale = timeShift/0xFFFFFFFF; double forceScale = timeShift/0x100000000;
double energy = 0.0; double energy = 0.0;
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) { if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
vector<double4> velm; vector<double4> velm;
......
...@@ -288,7 +288,7 @@ void CudaUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& fo ...@@ -288,7 +288,7 @@ void CudaUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& fo
int numParticles = context.getSystem().getNumParticles(); int numParticles = context.getSystem().getNumParticles();
int paddedNumParticles = cu.getPaddedNumAtoms(); int paddedNumParticles = cu.getPaddedNumAtoms();
forces.resize(numParticles); forces.resize(numParticles);
double scale = 1.0/(double) 0xFFFFFFFF; double scale = 1.0/(double) 0x100000000;
for (int i = 0; i < numParticles; ++i) for (int i = 0; i < numParticles; ++i)
forces[order[i]] = Vec3(scale*force[i], scale*force[i+paddedNumParticles], scale*force[i+paddedNumParticles*2]); forces[order[i]] = Vec3(scale*force[i], scale*force[i+paddedNumParticles], scale*force[i+paddedNumParticles*2]);
} }
...@@ -2618,7 +2618,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG ...@@ -2618,7 +2618,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
extraArgs << ", const long long* __restrict__ derivBuffersIn"; extraArgs << ", const long long* __restrict__ derivBuffersIn";
for (int i = 0; i < energyDerivs->getNumParameters(); ++i) for (int i = 0; i < energyDerivs->getNumParameters(); ++i)
load << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") << load << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") <<
" = RECIP(0xFFFFFFFF)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << cu.intToString(i) << "];\n"; " = RECIP(0x100000000)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << cu.intToString(i) << "];\n";
// Compute the various expressions. // Compute the various expressions.
...@@ -2660,9 +2660,9 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG ...@@ -2660,9 +2660,9 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
// Record values. // Record values.
compute << "forceBuffers[index] += (long long) (force.x*0xFFFFFFFF);\n"; compute << "forceBuffers[index] += (long long) (force.x*0x100000000);\n";
compute << "forceBuffers[index+PADDED_NUM_ATOMS] += (long long) (force.y*0xFFFFFFFF);\n"; compute << "forceBuffers[index+PADDED_NUM_ATOMS] += (long long) (force.y*0x100000000);\n";
compute << "forceBuffers[index+PADDED_NUM_ATOMS*2] += (long long) (force.z*0xFFFFFFFF);\n"; compute << "forceBuffers[index+PADDED_NUM_ATOMS*2] += (long long) (force.z*0x100000000);\n";
for (int i = 1; i < force.getNumComputedValues(); i++) { for (int i = 1; i < force.getNumComputedValues(); i++) {
compute << "real totalDeriv"<<i<<" = dV"<<i<<"dV0"; compute << "real totalDeriv"<<i<<" = dV"<<i<<"dV0";
for (int j = 1; j < i; j++) for (int j = 1; j < i; j++)
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force, extern "C" __global__ void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, const long long* __restrict__ force,
mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) { mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const float4* __restrict__ random, unsigned int randomIndex) {
randomIndex += blockIdx.x*blockDim.x+threadIdx.x; randomIndex += blockIdx.x*blockDim.x+threadIdx.x;
const mixed fscale = tauDeltaT/(mixed) 0xFFFFFFFF; const mixed fscale = tauDeltaT/(mixed) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
mixed invMass = velm[index].w; mixed invMass = velm[index].w;
if (invMass != 0) { if (invMass != 0) {
......
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0xFFFFFFFF))); #define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0xFFFFFFFF))); #define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
#define TILE_SIZE 32 #define TILE_SIZE 32
typedef struct { typedef struct {
...@@ -211,16 +211,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -211,16 +211,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
if (pos < end) { if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx; const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
} }
if (pos < end && x != y) { if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx; const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
} }
pos++; pos++;
......
...@@ -4,13 +4,13 @@ ...@@ -4,13 +4,13 @@
extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__ forceBuffers, const real4* __restrict__ posq extern "C" __global__ void computeGradientChainRuleTerms(long long* __restrict__ forceBuffers, const real4* __restrict__ posq
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const real scale = RECIP((real) 0xFFFFFFFF); const real scale = RECIP((real) 0x100000000);
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
real4 pos = posq[index]; real4 pos = posq[index];
real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]); real3 force = make_real3(scale*forceBuffers[index], scale*forceBuffers[index+PADDED_NUM_ATOMS], scale*forceBuffers[index+PADDED_NUM_ATOMS*2]);
COMPUTE_FORCES COMPUTE_FORCES
forceBuffers[index] = (long long) (force.x*0xFFFFFFFF); forceBuffers[index] = (long long) (force.x*0x100000000);
forceBuffers[index+PADDED_NUM_ATOMS] = (long long) (force.y*0xFFFFFFFF); forceBuffers[index+PADDED_NUM_ATOMS] = (long long) (force.y*0x100000000);
forceBuffers[index+PADDED_NUM_ATOMS*2] = (long long) (force.z*0xFFFFFFFF); forceBuffers[index+PADDED_NUM_ATOMS*2] = (long long) (force.z*0x100000000);
} }
} }
...@@ -238,11 +238,11 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -238,11 +238,11 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
if (pos < end) { if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx; const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0xFFFFFFFF))); atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000)));
} }
if (pos < end && x != y) { if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx; const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0xFFFFFFFF))); atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
} }
lasty = y; lasty = y;
pos++; pos++;
......
...@@ -7,7 +7,7 @@ extern "C" __global__ void computePerParticleValues(real4* posq, long long* valu ...@@ -7,7 +7,7 @@ extern "C" __global__ void computePerParticleValues(real4* posq, long long* valu
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Load the pairwise value // Load the pairwise value
real sum = valueBuffers[index]/(real) 0xFFFFFFFF; real sum = valueBuffers[index]/(real) 0x100000000;
// Now calculate other values // Now calculate other values
......
...@@ -132,21 +132,21 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f ...@@ -132,21 +132,21 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
if (atoms.x > -1) { if (atoms.x > -1) {
atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0xFFFFFFFF))); atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0x100000000)));
atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0xFFFFFFFF))); atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0x100000000)));
atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0xFFFFFFFF))); atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0x100000000)));
__threadfence_block(); __threadfence_block();
} }
if (atoms.y > -1) { if (atoms.y > -1) {
atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0xFFFFFFFF))); atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0x100000000)));
atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0xFFFFFFFF))); atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0x100000000)));
atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0xFFFFFFFF))); atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0x100000000)));
__threadfence_block(); __threadfence_block();
} }
if (atoms.z > -1) { if (atoms.z > -1) {
atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0xFFFFFFFF))); atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0x100000000)));
atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0xFFFFFFFF))); atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0x100000000)));
atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0xFFFFFFFF))); atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0x100000000)));
__threadfence_block(); __threadfence_block();
} }
} }
...@@ -219,21 +219,21 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_ ...@@ -219,21 +219,21 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
if (acceptorIndex < NUM_ACCEPTORS) { if (acceptorIndex < NUM_ACCEPTORS) {
if (atoms.x > -1) { if (atoms.x > -1) {
atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0xFFFFFFFF))); atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0x100000000)));
atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0xFFFFFFFF))); atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0x100000000)));
atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0xFFFFFFFF))); atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0x100000000)));
__threadfence_block(); __threadfence_block();
} }
if (atoms.y > -1) { if (atoms.y > -1) {
atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0xFFFFFFFF))); atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0x100000000)));
atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0xFFFFFFFF))); atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0x100000000)));
atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0xFFFFFFFF))); atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0x100000000)));
__threadfence_block(); __threadfence_block();
} }
if (atoms.z > -1) { if (atoms.z > -1) {
atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0xFFFFFFFF))); atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0x100000000)));
atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0xFFFFFFFF))); atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0x100000000)));
atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0xFFFFFFFF))); atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0x100000000)));
__threadfence_block(); __threadfence_block();
} }
} }
......
...@@ -102,9 +102,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__ ...@@ -102,9 +102,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
// Record the force on the atom. // Record the force on the atom.
atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF))); atomicAdd(&forceBuffers[atom], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF))); atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF))); atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
atom += blockDim.x*gridDim.x; atom += blockDim.x*gridDim.x;
} }
} }
...@@ -13,7 +13,7 @@ extern "C" __global__ void reduceBornSum(float alpha, float beta, float gamma, c ...@@ -13,7 +13,7 @@ extern "C" __global__ void reduceBornSum(float alpha, float beta, float gamma, c
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Get summed Born data // Get summed Born data
real sum = RECIP(0xFFFFFFFF)*bornSum[index]; real sum = RECIP(0x100000000)*bornSum[index];
// Now calculate Born radius and OBC term. // Now calculate Born radius and OBC term.
...@@ -41,7 +41,7 @@ extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, re ...@@ -41,7 +41,7 @@ extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, re
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Get summed Born force // Get summed Born force
real force = RECIP(0xFFFFFFFF)*bornForce[index]; real force = RECIP(0x100000000)*bornForce[index];
// Now calculate the actual force // Now calculate the actual force
...@@ -53,7 +53,7 @@ extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, re ...@@ -53,7 +53,7 @@ extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, re
force += saTerm/bornRadius; force += saTerm/bornRadius;
energy += saTerm; energy += saTerm;
force *= bornRadius*bornRadius*obcChain[index]; force *= bornRadius*bornRadius*obcChain[index];
bornForce[index] = (long long) (force*0xFFFFFFFF); bornForce[index] = (long long) (force*0x100000000);
} }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy/-6; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy/-6;
} }
...@@ -317,11 +317,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -317,11 +317,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
if (pos < end) { if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx; const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (bornSum*0xFFFFFFFF))); atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
} }
if (pos < end && x != y) { if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx; const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0xFFFFFFFF))); atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
} }
lasty = y; lasty = y;
pos++; pos++;
...@@ -607,17 +607,17 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -607,17 +607,17 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
if (pos < end) { if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx; const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (force.w*0xFFFFFFFF))); atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
} }
if (pos < end && x != y) { if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx; const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0xFFFFFFFF))); atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
} }
lasty = y; lasty = y;
pos++; pos++;
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
real t2I = (l_ij2I-u_ij2I); real t2I = (l_ij2I-u_ij2I);
real term1 = (0.5f*(0.25f+obcParams2.y*obcParams2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR; real term1 = (0.5f*(0.25f+obcParams2.y*obcParams2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
real term2 = (0.5f*(0.25f+obcParams1.y*obcParams1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR; real term2 = (0.5f*(0.25f+obcParams1.y*obcParams1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
real tempdEdR = (obcParams1.x < rScaledRadiusJ ? bornForce1*term1/0xFFFFFFFF : 0); real tempdEdR = (obcParams1.x < rScaledRadiusJ ? bornForce1*term1/0x100000000 : 0);
tempdEdR += (obcParams2.x < rScaledRadiusI ? bornForce2*term2/0xFFFFFFFF : 0); tempdEdR += (obcParams2.x < rScaledRadiusI ? bornForce2*term2/0x100000000 : 0);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED); unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
#else #else
......
...@@ -775,15 +775,15 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* ...@@ -775,15 +775,15 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
} }
inline __device__ real3 loadForce(int index, long long* __restrict__ force) { inline __device__ real3 loadForce(int index, long long* __restrict__ force) {
real scale = 1/((real) 0xFFFFFFFF); real scale = 1/((real) 0x100000000);
return make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]); return make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
} }
inline __device__ void addForce(int index, long long* __restrict__ force, real3 value) { inline __device__ void addForce(int index, long long* __restrict__ force, real3 value) {
unsigned long long* f = (unsigned long long*) force; unsigned long long* f = (unsigned long long*) force;
atomicAdd(&f[index], static_cast<unsigned long long>((long long) (value.x*0xFFFFFFFF))); atomicAdd(&f[index], static_cast<unsigned long long>((long long) (value.x*0x100000000)));
atomicAdd(&f[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (value.y*0xFFFFFFFF))); atomicAdd(&f[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (value.y*0x100000000)));
atomicAdd(&f[index+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (value.z*0xFFFFFFFF))); atomicAdd(&f[index+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (value.z*0x100000000)));
} }
/** /**
......
...@@ -7,7 +7,7 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams}; ...@@ -7,7 +7,7 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta, extern "C" __global__ void integrateLangevinPart1(mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) { const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) {
mixed vscale = paramBuffer[VelScale]; mixed vscale = paramBuffer[VelScale];
mixed fscale = paramBuffer[ForceScale]/(mixed) 0xFFFFFFFF; mixed fscale = paramBuffer[ForceScale]/(mixed) 0x100000000;
mixed noisescale = paramBuffer[NoiseScale]; mixed noisescale = paramBuffer[NoiseScale];
mixed stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
...@@ -75,7 +75,7 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error ...@@ -75,7 +75,7 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
mixed* error = &params[MaxParams]; mixed* error = &params[MaxParams];
mixed err = 0; mixed err = 0;
unsigned int index = threadIdx.x; unsigned int index = threadIdx.x;
const mixed scale = RECIP((mixed) 0xFFFFFFFF); const mixed scale = RECIP((mixed) 0x100000000);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]); mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
mixed invMass = velm[index].w; mixed invMass = velm[index].w;
......
...@@ -329,15 +329,15 @@ extern "C" __global__ void computeNonbonded( ...@@ -329,15 +329,15 @@ extern "C" __global__ void computeNonbonded(
if (pos < end) { if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx; const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
} }
if (pos < end && x != y) { if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx; const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0xFFFFFFFF))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
} }
pos++; pos++;
} while (pos < end); } while (pos < end);
......
...@@ -106,12 +106,12 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real ...@@ -106,12 +106,12 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
#ifdef USE_DOUBLE_PRECISION #ifdef USE_DOUBLE_PRECISION
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid; unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
atomicAdd(&ulonglong_p[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], static_cast<unsigned long long>((long long) (add*0xFFFFFFFF))); atomicAdd(&ulonglong_p[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], static_cast<unsigned long long>((long long) (add*0x100000000)));
#elif __CUDA_ARCH__ < 200 #elif __CUDA_ARCH__ < 200
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid; unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z; int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z;
gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2); gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
atomicAdd(&ulonglong_p[gridIndex], static_cast<unsigned long long>((long long) (add*0xFFFFFFFF))); atomicAdd(&ulonglong_p[gridIndex], static_cast<unsigned long long>((long long) (add*0x100000000)));
#else #else
atomicAdd(&originalPmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], add*EPSILON_FACTOR); atomicAdd(&originalPmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], add*EPSILON_FACTOR);
#endif #endif
...@@ -123,7 +123,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real ...@@ -123,7 +123,7 @@ extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real
extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPmeGrid) { extern "C" __global__ void finishSpreadCharge(long long* __restrict__ originalPmeGrid) {
real* floatGrid = (real*) originalPmeGrid; real* floatGrid = (real*) originalPmeGrid;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = EPSILON_FACTOR/(real) 0xFFFFFFFF; real scale = EPSILON_FACTOR/(real) 0x100000000;
#ifdef USE_DOUBLE_PRECISION #ifdef USE_DOUBLE_PRECISION
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
floatGrid[index] = scale*originalPmeGrid[index]; floatGrid[index] = scale*originalPmeGrid[index];
...@@ -262,8 +262,8 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __ ...@@ -262,8 +262,8 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
} }
} }
real q = pos.w*EPSILON_FACTOR; real q = pos.w*EPSILON_FACTOR;
forceBuffers[atom] += static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0xFFFFFFFF)); forceBuffers[atom] += static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0x100000000));
forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0xFFFFFFFF)); forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0x100000000));
forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0xFFFFFFFF)); forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000));
} }
} }
...@@ -7,7 +7,7 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c ...@@ -7,7 +7,7 @@ extern "C" __global__ void integrateVerletPart1(const mixed2* __restrict__ dt, c
const mixed2 stepSize = dt[0]; const mixed2 stepSize = dt[0];
const mixed dtPos = stepSize.y; const mixed dtPos = stepSize.y;
const mixed dtVel = 0.5f*(stepSize.x+stepSize.y); const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
const mixed scale = dtVel/(mixed) 0xFFFFFFFF; const mixed scale = dtVel/(mixed) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
...@@ -76,7 +76,7 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo ...@@ -76,7 +76,7 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo
extern __shared__ mixed error[]; extern __shared__ mixed error[];
mixed err = 0.0f; mixed err = 0.0f;
const mixed scale = RECIP((mixed) 0xFFFFFFFF); const mixed scale = RECIP((mixed) 0x100000000);
for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]); mixed3 f = make_mixed3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
mixed invMass = velm[index].w; mixed invMass = velm[index].w;
......
...@@ -2715,7 +2715,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo ...@@ -2715,7 +2715,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
extraArgs << ", __global const long* restrict derivBuffersIn"; extraArgs << ", __global const long* restrict derivBuffersIn";
for (int i = 0; i < energyDerivs->getNumParameters(); ++i) for (int i = 0; i < energyDerivs->getNumParameters(); ++i)
reduce << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") << reduce << "derivBuffers" << energyDerivs->getParameterSuffix(i, "[index]") <<
" = (1.0f/0xFFFFFFFF)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << cl.intToString(i) << "];\n"; " = (1.0f/0x100000000)*derivBuffersIn[index+PADDED_NUM_ATOMS*" << cl.intToString(i) << "];\n";
} }
else { else {
for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++)
......
#define TILE_SIZE 32 #define TILE_SIZE 32
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset1+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0xFFFFFFFF)); #define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset1+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset2+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0xFFFFFFFF)); #define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset2+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
#else #else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1+tempDerivBuffer##INDEX[get_local_id(0)+TILE_SIZE]; #define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1+tempDerivBuffer##INDEX[get_local_id(0)+TILE_SIZE];
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)]+local_deriv##INDEX[get_local_id(0)+TILE_SIZE]; #define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)]+local_deriv##INDEX[get_local_id(0)+TILE_SIZE];
...@@ -141,9 +141,9 @@ void computeN2Energy( ...@@ -141,9 +141,9 @@ void computeN2Energy(
if (get_local_id(0) < TILE_SIZE) { if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx; const unsigned int offset1 = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset1], (long) ((force.x + tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF)); atom_add(&forceBuffers[offset1], (long) ((force.x + tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0x100000000));
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF)); atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0x100000000));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF)); atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0x100000000));
#else #else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
const unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
...@@ -230,12 +230,12 @@ void computeN2Energy( ...@@ -230,12 +230,12 @@ void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx; const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx; const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset1], (long) ((force.x+tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF)); atom_add(&forceBuffers[offset1], (long) ((force.x+tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0x100000000));
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y+tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF)); atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y+tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0x100000000));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z+tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF)); atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z+tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0x100000000));
atom_add(&forceBuffers[offset2], (long) ((local_force[get_local_id(0)].x+local_force[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF)); atom_add(&forceBuffers[offset2], (long) ((local_force[get_local_id(0)].x+local_force[get_local_id(0)+TILE_SIZE].x)*0x100000000));
atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].y+local_force[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF)); atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].y+local_force[get_local_id(0)+TILE_SIZE].y)*0x100000000));
atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].z+local_force[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF)); atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].z+local_force[get_local_id(0)+TILE_SIZE].z)*0x100000000));
#else #else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0xFFFFFFFF)); #define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0xFFFFFFFF)); #define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
#else #else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1; #define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)]; #define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
...@@ -211,16 +211,16 @@ __kernel void computeN2Energy( ...@@ -211,16 +211,16 @@ __kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
if (pos < end) { if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx; const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0xFFFFFFFF)); atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0xFFFFFFFF)); atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0xFFFFFFFF)); atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
} }
if (pos < end && x != y) { if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx; const unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0xFFFFFFFF)); atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0xFFFFFFFF)); atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0xFFFFFFFF)); atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
} }
#else #else
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment