Commit 0df0e673 authored by Peter Eastman's avatar Peter Eastman
Browse files

Fixed errors in AMD implementation of GB

parent f1171d6c
...@@ -3017,7 +3017,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include ...@@ -3017,7 +3017,7 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
/// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl. /// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl.
/// \todo Also make the previous __local argument local as was done with nonbonded_default.cl. /// \todo Also make the previous __local argument local as was done with nonbonded_default.cl.
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : 1), NULL); pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : (cl.getSIMDWidth() == 32 ? 1 : nb.getForceThreadBlockSize()))*4*elementSize, NULL);
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer()); pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
......
...@@ -77,7 +77,7 @@ void computeBornSum( ...@@ -77,7 +77,7 @@ void computeBornSum(
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) { for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
real4 delta = (float4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0); real4 delta = (real4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
...@@ -98,9 +98,9 @@ void computeBornSum( ...@@ -98,9 +98,9 @@ void computeBornSum(
real l_ij2 = l_ij*l_ij; real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij; real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += select(0.0f, l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += (includeInteraction ? l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2), includeInteraction); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2) : (real) 0);
bornSum += select(0.0f, 2.0f*(RECIP(params1.x)-l_ij), includeInteraction && params1.x < params2.y-r); bornSum += (includeInteraction && params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : (real) 0);
} }
// Sum the forces and write results. // Sum the forces and write results.
...@@ -132,7 +132,7 @@ void computeBornSum( ...@@ -132,7 +132,7 @@ void computeBornSum(
localData[get_local_id(0)].x = tempPosq.x; localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[get_local_id(0)].z = tempPosq.z;
real2 tempParams = global_params[j]; float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x; localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y; localData[get_local_id(0)].scaledRadius = tempParams.y;
} }
...@@ -166,9 +166,9 @@ void computeBornSum( ...@@ -166,9 +166,9 @@ void computeBornSum(
real u_ij2 = u_ij*u_ij; real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
unsigned int includeTerm = (includeInteraction && params1.x < rScaledRadiusJ); unsigned int includeTerm = (includeInteraction && params1.x < rScaledRadiusJ);
bornSum += select(0.0f, l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + bornSum += (includeTerm ? l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2), includeTerm); (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2) : (real) 0);
bornSum += select(0.0f, 2.0f*(RECIP(params1.x)-l_ij), includeTerm && params1.x < params2.y-r); bornSum += (includeTerm && params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : (real) 0);
} }
real rScaledRadiusI = r+params1.y; real rScaledRadiusI = r+params1.y;
{ {
...@@ -179,8 +179,8 @@ void computeBornSum( ...@@ -179,8 +179,8 @@ void computeBornSum(
real ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) + real term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2); (0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
term += select(0.0f, 2.0f*(RECIP(params2.x)-l_ij), params2.x < params1.y-r); term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : (real) 0);
localBornSum[tj+localForceOffset] += select(0.0f, term, includeInteraction && params2.x < rScaledRadiusI); localBornSum[tj+localForceOffset] += (includeInteraction && params2.x < rScaledRadiusI ? term : (real) 0);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1) & (TILE_SIZE-1); tj = (tj+1) & (TILE_SIZE-1);
...@@ -323,13 +323,13 @@ void computeGBSAForce1( ...@@ -323,13 +323,13 @@ void computeGBSAForce1(
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm); real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
dEdR = select(dEdR, 0.0f, r2 > CUTOFF_SQUARED); dEdR = (r2 > CUTOFF_SQUARED ? (real) 0 : dEdR);
tempEnergy = select(tempEnergy, 0.0f, r2 > CUTOFF_SQUARED); tempEnergy = (r2 > CUTOFF_SQUARED ? (real) 0 : tempEnergy);
dGpol_dalpha2_ij = select(dGpol_dalpha2_ij, 0.0f, r2 > CUTOFF_SQUARED); dGpol_dalpha2_ij = (r2 > CUTOFF_SQUARED ? (real) 0 : dGpol_dalpha2_ij);
#endif #endif
force.w += select(0.0f, dGpol_dalpha2_ij*bornRadius2, includeInteraction); force.w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius2 : (real) 0);
energy += select(0.0f, 0.5f*tempEnergy, includeInteraction); energy += (includeInteraction ? 0.5f*tempEnergy : (real) 0);
delta.xyz *= select(0.0f, dEdR, includeInteraction); delta.xyz *= (includeInteraction ? dEdR : (real) 0);
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
} }
...@@ -412,18 +412,18 @@ void computeGBSAForce1( ...@@ -412,18 +412,18 @@ void computeGBSAForce1(
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm); real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
dEdR = select(dEdR, 0.0f, r2 > CUTOFF_SQUARED); dEdR = (r2 > CUTOFF_SQUARED ? (real) 0 : dEdR);
tempEnergy = select(tempEnergy, 0.0f, r2 > CUTOFF_SQUARED); tempEnergy = (r2 > CUTOFF_SQUARED ? (real) 0 : tempEnergy);
dGpol_dalpha2_ij = select(dGpol_dalpha2_ij, 0.0f, r2 > CUTOFF_SQUARED); dGpol_dalpha2_ij = (r2 > CUTOFF_SQUARED ? (real) 0 : dGpol_dalpha2_ij);
#endif #endif
force.w += select(0.0f, dGpol_dalpha2_ij*bornRadius2, includeInteraction); force.w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius2 : (real) 0);
energy += select(0.0f, tempEnergy, includeInteraction); energy += (includeInteraction ? tempEnergy : (real) 0);
delta.xyz *= select(0.0f, dEdR, includeInteraction); delta.xyz *= (includeInteraction ? dEdR : (real) 0);
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
localForce[tj+localForceOffset].x += delta.x; localForce[tj+localForceOffset].x += delta.x;
localForce[tj+localForceOffset].y += delta.y; localForce[tj+localForceOffset].y += delta.y;
localForce[tj+localForceOffset].z += delta.z; localForce[tj+localForceOffset].z += delta.z;
localForce[tj+localForceOffset].w += select(0.0f, dGpol_dalpha2_ij*bornRadius1, includeInteraction); localForce[tj+localForceOffset].w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius1 : (real) 0);
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1) & (TILE_SIZE-1); tj = (tj+1) & (TILE_SIZE-1);
} }
...@@ -446,10 +446,10 @@ void computeGBSAForce1( ...@@ -446,10 +446,10 @@ void computeGBSAForce1(
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0xFFFFFFFF)); atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0xFFFFFFFF)); atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0xFFFFFFFF));
atom_add(&global_bornForce[offset1], (long) ((force.w + localData[tgx].temp_w)*0xFFFFFFFF)); atom_add(&global_bornForce[offset1], (long) ((force.w + localData[tgx].temp_w)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2], (long) ((localData[get_local_id(0)].fx + localForce[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF)); atom_add(&forceBuffers[offset2], (long) ((localForce[get_local_id(0)].x + localForce[get_local_id(0)+TILE_SIZE].x)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localData[get_local_id(0)].fy + localForce[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF)); atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localForce[get_local_id(0)].y + localForce[get_local_id(0)+TILE_SIZE].y)*0xFFFFFFFF));
atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localData[get_local_id(0)].fz + localForce[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF)); atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localForce[get_local_id(0)].z + localForce[get_local_id(0)+TILE_SIZE].z)*0xFFFFFFFF));
atom_add(&global_bornForce[offset2], (long) ((localData[get_local_id(0)].fw + localForce[get_local_id(0)+TILE_SIZE].w)*0xFFFFFFFF)); atom_add(&global_bornForce[offset2], (long) ((localForce[get_local_id(0)].w + localForce[get_local_id(0)+TILE_SIZE].w)*0xFFFFFFFF));
#else #else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK #ifdef USE_OUTPUT_BUFFER_PER_BLOCK
const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment