Commit b9b7b40d authored by Peter Eastman's avatar Peter Eastman
Browse files

Simplified constraint code. In the process, this fixed a bug that caused...

Simplified constraint code.  In the process, this fixed a bug that caused bubbles to appear in water when running on Fermi GPUs.
parent a161d091
...@@ -1216,9 +1216,9 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -1216,9 +1216,9 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
prevStepSize = stepSize; prevStepSize = stepSize;
} }
kVerletUpdatePart1(gpu); kVerletUpdatePart1(gpu);
kApplyFirstShake(gpu); kApplyShake(gpu);
kApplyFirstSettle(gpu); kApplySettle(gpu);
kApplyFirstCCMA(gpu); kApplyCCMA(gpu);
if (data.removeCM) if (data.removeCM)
if (data.stepCount%data.cmMotionFrequency == 0) if (data.stepCount%data.cmMotionFrequency == 0)
gpu->bCalculateCM = true; gpu->bCalculateCM = true;
...@@ -1261,9 +1261,9 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -1261,9 +1261,9 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
if (data.stepCount%data.cmMotionFrequency == 0) if (data.stepCount%data.cmMotionFrequency == 0)
gpu->bCalculateCM = true; gpu->bCalculateCM = true;
kLangevinUpdatePart2(gpu); kLangevinUpdatePart2(gpu);
kApplySecondShake(gpu); kApplyShake(gpu);
kApplySecondSettle(gpu); kApplySettle(gpu);
kApplySecondCCMA(gpu); kApplyCCMA(gpu);
kSetVelocitiesFromPositions(gpu); kSetVelocitiesFromPositions(gpu);
data.time += stepSize; data.time += stepSize;
data.stepCount++; data.stepCount++;
...@@ -1299,9 +1299,9 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni ...@@ -1299,9 +1299,9 @@ void CudaIntegrateBrownianStepKernel::execute(ContextImpl& context, const Browni
prevStepSize = stepSize; prevStepSize = stepSize;
} }
kBrownianUpdatePart1(gpu); kBrownianUpdatePart1(gpu);
kApplyFirstShake(gpu); kApplyShake(gpu);
kApplyFirstSettle(gpu); kApplySettle(gpu);
kApplyFirstCCMA(gpu); kApplyCCMA(gpu);
if (data.removeCM) if (data.removeCM)
if (data.stepCount%data.cmMotionFrequency == 0) if (data.stepCount%data.cmMotionFrequency == 0)
gpu->bCalculateCM = true; gpu->bCalculateCM = true;
...@@ -1331,9 +1331,9 @@ void CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const ...@@ -1331,9 +1331,9 @@ void CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const
float maxStepSize = (float)(maxTime-data.time); float maxStepSize = (float)(maxTime-data.time);
kSelectVerletStepSize(gpu, maxStepSize); kSelectVerletStepSize(gpu, maxStepSize);
kVerletUpdatePart1(gpu); kVerletUpdatePart1(gpu);
kApplyFirstShake(gpu); kApplyShake(gpu);
kApplyFirstSettle(gpu); kApplySettle(gpu);
kApplyFirstCCMA(gpu); kApplyCCMA(gpu);
if (data.removeCM) if (data.removeCM)
if (data.stepCount%data.cmMotionFrequency == 0) if (data.stepCount%data.cmMotionFrequency == 0)
gpu->bCalculateCM = true; gpu->bCalculateCM = true;
...@@ -1381,9 +1381,9 @@ void CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, cons ...@@ -1381,9 +1381,9 @@ void CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, cons
if (data.stepCount%data.cmMotionFrequency == 0) if (data.stepCount%data.cmMotionFrequency == 0)
gpu->bCalculateCM = true; gpu->bCalculateCM = true;
kLangevinUpdatePart2(gpu); kLangevinUpdatePart2(gpu);
kApplySecondShake(gpu); kApplyShake(gpu);
kApplySecondSettle(gpu); kApplySettle(gpu);
kApplySecondCCMA(gpu); kApplyCCMA(gpu);
kSetVelocitiesFromPositions(gpu); kSetVelocitiesFromPositions(gpu);
gpu->psStepSize->Download(); gpu->psStepSize->Download();
data.time += (*gpu->psStepSize)[0].y; data.time += (*gpu->psStepSize)[0].y;
......
...@@ -54,12 +54,9 @@ extern void kCalculateGBVIForces2(gpuContext gpu); ...@@ -54,12 +54,9 @@ extern void kCalculateGBVIForces2(gpuContext gpu);
extern void kCalculateLocalForces(gpuContext gpu); extern void kCalculateLocalForces(gpuContext gpu);
extern void kCalculateAndersenThermostat(gpuContext gpu, CUDAStream<int>& atomGroups); extern void kCalculateAndersenThermostat(gpuContext gpu, CUDAStream<int>& atomGroups);
extern void kReduceBornSumAndForces(gpuContext gpu); extern void kReduceBornSumAndForces(gpuContext gpu);
extern void kApplyFirstShake(gpuContext gpu); extern void kApplyShake(gpuContext gpu);
extern void kApplyFirstCCMA(gpuContext gpu); extern void kApplyCCMA(gpuContext gpu);
extern void kApplyFirstSettle(gpuContext gpu); extern void kApplySettle(gpuContext gpu);
extern void kApplySecondShake(gpuContext gpu);
extern void kApplySecondCCMA(gpuContext gpu);
extern void kApplySecondSettle(gpuContext gpu);
extern void kLangevinUpdatePart1(gpuContext gpu); extern void kLangevinUpdatePart1(gpuContext gpu);
extern void kLangevinUpdatePart2(gpuContext gpu); extern void kLangevinUpdatePart2(gpuContext gpu);
extern void kSelectLangevinStepSize(gpuContext gpu, float maxTimeStep); extern void kSelectLangevinStepSize(gpuContext gpu, float maxTimeStep);
......
...@@ -312,7 +312,6 @@ struct cudaGmxSimulation { ...@@ -312,7 +312,6 @@ struct cudaGmxSimulation {
unsigned int shake_threads_per_block; // Threads per block in shake kernel calls unsigned int shake_threads_per_block; // Threads per block in shake kernel calls
unsigned int settle_threads_per_block; // Threads per block in SETTLE kernel calls unsigned int settle_threads_per_block; // Threads per block in SETTLE kernel calls
unsigned int ccma_threads_per_block; // Threads per block in CCMA kernel calls unsigned int ccma_threads_per_block; // Threads per block in CCMA kernel calls
unsigned int nonshake_threads_per_block; // Threads per block in nonshaking kernel call
unsigned int max_localForces_threads_per_block; // Threads per block in local forces kernel calls unsigned int max_localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int localForces_threads_per_block; // Threads per block in local forces kernel calls unsigned int localForces_threads_per_block; // Threads per block in local forces kernel calls
unsigned int random_threads_per_block; // Threads per block in RNG kernel calls unsigned int random_threads_per_block; // Threads per block in RNG kernel calls
...@@ -447,7 +446,6 @@ struct cudaGmxSimulation { ...@@ -447,7 +446,6 @@ struct cudaGmxSimulation {
unsigned int rigidClusters; // Total number of rigid clusters unsigned int rigidClusters; // Total number of rigid clusters
unsigned int maxRigidClusterSize; // The size of the largest rigid cluster unsigned int maxRigidClusterSize; // The size of the largest rigid cluster
unsigned int clusterShakeBlockSize; // The number of threads to process each rigid cluster unsigned int clusterShakeBlockSize; // The number of threads to process each rigid cluster
unsigned int NonShakeConstraints; // Total number of NonShake atoms
unsigned int maxShakeIterations; // Maximum shake iterations unsigned int maxShakeIterations; // Maximum shake iterations
unsigned int degreesOfFreedom; // Number of degrees of freedom in system unsigned int degreesOfFreedom; // Number of degrees of freedom in system
float shakeTolerance; // Shake tolerance float shakeTolerance; // Shake tolerance
......
...@@ -1568,44 +1568,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -1568,44 +1568,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
gpu->sim.ccma_threads_per_block = gpu->sim.threads_per_block; gpu->sim.ccma_threads_per_block = gpu->sim.threads_per_block;
if (gpu->sim.ccma_threads_per_block < gpu->sim.blocks) if (gpu->sim.ccma_threads_per_block < gpu->sim.blocks)
gpu->sim.ccma_threads_per_block = gpu->sim.blocks; gpu->sim.ccma_threads_per_block = gpu->sim.blocks;
// count number of atoms w/o constraint
int count = 0;
for (int i = 0; i < gpu->natoms; i++)
if (!isShakeAtom[i])
count++;
// Allocate NonShake parameters
gpu->sim.NonShakeConstraints = count;
if( count || true ){
CUDAStream<int>* psNonShakeID = new CUDAStream<int>(count, 1, "NonShakeID");
gpu->psNonShakeID = psNonShakeID;
gpu->sim.pNonShakeID = psNonShakeID->_pDevStream[0];
gpu->sim.nonshake_threads_per_block = (count + gpu->sim.blocks - 1) / gpu->sim.blocks;
if (gpu->sim.nonshake_threads_per_block > gpu->sim.max_shake_threads_per_block)
gpu->sim.nonshake_threads_per_block = gpu->sim.max_shake_threads_per_block;
if (gpu->sim.nonshake_threads_per_block < 1)
gpu->sim.nonshake_threads_per_block = 1;
// load indices
count = 0;
for (int i = 0; i < gpu->natoms; i++){
if (!isShakeAtom[i]){
(*psNonShakeID)[count++] = i;
}
}
psNonShakeID->Upload();
} else {
gpu->sim.nonshake_threads_per_block = 0;
}
} }
extern "C" extern "C"
...@@ -1949,7 +1911,6 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1949,7 +1911,6 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->psShakeParameter = NULL; gpu->psShakeParameter = NULL;
gpu->psSettleID = NULL; gpu->psSettleID = NULL;
gpu->psSettleParameter = NULL; gpu->psSettleParameter = NULL;
gpu->psNonShakeID = NULL;
gpu->psExclusion = NULL; gpu->psExclusion = NULL;
gpu->psExclusionIndex = NULL; gpu->psExclusionIndex = NULL;
gpu->psWorkUnit = NULL; gpu->psWorkUnit = NULL;
...@@ -2114,8 +2075,6 @@ void gpuShutDown(gpuContext gpu) ...@@ -2114,8 +2075,6 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psShakeParameter; delete gpu->psShakeParameter;
delete gpu->psSettleID; delete gpu->psSettleID;
delete gpu->psSettleParameter; delete gpu->psSettleParameter;
if (gpu->psNonShakeID != NULL)
delete gpu->psNonShakeID;
delete gpu->psExclusion; delete gpu->psExclusion;
delete gpu->psExclusionIndex; delete gpu->psExclusionIndex;
delete gpu->psWorkUnit; delete gpu->psWorkUnit;
......
...@@ -148,7 +148,6 @@ struct _gpuContext { ...@@ -148,7 +148,6 @@ struct _gpuContext {
CUDAStream<float2>* psRbDihedralParameter2; CUDAStream<float2>* psRbDihedralParameter2;
CUDAStream<int4>* psLJ14ID; CUDAStream<int4>* psLJ14ID;
CUDAStream<float4>* psLJ14Parameter; CUDAStream<float4>* psLJ14Parameter;
CUDAStream<int>* psNonShakeID;
CUDAStream<int4>* psShakeID; CUDAStream<int4>* psShakeID;
CUDAStream<float4>* psShakeParameter; CUDAStream<float4>* psShakeParameter;
CUDAStream<int4>* psSettleID; CUDAStream<int4>* psSettleID;
......
...@@ -56,9 +56,9 @@ void kApplyConstraints(gpuContext gpu) ...@@ -56,9 +56,9 @@ void kApplyConstraints(gpuContext gpu)
{ {
kPrepareConstraints_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(gpu->natoms, gpu->sim.pOldPosq, gpu->sim.pPosq, gpu->sim.pPosqP); kPrepareConstraints_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(gpu->natoms, gpu->sim.pOldPosq, gpu->sim.pPosq, gpu->sim.pPosqP);
LAUNCHERROR("kPrepareConstraints"); LAUNCHERROR("kPrepareConstraints");
kApplyFirstShake(gpu); kApplyShake(gpu);
kApplyFirstSettle(gpu); kApplySettle(gpu);
kApplyFirstCCMA(gpu); kApplyCCMA(gpu);
kFinishConstraints_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(gpu->natoms, gpu->sim.pPosq, gpu->sim.pPosqP); kFinishConstraints_kernel<<<gpu->sim.blocks, gpu->sim.update_threads_per_block>>>(gpu->natoms, gpu->sim.pPosq, gpu->sim.pPosqP);
LAUNCHERROR("kFinishConstraints"); LAUNCHERROR("kFinishConstraints");
} }
......
...@@ -205,16 +205,9 @@ void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition) ...@@ -205,16 +205,9 @@ void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition)
} }
} }
void kApplyFirstCCMA(gpuContext gpu) void kApplyCCMA(gpuContext gpu)
{ {
// printf("kApplyFirstCCMA\n"); // printf("kApplyCCMA\n");
if (gpu->sim.ccmaConstraints > 0) if (gpu->sim.ccmaConstraints > 0)
kApplyCCMA(gpu, gpu->sim.pPosqP, true); kApplyCCMA(gpu, gpu->sim.pPosqP, true);
} }
void kApplySecondCCMA(gpuContext gpu)
{
// printf("kApplySecondCCMA\n");
if (gpu->sim.ccmaConstraints > 0)
kApplyCCMA(gpu, gpu->sim.pPosq, false);
}
...@@ -186,12 +186,16 @@ void kSetVelocitiesFromPositions_kernel() ...@@ -186,12 +186,16 @@ void kSetVelocitiesFromPositions_kernel()
while (pos < cSim.atoms) while (pos < cSim.atoms)
{ {
float4 posq = cSim.pPosq[pos]; float4 posq = cSim.pPosq[pos];
float4 oldPosq = cSim.pOldPosq[pos]; float4 posqP = cSim.pPosqP[pos];
float4 velm = cSim.pVelm4[pos]; float4 velm = cSim.pVelm4[pos];
velm.x = (float) (oneOverDt*((double)posq.x-(double)oldPosq.x)); velm.x = (float) (oneOverDt*posqP.x);
velm.y = (float) (oneOverDt*((double)posq.y-(double)oldPosq.y)); velm.y = (float) (oneOverDt*posqP.y);
velm.z = (float) (oneOverDt*((double)posq.z-(double)oldPosq.z)); velm.z = (float) (oneOverDt*posqP.z);
cSim.pVelm4[pos] = velm; cSim.pVelm4[pos] = velm;
posq.x += posqP.x;
posq.y += posqP.y;
posq.z += posqP.z;
cSim.pPosq[pos] = posq;
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
} }
......
...@@ -152,7 +152,7 @@ void kLangevinUpdatePart2_kernel() ...@@ -152,7 +152,7 @@ void kLangevinUpdatePart2_kernel()
float4 xPrime = make_float4(dt*velocity.x, dt*velocity.y, dt*velocity.z, cSim.pPosq[pos].w); float4 xPrime = make_float4(dt*velocity.x, dt*velocity.y, dt*velocity.z, cSim.pPosq[pos].w);
cSim.pPosq[pos] = xPrime; cSim.pPosqP[pos] = xPrime;
pos += blockDim.x * gridDim.x; pos += blockDim.x * gridDim.x;
} }
......
...@@ -65,7 +65,7 @@ __launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1) ...@@ -65,7 +65,7 @@ __launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif #endif
void kApplyFirstSettle_kernel() void kApplySettle_kernel()
{ {
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.settleConstraints) while (pos < cSim.settleConstraints)
...@@ -229,178 +229,12 @@ void kApplyFirstSettle_kernel() ...@@ -229,178 +229,12 @@ void kApplyFirstSettle_kernel()
} }
} }
void kApplyFirstSettle(gpuContext gpu) void kApplySettle(gpuContext gpu)
{ {
// printf("kApplyFirstSettle\n"); // printf("kApplySettle\n");
if (gpu->sim.settleConstraints > 0) if (gpu->sim.settleConstraints > 0)
{ {
kApplyFirstSettle_kernel<<<gpu->sim.blocks, gpu->sim.settle_threads_per_block>>>(); kApplySettle_kernel<<<gpu->sim.blocks, gpu->sim.settle_threads_per_block>>>();
LAUNCHERROR("kApplyFirstSettle"); LAUNCHERROR("kApplySettle");
}
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
kApplySecondSettle_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.settleConstraints)
{
int4 atomID = cSim.pSettleID[pos];
float2 params = cSim.pSettleParameter[pos];
float4 apos0 = cSim.pOldPosq[atomID.x];
float4 xp0 = cSim.pPosq[atomID.x];
float4 apos1 = cSim.pOldPosq[atomID.y];
float4 xp1 = cSim.pPosq[atomID.y];
float4 apos2 = cSim.pOldPosq[atomID.z];
float4 xp2 = cSim.pPosq[atomID.z];
float m0 = 1.0f/cSim.pVelm4[atomID.x].w;
float m1 = 1.0f/cSim.pVelm4[atomID.y].w;
float m2 = 1.0f/cSim.pVelm4[atomID.z].w;
float xb0 = apos1.x-apos0.x;
float yb0 = apos1.y-apos0.y;
float zb0 = apos1.z-apos0.z;
float xc0 = apos2.x-apos0.x;
float yc0 = apos2.y-apos0.y;
float zc0 = apos2.z-apos0.z;
float totalMass = m0+m1+m2;
float xcom = ((apos0.x+xp0.x)*m0 + (apos1.x+xp1.x)*m1 + (apos2.x+xp2.x)*m2) / totalMass;
float ycom = ((apos0.y+xp0.y)*m0 + (apos1.y+xp1.y)*m1 + (apos2.y+xp2.y)*m2) / totalMass;
float zcom = ((apos0.z+xp0.z)*m0 + (apos1.z+xp1.z)*m1 + (apos2.z+xp2.z)*m2) / totalMass;
float xa1 = apos0.x + xp0.x - xcom;
float ya1 = apos0.y + xp0.y - ycom;
float za1 = apos0.z + xp0.z - zcom;
float xb1 = apos1.x + xp1.x - xcom;
float yb1 = apos1.y + xp1.y - ycom;
float zb1 = apos1.z + xp1.z - zcom;
float xc1 = apos2.x + xp2.x - xcom;
float yc1 = apos2.y + xp2.y - ycom;
float zc1 = apos2.z + xp2.z - zcom;
float xaksZd = yb0*zc0 - zb0*yc0;
float yaksZd = zb0*xc0 - xb0*zc0;
float zaksZd = xb0*yc0 - yb0*xc0;
float xaksXd = ya1*zaksZd - za1*yaksZd;
float yaksXd = za1*xaksZd - xa1*zaksZd;
float zaksXd = xa1*yaksZd - ya1*xaksZd;
float xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
float yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
float zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
float axlng = sqrt(xaksXd * xaksXd + yaksXd * yaksXd + zaksXd * zaksXd);
float aylng = sqrt(xaksYd * xaksYd + yaksYd * yaksYd + zaksYd * zaksYd);
float azlng = sqrt(xaksZd * xaksZd + yaksZd * yaksZd + zaksZd * zaksZd);
float trns11 = xaksXd / axlng;
float trns21 = yaksXd / axlng;
float trns31 = zaksXd / axlng;
float trns12 = xaksYd / aylng;
float trns22 = yaksYd / aylng;
float trns32 = zaksYd / aylng;
float trns13 = xaksZd / azlng;
float trns23 = yaksZd / azlng;
float trns33 = zaksZd / azlng;
float xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
float yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
float xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
float yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
float za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
float xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
float yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
float zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
float xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
float yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
float zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
// --- Step2 A2' ---
float rc = 0.5f*params.y;
float rb = sqrt(params.x*params.x-rc*rc);
float ra = rb*(m1+m2)/totalMass;
rb -= ra;
float sinphi = za1d / ra;
float cosphi = sqrt(1.0f - sinphi*sinphi);
float sinpsi = ( zb1d - zc1d ) / (2*rc*cosphi);
float cospsi = sqrt(1.0f - sinpsi*sinpsi);
float ya2d = ra * cosphi;
float xb2d = - rc * cospsi;
float yb2d = - rb * cosphi - rc *sinpsi * sinphi;
float yc2d = - rb * cosphi + rc *sinpsi * sinphi;
float xb2d2 = xb2d * xb2d;
float hh2 = 4.0f * xb2d2 + (yb2d-yc2d) * (yb2d-yc2d) + (zb1d-zc1d) * (zb1d-zc1d);
float deltx = 2.0f * xb2d + sqrt ( 4.0f * xb2d2 - hh2 + params.y*params.y );
xb2d -= deltx * 0.5f;
// --- Step3 al,be,ga ---
float alpa = ( xb2d * (xb0d-xc0d) + yb0d * yb2d + yc0d * yc2d );
float beta = ( xb2d * (yc0d-yb0d) + xb0d * yb2d + xc0d * yc2d );
float gama = xb0d * yb1d - xb1d * yb0d + xc0d * yc1d - xc1d * yc0d;
float al2be2 = alpa * alpa + beta * beta;
float sinthe = ( alpa*gama - beta * sqrt ( al2be2 - gama * gama ) ) / al2be2;
// --- Step4 A3' ---
float costhe = sqrt (1.0f - sinthe * sinthe );
float xa3d = - ya2d * sinthe;
float ya3d = ya2d * costhe;
float za3d = za1d;
float xb3d = xb2d * costhe - yb2d * sinthe;
float yb3d = xb2d * sinthe + yb2d * costhe;
float zb3d = zb1d;
float xc3d = - xb2d * costhe - yc2d * sinthe;
float yc3d = - xb2d * sinthe + yc2d * costhe;
float zc3d = zc1d;
// --- Step5 A3 ---
float xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
float ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
float za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
float xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
float yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
float zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
float xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
float yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
float zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
xp0.x = xcom + xa3;
xp0.y = ycom + ya3;
xp0.z = zcom + za3;
xp1.x = xcom + xb3;
xp1.y = ycom + yb3;
xp1.z = zcom + zb3;
xp2.x = xcom + xc3;
xp2.y = ycom + yc3;
xp2.z = zcom + zc3;
cSim.pPosq[atomID.x] = xp0;
cSim.pPosq[atomID.y] = xp1;
cSim.pPosq[atomID.z] = xp2;
pos += blockDim.x * gridDim.x;
}
}
void kApplySecondSettle(gpuContext gpu)
{
// printf("kApplySecondSettle\n");
if (gpu->sim.settleConstraints > 0)
{
kApplySecondSettle_kernel<<<gpu->sim.blocks, gpu->sim.settle_threads_per_block>>>();
LAUNCHERROR("kApplySecondSettle");
} }
} }
...@@ -72,7 +72,7 @@ __launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1) ...@@ -72,7 +72,7 @@ __launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else #else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1) __launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif #endif
void kApplyFirstShake_kernel() void kApplyShake_kernel()
{ {
extern __shared__ Atom sA[]; extern __shared__ Atom sA[];
Atom* psA = &sA[threadIdx.x]; Atom* psA = &sA[threadIdx.x];
...@@ -224,248 +224,13 @@ void kApplyFirstShake_kernel() ...@@ -224,248 +224,13 @@ void kApplyFirstShake_kernel()
} }
} }
void kApplyFirstShake(gpuContext gpu) void kApplyShake(gpuContext gpu)
{ {
// printf("kApplyFirstShake\n"); // printf("kApplyShake\n");
if (gpu->sim.ShakeConstraints > 0) if (gpu->sim.ShakeConstraints > 0)
{ {
kApplyFirstShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block, sizeof(Atom)*gpu->sim.shake_threads_per_block>>>(); kApplyShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block, sizeof(Atom)*gpu->sim.shake_threads_per_block>>>();
LAUNCHERROR("kApplyFirstShake"); LAUNCHERROR("kApplyShake");
} }
} }
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
void kApplySecondShake_kernel()
{
extern __shared__ Atom sA[];
Atom* psA = &sA[threadIdx.x];
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.ShakeConstraints)
{
int4 atomID = cSim.pShakeID[pos];
float4 params = cSim.pShakeParameter[pos];
float4 apos = cSim.pOldPosq[atomID.x];
float4 xpi = cSim.pPosq[atomID.x];
float4 apos1 = cSim.pOldPosq[atomID.y];
float4 xpj1 = cSim.pPosq[atomID.y];
float4 apos2 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
psA->InvMassI = params.x;
psA->M = params.y;
psA->d2 = params.z;
float invMassJ = params.w;
if (atomID.z != -1)
{
apos2 = cSim.pOldPosq[atomID.z];
xpj2 = cSim.pPosq[atomID.z];
}
float4 apos3 = {0.0f, 0.0f, 0.0f, 0.0f};
float4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
if (atomID.w != -1)
{
apos3 = cSim.pOldPosq[atomID.w];
xpj3 = cSim.pPosq[atomID.w];
}
float3 xi, xj1, xj2, xj3;
xi.x = apos.x;
xi.y = apos.y;
xi.z = apos.z;
xj1.x = apos1.x;
xj1.y = apos1.y;
xj1.z = apos1.z;
xj2.x = apos2.x;
xj2.y = apos2.y;
xj2.z = apos2.z;
xj3.x = apos3.x;
xj3.y = apos3.y;
xj3.z = apos3.z;
psA->rij1.x = xi.x - xj1.x;
psA->rij1.y = xi.y - xj1.y;
psA->rij1.z = xi.z - xj1.z;
psA->rij2.x = xi.x - xj2.x;
psA->rij2.y = xi.y - xj2.y;
psA->rij2.z = xi.z - xj2.z;
psA->rij3.x = xi.x - xj3.x;
psA->rij3.y = xi.y - xj3.y;
psA->rij3.z = xi.z - xj3.z;
psA->rij1sq = psA->rij1.x * psA->rij1.x + psA->rij1.y * psA->rij1.y + psA->rij1.z * psA->rij1.z;
psA->rij2sq = psA->rij2.x * psA->rij2.x + psA->rij2.y * psA->rij2.y + psA->rij2.z * psA->rij2.z;
psA->rij3sq = psA->rij3.x * psA->rij3.x + psA->rij3.y * psA->rij3.y + psA->rij3.z * psA->rij3.z;
float ld1 = psA->d2 - psA->rij1sq;
float ld2 = psA->d2 - psA->rij2sq;
float ld3 = psA->d2 - psA->rij3sq;
bool converged = false;
int iteration = 0;
while (iteration < 15 && !converged)
{
converged = true;
float3 rpij;
rpij.x = xpi.x - xpj1.x;
rpij.y = xpi.y - xpj1.y;
rpij.z = xpi.z - xpj1.z;
float rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
float rrpr = psA->rij1.x * rpij.x + psA->rij1.y * rpij.y + psA->rij1.z * rpij.z;
float diff = fabs(ld1 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
float acor = (ld1 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij1sq);
float3 dr;
dr.x = psA->rij1.x * acor;
dr.y = psA->rij1.y * acor;
dr.z = psA->rij1.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj1.x -= dr.x * invMassJ;
xpj1.y -= dr.y * invMassJ;
xpj1.z -= dr.z * invMassJ;
converged = false;
}
if (atomID.z != -1)
{
rpij.x = xpi.x - xpj2.x;
rpij.y = xpi.y - xpj2.y;
rpij.z = xpi.z - xpj2.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij2.x * rpij.x + psA->rij2.y * rpij.y + psA->rij2.z * rpij.z;
diff = fabs(ld2 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
float acor = (ld2 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij2sq);
float3 dr;
dr.x = psA->rij2.x * acor;
dr.y = psA->rij2.y * acor;
dr.z = psA->rij2.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj2.x -= dr.x * invMassJ;
xpj2.y -= dr.y * invMassJ;
xpj2.z -= dr.z * invMassJ;
converged = false;
}
}
if (atomID.w != -1)
{
rpij.x = xpi.x - xpj3.x;
rpij.y = xpi.y - xpj3.y;
rpij.z = xpi.z - xpj3.z;
rpsqij = rpij.x * rpij.x + rpij.y * rpij.y + rpij.z * rpij.z;
rrpr = psA->rij3.x * rpij.x + psA->rij3.y * rpij.y + psA->rij3.z * rpij.z;
diff = fabs(ld3 - 2.0f * rrpr - rpsqij) / (psA->d2 * cSim.shakeTolerance );
if (diff >= 1.0f)
{
float acor = (ld3 - 2.0f * rrpr - rpsqij) * psA->M / (rrpr + psA->rij3sq);
float3 dr;
dr.x = psA->rij3.x * acor;
dr.y = psA->rij3.y * acor;
dr.z = psA->rij3.z * acor;
xpi.x += dr.x * psA->InvMassI;
xpi.y += dr.y * psA->InvMassI;
xpi.z += dr.z * psA->InvMassI;
xpj3.x -= dr.x * invMassJ;
xpj3.y -= dr.y * invMassJ;
xpj3.z -= dr.z * invMassJ;
converged = false;
}
}
iteration++;
}
xpi.x += xi.x;
xpi.y += xi.y;
xpi.z += xi.z;
xpj1.x += xj1.x;
xpj1.y += xj1.y;
xpj1.z += xj1.z;
xpj2.x += xj2.x;
xpj2.y += xj2.y;
xpj2.z += xj2.z;
xpj3.x += xj3.x;
xpj3.y += xj3.y;
xpj3.z += xj3.z;
cSim.pPosq[atomID.x] = xpi;
cSim.pPosq[atomID.y] = xpj1;
if (atomID.z != -1)
cSim.pPosq[atomID.z] = xpj2;
if (atomID.w != -1)
cSim.pPosq[atomID.w] = xpj3;
pos += blockDim.x * gridDim.x;
}
// Update any atoms that SHAKE is not applied to.
pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.NonShakeConstraints)
{
int atomID = cSim.pNonShakeID[pos];
float4 apos = cSim.pOldPosq[atomID];
float4 xpi = cSim.pPosq[atomID];
xpi.x += apos.x;
xpi.y += apos.y;
xpi.z += apos.z;
cSim.pPosq[atomID] = xpi;
pos += blockDim.x * gridDim.x;
}
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_SHAKE_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_SHAKE_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_SHAKE_THREADS_PER_BLOCK, 1)
#endif
kApplyNoShake_kernel()
{
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x;
while (pos < cSim.NonShakeConstraints)
{
int atomID = cSim.pNonShakeID[pos];
float4 apos = cSim.pOldPosq[atomID];
float4 xpi = cSim.pPosq[atomID];
xpi.x += apos.x;
xpi.y += apos.y;
xpi.z += apos.z;
cSim.pPosq[atomID] = xpi;
pos += blockDim.x * gridDim.x;
}
}
void kApplySecondShake(gpuContext gpu)
{
// printf("kApplySecondShake\n");
if (gpu->sim.ShakeConstraints > 0)
{
kApplySecondShake_kernel<<<gpu->sim.blocks, gpu->sim.shake_threads_per_block, sizeof(Atom)*gpu->sim.shake_threads_per_block>>>();
LAUNCHERROR("kApplySecondShake");
}
else if (gpu->sim.NonShakeConstraints > 0)
{
// handle non-Shake atoms
kApplyNoShake_kernel<<<gpu->sim.blocks, gpu->sim.nonshake_threads_per_block>>>();
LAUNCHERROR("kApplyNoShake");
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment