Commit 1d0193eb authored by Peter Eastman's avatar Peter Eastman
Browse files

Restructured CCMA code to work on Fermi boards

parent 5301b9cc
...@@ -460,8 +460,7 @@ struct cudaGmxSimulation { ...@@ -460,8 +460,7 @@ struct cudaGmxSimulation {
float* pCcmaDelta2; // Workspace for CCMA float* pCcmaDelta2; // Workspace for CCMA
int* pCcmaAtomConstraints; // The indices of constraints involving each atom int* pCcmaAtomConstraints; // The indices of constraints involving each atom
int* pCcmaNumAtomConstraints; // The number of constraints involving each atom int* pCcmaNumAtomConstraints; // The number of constraints involving each atom
short* pSyncCounter; // Used for global thread synchronization int* pCcmaConverged; // Used by CCMA to communicate whether iteration has converged
unsigned int* pRequiredIterations; // Used by CCMA to communicate whether iteration has converged
float* pCcmaReducedMass; // The reduced mass for each CCMA constraint float* pCcmaReducedMass; // The reduced mass for each CCMA constraint
unsigned int* pConstraintMatrixColumn; // The column of each element in the constraint matrix. unsigned int* pConstraintMatrixColumn; // The column of each element in the constraint matrix.
float* pConstraintMatrixValue; // The value of each element in the constraint matrix. float* pConstraintMatrixValue; // The value of each element in the constraint matrix.
......
...@@ -1533,12 +1533,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -1533,12 +1533,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream<float>* psCcmaDelta2 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta2"); CUDAStream<float>* psCcmaDelta2 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta2");
gpu->psCcmaDelta2 = psCcmaDelta2; gpu->psCcmaDelta2 = psCcmaDelta2;
gpu->sim.pCcmaDelta2 = psCcmaDelta2->_pDevData; gpu->sim.pCcmaDelta2 = psCcmaDelta2->_pDevData;
CUDAStream<short>* psSyncCounter = new CUDAStream<short>(3*gpu->sim.blocks, 1, "SyncCounter"); CUDAStream<int>* psCcmaConverged = new CUDAStream<int>(gpu->sim.blocks, 1, "CcmaConverged");
gpu->psSyncCounter = psSyncCounter; gpu->psCcmaConverged = psCcmaConverged;
gpu->sim.pSyncCounter = psSyncCounter->_pDevData; gpu->sim.pCcmaConverged = psCcmaConverged->_pDevData;
CUDAStream<unsigned int>* psRequiredIterations = new CUDAStream<unsigned int>(1, 1, "RequiredIterations");
gpu->psRequiredIterations = psRequiredIterations;
gpu->sim.pRequiredIterations = psRequiredIterations->_pDevData;
CUDAStream<float>* psCcmaReducedMass = new CUDAStream<float>(numCCMA, 1, "CcmaReducedMass"); CUDAStream<float>* psCcmaReducedMass = new CUDAStream<float>(numCCMA, 1, "CcmaReducedMass");
gpu->psCcmaReducedMass = psCcmaReducedMass; gpu->psCcmaReducedMass = psCcmaReducedMass;
gpu->sim.pCcmaReducedMass = psCcmaReducedMass->_pDevData; gpu->sim.pCcmaReducedMass = psCcmaReducedMass->_pDevData;
...@@ -1562,8 +1559,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -1562,8 +1559,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
} }
(*psConstraintMatrixColumn)[i+matrix[index].size()*numCCMA] = numCCMA; (*psConstraintMatrixColumn)[i+matrix[index].size()*numCCMA] = numCCMA;
} }
for (unsigned int i = 0; i < psSyncCounter->_length; i++)
(*psSyncCounter)[i] = -1;
for (unsigned int i = 0; i < atomConstraints.size(); i++) { for (unsigned int i = 0; i < atomConstraints.size(); i++) {
(*psCcmaNumAtomConstraints)[i] = atomConstraints[i].size(); (*psCcmaNumAtomConstraints)[i] = atomConstraints[i].size();
for (unsigned int j = 0; j < atomConstraints[i].size(); j++) { for (unsigned int j = 0; j < atomConstraints[i].size(); j++) {
...@@ -1576,7 +1571,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -1576,7 +1571,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
psCcmaReducedMass->Upload(); psCcmaReducedMass->Upload();
psCcmaAtomConstraints->Upload(); psCcmaAtomConstraints->Upload();
psCcmaNumAtomConstraints->Upload(); psCcmaNumAtomConstraints->Upload();
psSyncCounter->Upload();
psConstraintMatrixColumn->Upload(); psConstraintMatrixColumn->Upload();
psConstraintMatrixValue->Upload(); psConstraintMatrixValue->Upload();
gpu->sim.ccma_threads_per_block = (gpu->sim.ccmaConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks; gpu->sim.ccma_threads_per_block = (gpu->sim.ccmaConstraints + gpu->sim.blocks - 1) / gpu->sim.blocks;
...@@ -1994,8 +1988,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1994,8 +1988,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->psCcmaNumAtomConstraints = NULL; gpu->psCcmaNumAtomConstraints = NULL;
gpu->psCcmaDelta1 = NULL; gpu->psCcmaDelta1 = NULL;
gpu->psCcmaDelta2 = NULL; gpu->psCcmaDelta2 = NULL;
gpu->psSyncCounter = NULL; gpu->psCcmaConverged = NULL;
gpu->psRequiredIterations = NULL;
gpu->psCcmaReducedMass = NULL; gpu->psCcmaReducedMass = NULL;
gpu->psConstraintMatrixColumn = NULL; gpu->psConstraintMatrixColumn = NULL;
gpu->psConstraintMatrixValue = NULL; gpu->psConstraintMatrixValue = NULL;
...@@ -2214,8 +2207,7 @@ void gpuShutDown(gpuContext gpu) ...@@ -2214,8 +2207,7 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psCcmaNumAtomConstraints; delete gpu->psCcmaNumAtomConstraints;
delete gpu->psCcmaDelta1; delete gpu->psCcmaDelta1;
delete gpu->psCcmaDelta2; delete gpu->psCcmaDelta2;
delete gpu->psSyncCounter; delete gpu->psCcmaConverged;
delete gpu->psRequiredIterations;
delete gpu->psCcmaReducedMass; delete gpu->psCcmaReducedMass;
delete gpu->psConstraintMatrixColumn; delete gpu->psConstraintMatrixColumn;
delete gpu->psConstraintMatrixValue; delete gpu->psConstraintMatrixValue;
......
...@@ -171,9 +171,8 @@ struct _gpuContext { ...@@ -171,9 +171,8 @@ struct _gpuContext {
CUDAStream<int>* psCcmaNumAtomConstraints; // The number of constraints involving each atom CUDAStream<int>* psCcmaNumAtomConstraints; // The number of constraints involving each atom
CUDAStream<float>* psCcmaDelta1; // Workspace for CCMA CUDAStream<float>* psCcmaDelta1; // Workspace for CCMA
CUDAStream<float>* psCcmaDelta2; // Workspace for CCMA CUDAStream<float>* psCcmaDelta2; // Workspace for CCMA
CUDAStream<short>* psSyncCounter; // Used for global thread synchronization CUDAStream<int>* psCcmaConverged; // Used by CCMA to communicate whether iteration has converged
CUDAStream<unsigned int>* psRequiredIterations; // Used by SHAKE to communicate whether iteration has converged CUDAStream<float>* psCcmaReducedMass; // The reduced mass for each CCMA constraint
CUDAStream<float>* psCcmaReducedMass; // The reduced mass for each SHAKE constraint
CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints. CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices. CUDAStream<unsigned int>* psRigidClusterMatrixIndex; // The index of each cluster in the stream containing cluster matrices.
......
...@@ -48,24 +48,6 @@ void GetCCMASim(gpuContext gpu) ...@@ -48,24 +48,6 @@ void GetCCMASim(gpuContext gpu)
RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed"); RTERROR(status, "cudaMemcpyFromSymbol: SetSim copy from cSim failed");
} }
/**
* Synchronize all threads across all blocks.
*/
__device__ void kSyncAllThreads_kernel(short* syncCounter, short newCount)
{
__syncthreads();
if (threadIdx.x == 0)
syncCounter[blockIdx.x] = newCount;
if (threadIdx.x < gridDim.x)
{
volatile short* counter = &syncCounter[threadIdx.x];
do
{
} while (*counter != newCount);
}
__syncthreads();
}
__global__ void __global__ void
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1) __launch_bounds__(1024, 1)
...@@ -74,52 +56,53 @@ __launch_bounds__(512, 1) ...@@ -74,52 +56,53 @@ __launch_bounds__(512, 1)
#else #else
__launch_bounds__(256, 1) __launch_bounds__(256, 1)
#endif #endif
kApplyCCMA_kernel(float4* atomPositions, bool addOldPosition) kComputeCCMAConstraintDirections()
{ {
// Initialize counters used for monitoring convergence and doing global thread synchronization.
__shared__ unsigned int requiredIterations;
if (threadIdx.x == 0)
{
requiredIterations = 0;
cSim.pSyncCounter[gridDim.x+blockIdx.x] = -1;
cSim.pSyncCounter[2*gridDim.x+blockIdx.x] = -1;
cSim.pRequiredIterations[0] = 0;
}
// Calculate the direction of each constraint. // Calculate the direction of each constraint.
unsigned int pos = threadIdx.x + blockIdx.x * blockDim.x; for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
while (pos < cSim.ccmaConstraints)
{ {
int2 atoms = cSim.pCcmaAtoms[pos]; int2 atoms = cSim.pCcmaAtoms[index];
float4 dir = cSim.pCcmaDistance[pos]; float4 dir = cSim.pCcmaDistance[index];
float4 oldPos1 = cSim.pOldPosq[atoms.x]; float4 oldPos1 = cSim.pOldPosq[atoms.x];
float4 oldPos2 = cSim.pOldPosq[atoms.y]; float4 oldPos2 = cSim.pOldPosq[atoms.y];
dir.x = oldPos1.x-oldPos2.x; dir.x = oldPos1.x-oldPos2.x;
dir.y = oldPos1.y-oldPos2.y; dir.y = oldPos1.y-oldPos2.y;
dir.z = oldPos1.z-oldPos2.z; dir.z = oldPos1.z-oldPos2.z;
cSim.pCcmaDistance[pos] = dir; cSim.pCcmaDistance[index] = dir;
pos += blockDim.x*gridDim.x;
} }
__syncthreads();
// Iteratively update the atom positions. // Mark that no blocks have converged yet.
unsigned int maxIterations = 150; for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < gridDim.x; index += blockDim.x*gridDim.x)
cSim.pCcmaConverged[index] = false;
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
{
if (cSim.pCcmaConverged[blockIdx.x])
return; // The constraint iteration has already converged.
extern __shared__ int convergedBuffer[];
float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance; float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance; float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
for (unsigned int iteration = 0; iteration < maxIterations; iteration++) int threadConverged = 1;
{
// Calculate the constraint force for each constraint. // Calculate the constraint force for each constraint.
pos = threadIdx.x + blockIdx.x * blockDim.x; for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
while (pos < cSim.ccmaConstraints)
{ {
int2 atoms = cSim.pCcmaAtoms[pos]; int2 atoms = cSim.pCcmaAtoms[index];
float4 delta1 = atomPositions[atoms.x]; float4 delta1 = atomPositions[atoms.x];
float4 delta2 = atomPositions[atoms.y]; float4 delta2 = atomPositions[atoms.y];
float4 dir = cSim.pCcmaDistance[pos]; float4 dir = cSim.pCcmaDistance[index];
float3 rp_ij = make_float3(delta1.x-delta2.x, delta1.y-delta2.y, delta1.z-delta2.z); float3 rp_ij = make_float3(delta1.x-delta2.x, delta1.y-delta2.y, delta1.z-delta2.z);
if (addOldPosition) if (addOldPosition)
{ {
...@@ -132,51 +115,92 @@ kApplyCCMA_kernel(float4* atomPositions, bool addOldPosition) ...@@ -132,51 +115,92 @@ kApplyCCMA_kernel(float4* atomPositions, bool addOldPosition)
float diff = dist2 - rp2; float diff = dist2 - rp2;
float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z; float rrpr = rp_ij.x*dir.x + rp_ij.y*dir.y + rp_ij.z*dir.z;
float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z; float d_ij2 = dir.x*dir.x + dir.y*dir.y + dir.z*dir.z;
float reducedMass = cSim.pCcmaReducedMass[pos]; float reducedMass = cSim.pCcmaReducedMass[index];
cSim.pCcmaDelta1[pos] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f); cSim.pCcmaDelta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass*diff/rrpr : 0.0f);
if (requiredIterations == iteration && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2))
requiredIterations = iteration+1; // See whether it has converged.
pos += blockDim.x * gridDim.x;
threadConverged &= (rp2 >= lowerTol*dist2 && rp2 <= upperTol*dist2);
} }
// Perform a parallel reduction to see if all constraints handled by this block have converged.
convergedBuffer[threadIdx.x] = threadConverged;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x%(2*step) == 0)
convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
__syncthreads();
}
if (threadIdx.x == 0)
cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
}
__global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
kMultiplyByCCMAConstraintMatrix()
{
extern __shared__ int convergedBuffer[];
// First see whether all work groups have converged.
convergedBuffer[threadIdx.x] = true;
for (int index = threadIdx.x; index < gridDim.x; index += blockDim.x)
convergedBuffer[threadIdx.x] &= cSim.pCcmaConverged[index];
__syncthreads();
for (int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x%(2*step) == 0)
convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
__syncthreads(); __syncthreads();
if (threadIdx.x == 0 && requiredIterations > iteration) }
cSim.pRequiredIterations[0] = requiredIterations; if (threadIdx.x == 0)
kSyncAllThreads_kernel(cSim.pSyncCounter, iteration); cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
if (iteration == cSim.pRequiredIterations[0]) if (cSim.pCcmaConverged[0])
break; // All constraints have converged. return; // The constraint iteration has already converged.
// Multiply by the inverse constraint matrix. // Multiply by the inverse constraint matrix.
pos = threadIdx.x + blockIdx.x * blockDim.x; for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.ccmaConstraints; index += blockDim.x*gridDim.x)
while (pos < cSim.ccmaConstraints)
{ {
float sum = 0.0f; float sum = 0.0f;
for (unsigned int i = 0; ; i++) for (unsigned int i = 0; ; i++)
{ {
unsigned int index = pos+i*cSim.ccmaConstraints; unsigned int element = index+i*cSim.ccmaConstraints;
unsigned int column = cSim.pConstraintMatrixColumn[index]; unsigned int column = cSim.pConstraintMatrixColumn[element];
if (column >= cSim.ccmaConstraints) if (column >= cSim.ccmaConstraints)
break; break;
sum += cSim.pCcmaDelta1[column]*cSim.pConstraintMatrixValue[index]; sum += cSim.pCcmaDelta1[column]*cSim.pConstraintMatrixValue[element];
} }
cSim.pCcmaDelta2[pos] = sum; cSim.pCcmaDelta2[index] = sum;
pos += blockDim.x * gridDim.x;
} }
kSyncAllThreads_kernel(&cSim.pSyncCounter[gridDim.x], iteration); }
// Update the position of each atom.
pos = threadIdx.x + blockIdx.x * blockDim.x; __global__ void
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
kUpdateCCMAAtomPositions(float4* atomPositions, int iteration)
{
if (cSim.pCcmaConverged[blockIdx.x])
return; // The constraint iteration has already converged.
float damping = (iteration < 2 ? 0.5f : 1.0f); float damping = (iteration < 2 ? 0.5f : 1.0f);
while (pos < cSim.atoms) for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.atoms; index += blockDim.x*gridDim.x)
{ {
float4 atomPos = atomPositions[pos]; float4 atomPos = atomPositions[index];
float invMass = cSim.pVelm4[pos].w; float invMass = cSim.pVelm4[index].w;
int num = cSim.pCcmaNumAtomConstraints[pos]; int num = cSim.pCcmaNumAtomConstraints[index];
for (int i = 0; i < num; i++) for (int i = 0; i < num; i++)
{ {
int index = pos+i*cSim.atoms; int constraint = cSim.pCcmaAtomConstraints[index+i*cSim.atoms];
int constraint = cSim.pCcmaAtomConstraints[index];
bool forward = (constraint > 0); bool forward = (constraint > 0);
constraint = (forward ? constraint-1 : -constraint-1); constraint = (forward ? constraint-1 : -constraint-1);
float constraintForce = damping*invMass*cSim.pCcmaDelta2[constraint]; float constraintForce = damping*invMass*cSim.pCcmaDelta2[constraint];
...@@ -186,36 +210,34 @@ kApplyCCMA_kernel(float4* atomPositions, bool addOldPosition) ...@@ -186,36 +210,34 @@ kApplyCCMA_kernel(float4* atomPositions, bool addOldPosition)
atomPos.y += constraintForce*dir.y; atomPos.y += constraintForce*dir.y;
atomPos.z += constraintForce*dir.z; atomPos.z += constraintForce*dir.z;
} }
atomPositions[pos] = atomPos; atomPositions[index] = atomPos;
pos += blockDim.x*gridDim.x;
}
if (threadIdx.x == 0)
requiredIterations = iteration+1;
kSyncAllThreads_kernel(&cSim.pSyncCounter[2*gridDim.x], iteration);
} }
}
// Reset the initial sync counter to be ready for the next call. void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition)
{
if (threadIdx.x == 0) kComputeCCMAConstraintDirections<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>();
cSim.pSyncCounter[blockIdx.x] = -1; LAUNCHERROR("kComputeCCMAConstraintDirections");
for (int i = 0; i < 150; i++) {
kComputeCCMAConstraintForces<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, addOldPosition);
kMultiplyByCCMAConstraintMatrix<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>();
gpu->psCcmaConverged->Download();
if ((*gpu->psCcmaConverged)[0])
break;
kUpdateCCMAAtomPositions<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, i);
}
} }
void kApplyFirstCCMA(gpuContext gpu) void kApplyFirstCCMA(gpuContext gpu)
{ {
// printf("kApplyFirstCCMA\n"); // printf("kApplyFirstCCMA\n");
if (gpu->sim.ccmaConstraints > 0) if (gpu->sim.ccmaConstraints > 0)
{ kApplyCCMA(gpu, gpu->sim.pPosqP, true);
kApplyCCMA_kernel<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(gpu->sim.pPosqP, true);
LAUNCHERROR("kApplyCCMA");
}
} }
void kApplySecondCCMA(gpuContext gpu) void kApplySecondCCMA(gpuContext gpu)
{ {
// printf("kApplySecondCCMA\n"); // printf("kApplySecondCCMA\n");
if (gpu->sim.ccmaConstraints > 0) if (gpu->sim.ccmaConstraints > 0)
{ kApplyCCMA(gpu, gpu->sim.pPosq, false);
kApplyCCMA_kernel<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(gpu->sim.pPosq, false);
LAUNCHERROR("kApplyCCMA");
}
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment