Commit 6768d049 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to the CUDA implementation of CCMA

parent c961d453
......@@ -460,7 +460,7 @@ struct cudaGmxSimulation {
float* pCcmaDelta2; // Workspace for CCMA
int* pCcmaAtomConstraints; // The indices of constraints involving each atom
int* pCcmaNumAtomConstraints; // The number of constraints involving each atom
int* pCcmaConverged; // Used by CCMA to communicate whether iteration has converged
int* ccmaConvergedDeviceMarker; // Device memory used to communicate that CCMA has converged
float* pCcmaReducedMass; // The reduced mass for each CCMA constraint
unsigned int* pConstraintMatrixColumn; // The column of each element in the constraint matrix.
float* pConstraintMatrixValue; // The value of each element in the constraint matrix.
......
......@@ -1488,7 +1488,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
for (int j = 0; j < numCCMA; j++) {
double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
if (abs(value) > 0.1)
if (abs(value) > 0.05)
matrix[j].push_back(pair<int, double>(i, value));
}
}
......@@ -1533,9 +1533,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream<float>* psCcmaDelta2 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta2");
gpu->psCcmaDelta2 = psCcmaDelta2;
gpu->sim.pCcmaDelta2 = psCcmaDelta2->_pDevData;
CUDAStream<int>* psCcmaConverged = new CUDAStream<int>(gpu->sim.blocks, 1, "CcmaConverged");
gpu->psCcmaConverged = psCcmaConverged;
gpu->sim.pCcmaConverged = psCcmaConverged->_pDevData;
CUDAStream<float>* psCcmaReducedMass = new CUDAStream<float>(numCCMA, 1, "CcmaReducedMass");
gpu->psCcmaReducedMass = psCcmaReducedMass;
gpu->sim.pCcmaReducedMass = psCcmaReducedMass->_pDevData;
......@@ -1545,6 +1542,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream<float>* psConstraintMatrixValue = new CUDAStream<float>(numCCMA*maxRowElements, 1, "ConstraintMatrixValue");
gpu->psConstraintMatrixValue = psConstraintMatrixValue;
gpu->sim.pConstraintMatrixValue = psConstraintMatrixValue->_pDevData;
cudaHostAlloc((void**) &gpu->ccmaConvergedHostMarker, sizeof(int), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**) &gpu->sim.ccmaConvergedDeviceMarker, (void*) gpu->ccmaConvergedHostMarker, 0);
cudaEventCreate(&gpu->ccmaEvent);
gpu->sim.ccmaConstraints = numCCMA;
for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i];
......@@ -1802,7 +1802,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
cudaSetDevice(device); // Ignore errors
status = cudaGetDevice(&gpu->device);
RTERROR(status, "Error getting CUDA device")
status = cudaSetDeviceFlags(useBlockingSync ? cudaDeviceBlockingSync : cudaDeviceScheduleAuto);
status = cudaSetDeviceFlags(cudaDeviceMapHost+(useBlockingSync ? cudaDeviceBlockingSync : cudaDeviceScheduleAuto));
RTERROR(status, "Error setting device flags")
gpu->useBlockingSync = useBlockingSync;
......@@ -1988,7 +1988,6 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->psCcmaNumAtomConstraints = NULL;
gpu->psCcmaDelta1 = NULL;
gpu->psCcmaDelta2 = NULL;
gpu->psCcmaConverged = NULL;
gpu->psCcmaReducedMass = NULL;
gpu->psConstraintMatrixColumn = NULL;
gpu->psConstraintMatrixValue = NULL;
......@@ -2207,8 +2206,8 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psCcmaNumAtomConstraints;
delete gpu->psCcmaDelta1;
delete gpu->psCcmaDelta2;
delete gpu->psCcmaConverged;
delete gpu->psCcmaReducedMass;
cudaEventDestroy(gpu->ccmaEvent);
delete gpu->psConstraintMatrixColumn;
delete gpu->psConstraintMatrixValue;
delete gpu->psTabulatedFunctionParams;
......
......@@ -171,7 +171,8 @@ struct _gpuContext {
CUDAStream<int>* psCcmaNumAtomConstraints; // The number of constraints involving each atom
CUDAStream<float>* psCcmaDelta1; // Workspace for CCMA
CUDAStream<float>* psCcmaDelta2; // Workspace for CCMA
CUDAStream<int>* psCcmaConverged; // Used by CCMA to communicate whether iteration has converged
int* ccmaConvergedHostMarker; // Host memory used to communicate that CCMA has converged
cudaEvent_t ccmaEvent; // Used to optimize communication during CCMA
CUDAStream<float>* psCcmaReducedMass; // The reduced mass for each CCMA constraint
CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
......
......@@ -71,11 +71,6 @@ kComputeCCMAConstraintDirections()
dir.z = oldPos1.z-oldPos2.z;
cSim.pCcmaDistance[index] = dir;
}
// Mark that no blocks have converged yet.
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < gridDim.x; index += blockDim.x*gridDim.x)
cSim.pCcmaConverged[index] = 0;
}
__global__ void
......@@ -88,12 +83,12 @@ __launch_bounds__(256, 1)
#endif
kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
{
if (cSim.pCcmaConverged[blockIdx.x])
return; // The constraint iteration has already converged.
extern __shared__ int convergedBuffer[];
__shared__ int converged;
float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
int threadConverged = 1;
if (threadIdx.x == 0)
converged = 1;
__syncthreads();
// Calculate the constraint force for each constraint.
......@@ -120,20 +115,12 @@ kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
// See whether it has converged.
threadConverged &= (rp2 >= lowerTol*dist2 && rp2 <= upperTol*dist2);
if (converged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2))
{
converged = 0;
*cSim.ccmaConvergedDeviceMarker = 0;
}
// Perform a parallel reduction to see if all constraints handled by this block have converged.
convergedBuffer[threadIdx.x] = threadConverged;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x%(2*step) == 0)
convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
__syncthreads();
}
if (threadIdx.x == 0)
cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
}
__global__ void
......@@ -146,22 +133,8 @@ __launch_bounds__(256, 1)
#endif
kMultiplyByCCMAConstraintMatrix()
{
extern __shared__ int convergedBuffer[];
// First see whether all work groups have converged.
convergedBuffer[threadIdx.x] = true;
for (int index = threadIdx.x; index < gridDim.x; index += blockDim.x)
convergedBuffer[threadIdx.x] &= cSim.pCcmaConverged[index];
__syncthreads();
for (int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x%(2*step) == 0)
convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
__syncthreads();
}
if (threadIdx.x == 0)
cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
if (cSim.pCcmaConverged[0])
return; // The constraint iteration has already converged.
if (*cSim.ccmaConvergedDeviceMarker)
return; // The constraint iteration has already converged
// Multiply by the inverse constraint matrix.
......@@ -190,7 +163,7 @@ __launch_bounds__(256, 1)
#endif
kUpdateCCMAAtomPositions(float4* atomPositions, int iteration)
{
if (cSim.pCcmaConverged[blockIdx.x])
if (*cSim.ccmaConvergedDeviceMarker)
return; // The constraint iteration has already converged.
float damping = (iteration < 2 ? 0.5f : 1.0f);
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.atoms; index += blockDim.x*gridDim.x)
......@@ -218,13 +191,17 @@ void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition)
{
kComputeCCMAConstraintDirections<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>();
LAUNCHERROR("kComputeCCMAConstraintDirections");
const int checkInterval = 3;
for (int i = 0; i < 150; i++) {
if ((i+1)%checkInterval == 0)
*gpu->ccmaConvergedHostMarker = 1;
kComputeCCMAConstraintForces<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>(posq, addOldPosition);
gpu->psCcmaConverged->Download();
cudaEventRecord(gpu->ccmaEvent, 0);
kMultiplyByCCMAConstraintMatrix<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>();
if ((*gpu->psCcmaConverged)[0])
kUpdateCCMAAtomPositions<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, 3*i+2);
cudaEventSynchronize(gpu->ccmaEvent);
if ((i+1)%checkInterval == 0 && *gpu->ccmaConvergedHostMarker)
break;
kUpdateCCMAAtomPositions<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, i);
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment