Commit 6768d049 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimizations to the CUDA implementation of CCMA

parent c961d453
...@@ -460,7 +460,7 @@ struct cudaGmxSimulation { ...@@ -460,7 +460,7 @@ struct cudaGmxSimulation {
float* pCcmaDelta2; // Workspace for CCMA float* pCcmaDelta2; // Workspace for CCMA
int* pCcmaAtomConstraints; // The indices of constraints involving each atom int* pCcmaAtomConstraints; // The indices of constraints involving each atom
int* pCcmaNumAtomConstraints; // The number of constraints involving each atom int* pCcmaNumAtomConstraints; // The number of constraints involving each atom
int* pCcmaConverged; // Used by CCMA to communicate whether iteration has converged int* ccmaConvergedDeviceMarker; // Device memory used to communicate that CCMA has converged
float* pCcmaReducedMass; // The reduced mass for each CCMA constraint float* pCcmaReducedMass; // The reduced mass for each CCMA constraint
unsigned int* pConstraintMatrixColumn; // The column of each element in the constraint matrix. unsigned int* pConstraintMatrixColumn; // The column of each element in the constraint matrix.
float* pConstraintMatrixValue; // The value of each element in the constraint matrix. float* pConstraintMatrixValue; // The value of each element in the constraint matrix.
......
...@@ -1488,7 +1488,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -1488,7 +1488,7 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]); result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
for (int j = 0; j < numCCMA; j++) { for (int j = 0; j < numCCMA; j++) {
double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]]; double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
if (abs(value) > 0.1) if (abs(value) > 0.05)
matrix[j].push_back(pair<int, double>(i, value)); matrix[j].push_back(pair<int, double>(i, value));
} }
} }
...@@ -1533,9 +1533,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -1533,9 +1533,6 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream<float>* psCcmaDelta2 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta2"); CUDAStream<float>* psCcmaDelta2 = new CUDAStream<float>(numCCMA, 1, "CcmaDelta2");
gpu->psCcmaDelta2 = psCcmaDelta2; gpu->psCcmaDelta2 = psCcmaDelta2;
gpu->sim.pCcmaDelta2 = psCcmaDelta2->_pDevData; gpu->sim.pCcmaDelta2 = psCcmaDelta2->_pDevData;
CUDAStream<int>* psCcmaConverged = new CUDAStream<int>(gpu->sim.blocks, 1, "CcmaConverged");
gpu->psCcmaConverged = psCcmaConverged;
gpu->sim.pCcmaConverged = psCcmaConverged->_pDevData;
CUDAStream<float>* psCcmaReducedMass = new CUDAStream<float>(numCCMA, 1, "CcmaReducedMass"); CUDAStream<float>* psCcmaReducedMass = new CUDAStream<float>(numCCMA, 1, "CcmaReducedMass");
gpu->psCcmaReducedMass = psCcmaReducedMass; gpu->psCcmaReducedMass = psCcmaReducedMass;
gpu->sim.pCcmaReducedMass = psCcmaReducedMass->_pDevData; gpu->sim.pCcmaReducedMass = psCcmaReducedMass->_pDevData;
...@@ -1545,6 +1542,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const ...@@ -1545,6 +1542,9 @@ void gpuSetConstraintParameters(gpuContext gpu, const vector<int>& atom1, const
CUDAStream<float>* psConstraintMatrixValue = new CUDAStream<float>(numCCMA*maxRowElements, 1, "ConstraintMatrixValue"); CUDAStream<float>* psConstraintMatrixValue = new CUDAStream<float>(numCCMA*maxRowElements, 1, "ConstraintMatrixValue");
gpu->psConstraintMatrixValue = psConstraintMatrixValue; gpu->psConstraintMatrixValue = psConstraintMatrixValue;
gpu->sim.pConstraintMatrixValue = psConstraintMatrixValue->_pDevData; gpu->sim.pConstraintMatrixValue = psConstraintMatrixValue->_pDevData;
cudaHostAlloc((void**) &gpu->ccmaConvergedHostMarker, sizeof(int), cudaHostAllocMapped);
cudaHostGetDevicePointer((void**) &gpu->sim.ccmaConvergedDeviceMarker, (void*) gpu->ccmaConvergedHostMarker, 0);
cudaEventCreate(&gpu->ccmaEvent);
gpu->sim.ccmaConstraints = numCCMA; gpu->sim.ccmaConstraints = numCCMA;
for (int i = 0; i < numCCMA; i++) { for (int i = 0; i < numCCMA; i++) {
int index = constraintOrder[i]; int index = constraintOrder[i];
...@@ -1802,7 +1802,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1802,7 +1802,7 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
cudaSetDevice(device); // Ignore errors cudaSetDevice(device); // Ignore errors
status = cudaGetDevice(&gpu->device); status = cudaGetDevice(&gpu->device);
RTERROR(status, "Error getting CUDA device") RTERROR(status, "Error getting CUDA device")
status = cudaSetDeviceFlags(useBlockingSync ? cudaDeviceBlockingSync : cudaDeviceScheduleAuto); status = cudaSetDeviceFlags(cudaDeviceMapHost+(useBlockingSync ? cudaDeviceBlockingSync : cudaDeviceScheduleAuto));
RTERROR(status, "Error setting device flags") RTERROR(status, "Error setting device flags")
gpu->useBlockingSync = useBlockingSync; gpu->useBlockingSync = useBlockingSync;
...@@ -1988,7 +1988,6 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync) ...@@ -1988,7 +1988,6 @@ void* gpuInit(int numAtoms, unsigned int device, bool useBlockingSync)
gpu->psCcmaNumAtomConstraints = NULL; gpu->psCcmaNumAtomConstraints = NULL;
gpu->psCcmaDelta1 = NULL; gpu->psCcmaDelta1 = NULL;
gpu->psCcmaDelta2 = NULL; gpu->psCcmaDelta2 = NULL;
gpu->psCcmaConverged = NULL;
gpu->psCcmaReducedMass = NULL; gpu->psCcmaReducedMass = NULL;
gpu->psConstraintMatrixColumn = NULL; gpu->psConstraintMatrixColumn = NULL;
gpu->psConstraintMatrixValue = NULL; gpu->psConstraintMatrixValue = NULL;
...@@ -2207,8 +2206,8 @@ void gpuShutDown(gpuContext gpu) ...@@ -2207,8 +2206,8 @@ void gpuShutDown(gpuContext gpu)
delete gpu->psCcmaNumAtomConstraints; delete gpu->psCcmaNumAtomConstraints;
delete gpu->psCcmaDelta1; delete gpu->psCcmaDelta1;
delete gpu->psCcmaDelta2; delete gpu->psCcmaDelta2;
delete gpu->psCcmaConverged;
delete gpu->psCcmaReducedMass; delete gpu->psCcmaReducedMass;
cudaEventDestroy(gpu->ccmaEvent);
delete gpu->psConstraintMatrixColumn; delete gpu->psConstraintMatrixColumn;
delete gpu->psConstraintMatrixValue; delete gpu->psConstraintMatrixValue;
delete gpu->psTabulatedFunctionParams; delete gpu->psTabulatedFunctionParams;
......
...@@ -171,7 +171,8 @@ struct _gpuContext { ...@@ -171,7 +171,8 @@ struct _gpuContext {
CUDAStream<int>* psCcmaNumAtomConstraints; // The number of constraints involving each atom CUDAStream<int>* psCcmaNumAtomConstraints; // The number of constraints involving each atom
CUDAStream<float>* psCcmaDelta1; // Workspace for CCMA CUDAStream<float>* psCcmaDelta1; // Workspace for CCMA
CUDAStream<float>* psCcmaDelta2; // Workspace for CCMA CUDAStream<float>* psCcmaDelta2; // Workspace for CCMA
CUDAStream<int>* psCcmaConverged; // Used by CCMA to communicate whether iteration has converged int* ccmaConvergedHostMarker; // Host memory used to communicate that CCMA has converged
cudaEvent_t ccmaEvent; // Used to optimize communication during CCMA
CUDAStream<float>* psCcmaReducedMass; // The reduced mass for each CCMA constraint CUDAStream<float>* psCcmaReducedMass; // The reduced mass for each CCMA constraint
CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster CUDAStream<float>* psRigidClusterMatrix;// The inverse constraint matrix for each rigid cluster
CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints. CUDAStream<unsigned int>* psRigidClusterConstraintIndex; // The index of each cluster in the stream containing cluster constraints.
......
...@@ -71,11 +71,6 @@ kComputeCCMAConstraintDirections() ...@@ -71,11 +71,6 @@ kComputeCCMAConstraintDirections()
dir.z = oldPos1.z-oldPos2.z; dir.z = oldPos1.z-oldPos2.z;
cSim.pCcmaDistance[index] = dir; cSim.pCcmaDistance[index] = dir;
} }
// Mark that no blocks have converged yet.
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < gridDim.x; index += blockDim.x*gridDim.x)
cSim.pCcmaConverged[index] = 0;
} }
__global__ void __global__ void
...@@ -88,12 +83,12 @@ __launch_bounds__(256, 1) ...@@ -88,12 +83,12 @@ __launch_bounds__(256, 1)
#endif #endif
kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition) kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
{ {
if (cSim.pCcmaConverged[blockIdx.x]) __shared__ int converged;
return; // The constraint iteration has already converged.
extern __shared__ int convergedBuffer[];
float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance; float lowerTol = 1.0f-2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance; float upperTol = 1.0f+2.0f*cSim.shakeTolerance+cSim.shakeTolerance*cSim.shakeTolerance;
int threadConverged = 1; if (threadIdx.x == 0)
converged = 1;
__syncthreads();
// Calculate the constraint force for each constraint. // Calculate the constraint force for each constraint.
...@@ -120,20 +115,12 @@ kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition) ...@@ -120,20 +115,12 @@ kComputeCCMAConstraintForces(float4* atomPositions, bool addOldPosition)
// See whether it has converged. // See whether it has converged.
threadConverged &= (rp2 >= lowerTol*dist2 && rp2 <= upperTol*dist2); if (converged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2))
} {
converged = 0;
// Perform a parallel reduction to see if all constraints handled by this block have converged. *cSim.ccmaConvergedDeviceMarker = 0;
}
convergedBuffer[threadIdx.x] = threadConverged;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x%(2*step) == 0)
convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
__syncthreads();
} }
if (threadIdx.x == 0)
cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
} }
__global__ void __global__ void
...@@ -146,22 +133,8 @@ __launch_bounds__(256, 1) ...@@ -146,22 +133,8 @@ __launch_bounds__(256, 1)
#endif #endif
kMultiplyByCCMAConstraintMatrix() kMultiplyByCCMAConstraintMatrix()
{ {
extern __shared__ int convergedBuffer[]; if (*cSim.ccmaConvergedDeviceMarker)
// First see whether all work groups have converged. return; // The constraint iteration has already converged
convergedBuffer[threadIdx.x] = true;
for (int index = threadIdx.x; index < gridDim.x; index += blockDim.x)
convergedBuffer[threadIdx.x] &= cSim.pCcmaConverged[index];
__syncthreads();
for (int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x%(2*step) == 0)
convergedBuffer[threadIdx.x] &= convergedBuffer[threadIdx.x+step];
__syncthreads();
}
if (threadIdx.x == 0)
cSim.pCcmaConverged[blockIdx.x] = convergedBuffer[0];
if (cSim.pCcmaConverged[0])
return; // The constraint iteration has already converged.
// Multiply by the inverse constraint matrix. // Multiply by the inverse constraint matrix.
...@@ -190,7 +163,7 @@ __launch_bounds__(256, 1) ...@@ -190,7 +163,7 @@ __launch_bounds__(256, 1)
#endif #endif
kUpdateCCMAAtomPositions(float4* atomPositions, int iteration) kUpdateCCMAAtomPositions(float4* atomPositions, int iteration)
{ {
if (cSim.pCcmaConverged[blockIdx.x]) if (*cSim.ccmaConvergedDeviceMarker)
return; // The constraint iteration has already converged. return; // The constraint iteration has already converged.
float damping = (iteration < 2 ? 0.5f : 1.0f); float damping = (iteration < 2 ? 0.5f : 1.0f);
for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.atoms; index += blockDim.x*gridDim.x) for (unsigned int index = threadIdx.x+blockIdx.x*blockDim.x; index < cSim.atoms; index += blockDim.x*gridDim.x)
...@@ -218,13 +191,17 @@ void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition) ...@@ -218,13 +191,17 @@ void kApplyCCMA(gpuContext gpu, float4* posq, bool addOldPosition)
{ {
kComputeCCMAConstraintDirections<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(); kComputeCCMAConstraintDirections<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>();
LAUNCHERROR("kComputeCCMAConstraintDirections"); LAUNCHERROR("kComputeCCMAConstraintDirections");
const int checkInterval = 3;
for (int i = 0; i < 150; i++) { for (int i = 0; i < 150; i++) {
if ((i+1)%checkInterval == 0)
*gpu->ccmaConvergedHostMarker = 1;
kComputeCCMAConstraintForces<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>(posq, addOldPosition); kComputeCCMAConstraintForces<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>(posq, addOldPosition);
gpu->psCcmaConverged->Download(); cudaEventRecord(gpu->ccmaEvent, 0);
kMultiplyByCCMAConstraintMatrix<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>(); kMultiplyByCCMAConstraintMatrix<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block, gpu->sim.ccma_threads_per_block*sizeof(int)>>>();
if ((*gpu->psCcmaConverged)[0]) kUpdateCCMAAtomPositions<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, 3*i+2);
cudaEventSynchronize(gpu->ccmaEvent);
if ((i+1)%checkInterval == 0 && *gpu->ccmaConvergedHostMarker)
break; break;
kUpdateCCMAAtomPositions<<<gpu->sim.blocks, gpu->sim.ccma_threads_per_block>>>(posq, i);
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment