Commit 62a83f80 authored by Peter Eastman's avatar Peter Eastman
Browse files

Optimized OpenCL version of CCMA

parent e1a9418d
...@@ -450,7 +450,7 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c ...@@ -450,7 +450,7 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
ccmaNumAtomConstraints = new OpenCLArray<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex"); ccmaNumAtomConstraints = new OpenCLArray<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaDelta1 = new OpenCLArray<cl_float>(context, numCCMA, "CcmaDelta1"); ccmaDelta1 = new OpenCLArray<cl_float>(context, numCCMA, "CcmaDelta1");
ccmaDelta2 = new OpenCLArray<cl_float>(context, numCCMA, "CcmaDelta2"); ccmaDelta2 = new OpenCLArray<cl_float>(context, numCCMA, "CcmaDelta2");
ccmaConverged = new OpenCLArray<cl_int>(context, context.getNumThreadBlocks(), "CcmaConverged"); ccmaConverged = new OpenCLArray<cl_int>(context, 1, "CcmaConverged", true);
ccmaReducedMass = new OpenCLArray<cl_float>(context, numCCMA, "CcmaReducedMass"); ccmaReducedMass = new OpenCLArray<cl_float>(context, numCCMA, "CcmaReducedMass");
ccmaConstraintMatrixColumn = new OpenCLArray<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn"); ccmaConstraintMatrixColumn = new OpenCLArray<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConstraintMatrixValue = new OpenCLArray<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue"); ccmaConstraintMatrixValue = new OpenCLArray<cl_float>(context, numCCMA*maxRowElements, "ConstraintMatrixValue");
...@@ -572,20 +572,17 @@ void OpenCLIntegrationUtilities::applyConstraints(double tol) { ...@@ -572,20 +572,17 @@ void OpenCLIntegrationUtilities::applyConstraints(double tol) {
ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer()); ccmaDirectionsKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer()); ccmaDirectionsKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer()); ccmaDirectionsKernel.setArg<cl::Buffer>(2, context.getPosq().getDeviceBuffer());
ccmaDirectionsKernel.setArg<cl::Buffer>(3, ccmaConverged->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(2, posDelta->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(2, posDelta->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(3, ccmaReducedMass->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(3, ccmaReducedMass->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(4, ccmaDelta1->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(4, ccmaDelta1->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(5, ccmaConverged->getDeviceBuffer()); ccmaForceKernel.setArg<cl::Buffer>(5, ccmaConverged->getDeviceBuffer());
ccmaForceKernel.setArg(6, sizeof(cl_int)*OpenCLContext::ThreadBlockSize, NULL);
ccmaMultiplyKernel.setArg<cl::Buffer>(0, ccmaDelta1->getDeviceBuffer()); ccmaMultiplyKernel.setArg<cl::Buffer>(0, ccmaDelta1->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(1, ccmaDelta2->getDeviceBuffer()); ccmaMultiplyKernel.setArg<cl::Buffer>(1, ccmaDelta2->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(2, ccmaConstraintMatrixColumn->getDeviceBuffer()); ccmaMultiplyKernel.setArg<cl::Buffer>(2, ccmaConstraintMatrixColumn->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(3, ccmaConstraintMatrixValue->getDeviceBuffer()); ccmaMultiplyKernel.setArg<cl::Buffer>(3, ccmaConstraintMatrixValue->getDeviceBuffer());
ccmaMultiplyKernel.setArg<cl::Buffer>(4, ccmaConverged->getDeviceBuffer()); ccmaMultiplyKernel.setArg<cl::Buffer>(4, ccmaConverged->getDeviceBuffer());
ccmaMultiplyKernel.setArg(5, sizeof(cl_int)*OpenCLContext::ThreadBlockSize, NULL);
ccmaUpdateKernel.setArg<cl::Buffer>(0, ccmaNumAtomConstraints->getDeviceBuffer()); ccmaUpdateKernel.setArg<cl::Buffer>(0, ccmaNumAtomConstraints->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(1, ccmaAtomConstraints->getDeviceBuffer()); ccmaUpdateKernel.setArg<cl::Buffer>(1, ccmaAtomConstraints->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(2, ccmaDistance->getDeviceBuffer()); ccmaUpdateKernel.setArg<cl::Buffer>(2, ccmaDistance->getDeviceBuffer());
...@@ -595,22 +592,21 @@ void OpenCLIntegrationUtilities::applyConstraints(double tol) { ...@@ -595,22 +592,21 @@ void OpenCLIntegrationUtilities::applyConstraints(double tol) {
ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer()); ccmaUpdateKernel.setArg<cl::Buffer>(6, ccmaDelta2->getDeviceBuffer());
ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer()); ccmaUpdateKernel.setArg<cl::Buffer>(7, ccmaConverged->getDeviceBuffer());
} }
ccmaForceKernel.setArg<cl_float>(7, (cl_float) tol); ccmaForceKernel.setArg<cl_float>(6, (cl_float) tol);
context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize()); context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
vector<cl_int> converged; const int checkInterval = 3;
for (int i = 0; i < 75; i++) { for (int i = 0; i < 150; i++) {
// To reduce the overhead of checking for convergence, perform two iterations if ((i+1)%checkInterval == 0) {
// each time through the loop. (*ccmaConverged)[0] = 1;
ccmaConverged->upload();
context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize()); }
context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize());
ccmaUpdateKernel.setArg<cl_int>(8, i);
context.executeKernel(ccmaUpdateKernel, context.getNumAtoms());
context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize()); context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize());
if ((i+1)%checkInterval == 0) {
ccmaConverged->download();
if ((*ccmaConverged)[0])
break;
}
context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize()); context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize());
ccmaConverged->download(converged);
if (converged[0])
break;
ccmaUpdateKernel.setArg<cl_int>(8, i); ccmaUpdateKernel.setArg<cl_int>(8, i);
context.executeKernel(ccmaUpdateKernel, context.getNumAtoms()); context.executeKernel(ccmaUpdateKernel, context.getNumAtoms());
} }
......
/** /**
* Compute the direction each constraint is pointing in. This is called once at the beginning of constraint evaluation. * Compute the direction each constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/ */
__kernel void computeConstraintDirections(__global int2* constraintAtoms, __global float4* constraintDistance, __global float4* atomPositions, __kernel void computeConstraintDirections(__global int2* constraintAtoms, __global float4* constraintDistance, __global float4* atomPositions) {
__global int* converged) {
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
// Compute the direction for this constraint. // Compute the direction for this constraint.
...@@ -15,23 +14,19 @@ __kernel void computeConstraintDirections(__global int2* constraintAtoms, __glob ...@@ -15,23 +14,19 @@ __kernel void computeConstraintDirections(__global int2* constraintAtoms, __glob
dir.z = oldPos1.z-oldPos2.z; dir.z = oldPos1.z-oldPos2.z;
constraintDistance[index] = dir; constraintDistance[index] = dir;
} }
// Mark that no work groups have converged yet.
for (int index = get_global_id(0); index < get_num_groups(0); index += get_global_size(0))
converged[index] = false;
} }
/** /**
* Compute the force applied by each constraint. * Compute the force applied by each constraint.
*/ */
__kernel void computeConstraintForce(__global int2* constraintAtoms, __global float4* constraintDistance, __global float4* atomPositions, __kernel void computeConstraintForce(__global int2* constraintAtoms, __global float4* constraintDistance, __global float4* atomPositions,
__global float* reducedMass, __global float* delta1, __global int* converged, __local int* convergedBuffer, float tol) { __global float* reducedMass, __global float* delta1, __global int* converged, float tol) {
if (converged[get_group_id(0)]) __local int groupConverged;
return; // The constraint iteration has already converged. if (get_local_id(0) == 0)
groupConverged = 1;
barrier(CLK_LOCAL_MEM_FENCE);
float lowerTol = 1.0f-2.0f*tol+tol*tol; float lowerTol = 1.0f-2.0f*tol+tol*tol;
float upperTol = 1.0f+2.0f*tol+tol*tol; float upperTol = 1.0f+2.0f*tol+tol*tol;
int threadConverged = 1;
for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_CONSTRAINTS; index += get_global_size(0)) {
// Compute the force due to this constraint. // Compute the force due to this constraint.
...@@ -48,40 +43,18 @@ __kernel void computeConstraintForce(__global int2* constraintAtoms, __global fl ...@@ -48,40 +43,18 @@ __kernel void computeConstraintForce(__global int2* constraintAtoms, __global fl
// See whether it has converged. // See whether it has converged.
threadConverged &= (rp2 >= lowerTol*dist2 && rp2 <= upperTol*dist2); if (groupConverged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2)) {
} groupConverged = 0;
converged[0] = 0;
// Perform a parallel reduction to see if all constraints handled by this work group have converged. }
convergedBuffer[get_local_id(0)] = threadConverged;
barrier(CLK_LOCAL_MEM_FENCE);
for (int step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)%(2*step) == 0)
convergedBuffer[get_local_id(0)] &= convergedBuffer[get_local_id(0)+step];
barrier(CLK_LOCAL_MEM_FENCE);
} }
if (get_local_id(0) == 0)
converged[get_group_id(0)] = convergedBuffer[0];
} }
/** /**
* Multiply the vector of constraint forces by the constraint matrix. * Multiply the vector of constraint forces by the constraint matrix.
*/ */
__kernel void multiplyByConstraintMatrix(__global float* delta1, __global float* delta2, __global int* constraintMatrixColumn, __kernel void multiplyByConstraintMatrix(__global float* delta1, __global float* delta2, __global int* constraintMatrixColumn,
__global float* constraintMatrixValue, __global int* converged, __local int* convergedBuffer) { __global float* constraintMatrixValue, __global int* converged) {
// First see whether all work groups have converged.
convergedBuffer[get_local_id(0)] = true;
for (int index = get_local_id(0); index < get_num_groups(0); index += get_local_size(0))
convergedBuffer[get_local_id(0)] &= converged[index];
barrier(CLK_LOCAL_MEM_FENCE);
for (int step = 1; step < get_local_size(0); step *= 2) {
if (get_local_id(0)%(2*step) == 0)
convergedBuffer[get_local_id(0)] &= convergedBuffer[get_local_id(0)+step];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (get_local_id(0) == 0)
converged[get_group_id(0)] = convergedBuffer[0];
if (converged[0]) if (converged[0])
return; // The constraint iteration has already converged. return; // The constraint iteration has already converged.
...@@ -105,7 +78,7 @@ __kernel void multiplyByConstraintMatrix(__global float* delta1, __global float* ...@@ -105,7 +78,7 @@ __kernel void multiplyByConstraintMatrix(__global float* delta1, __global float*
*/ */
__kernel void updateAtomPositions(__global int* numAtomConstraints, __global int* atomConstraints, __global float4* constraintDistance, __kernel void updateAtomPositions(__global int* numAtomConstraints, __global int* atomConstraints, __global float4* constraintDistance,
__global float4* atomPositions, __global float4* velm, __global float* delta1, __global float* delta2, __global int* converged, int iteration) { __global float4* atomPositions, __global float4* velm, __global float* delta1, __global float* delta2, __global int* converged, int iteration) {
if (converged[get_group_id(0)]) if (converged[0])
return; // The constraint iteration has already converged. return; // The constraint iteration has already converged.
float damping = (iteration < 2 ? 0.5f : 1.0f); float damping = (iteration < 2 ? 0.5f : 1.0f);
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment