Commit 6431f0ac authored by peastman
Browse files

Merge pull request #1056 from peastman/perdof

Optimizations to custom integrator
parents 637de3d3 c744444a
...@@ -149,6 +149,9 @@ void CustomIntegratorUtilities::analyzeComputations(const ContextImpl& context, ...@@ -149,6 +149,9 @@ void CustomIntegratorUtilities::analyzeComputations(const ContextImpl& context,
} }
} }
} }
for (int step = numSteps-2; step >= 0; step--)
if (forceGroup[step] == -2)
forceGroup[step] = forceGroup[step+1];
// Find the end point of each block. // Find the end point of each block.
......
...@@ -5898,12 +5898,12 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -5898,12 +5898,12 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
// Identify steps that can be merged into a single kernel. // Identify steps that can be merged into a single kernel.
for (int step = 1; step < numSteps; step++) { for (int step = 1; step < numSteps; step++) {
if (invalidatesForces[step] || ((needsForces[step] || needsEnergy[step]) && forceGroupFlags[step] != forceGroupFlags[step-1])) if ((needsForces[step] || needsEnergy[step]) && (invalidatesForces[step-1] || forceGroupFlags[step] != forceGroupFlags[step-1]))
continue; continue;
if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof) if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof)
merged[step] = true; merged[step] = true;
} }
for (int step = numSteps-1; step > 0; step--) for (int step = numSteps-1; step > 0; step--)
if (merged[step]) { if (merged[step]) {
needsForces[step-1] = (needsForces[step] || needsForces[step-1]); needsForces[step-1] = (needsForces[step] || needsForces[step-1]);
needsEnergy[step-1] = (needsEnergy[step] || needsEnergy[step-1]); needsEnergy[step-1] = (needsEnergy[step] || needsEnergy[step-1]);
...@@ -6179,7 +6179,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -6179,7 +6179,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
kernelArgs[step][0][10] = &uniformRandoms->getDevicePointer(); kernelArgs[step][0][10] = &uniformRandoms->getDevicePointer();
if (requiredUniform[step] > 0) if (requiredUniform[step] > 0)
cu.executeKernel(randomKernel, &randomArgs[0], numAtoms); cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms); cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms, 128);
} }
else if (stepType[step] == CustomIntegrator::ComputeGlobal) { else if (stepType[step] == CustomIntegrator::ComputeGlobal) {
expressionSet.setVariable(uniformVariableIndex, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber()); expressionSet.setVariable(uniformVariableIndex, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
...@@ -6196,7 +6196,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -6196,7 +6196,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
if (requiredUniform[step] > 0) if (requiredUniform[step] > 0)
cu.executeKernel(randomKernel, &randomArgs[0], numAtoms); cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
cu.clearBuffer(*sumBuffer); cu.clearBuffer(*sumBuffer);
cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms); cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms, 128);
cu.executeKernel(kernels[step][1], &kernelArgs[step][1][0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize); cu.executeKernel(kernels[step][1], &kernelArgs[step][1][0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double value; double value;
......
...@@ -6162,7 +6162,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context ...@@ -6162,7 +6162,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
// Identify steps that can be merged into a single kernel. // Identify steps that can be merged into a single kernel.
for (int step = 1; step < numSteps; step++) { for (int step = 1; step < numSteps; step++) {
if (invalidatesForces[step] || ((needsForces[step] || needsEnergy[step]) && forceGroupFlags[step] != forceGroupFlags[step-1])) if ((needsForces[step] || needsEnergy[step]) && (invalidatesForces[step-1] || forceGroupFlags[step] != forceGroupFlags[step-1]))
continue; continue;
if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof) if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof)
merged[step] = true; merged[step] = true;
...@@ -6441,7 +6441,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -6441,7 +6441,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
kernels[step][0].setArg<cl_float>(11, (cl_float) energy); kernels[step][0].setArg<cl_float>(11, (cl_float) energy);
if (requiredUniform[step] > 0) if (requiredUniform[step] > 0)
cl.executeKernel(randomKernel, numAtoms); cl.executeKernel(randomKernel, numAtoms);
cl.executeKernel(kernels[step][0], numAtoms); cl.executeKernel(kernels[step][0], numAtoms, 128);
} }
else if (stepType[step] == CustomIntegrator::ComputeGlobal) { else if (stepType[step] == CustomIntegrator::ComputeGlobal) {
expressionSet.setVariable(uniformVariableIndex, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber()); expressionSet.setVariable(uniformVariableIndex, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
...@@ -6460,7 +6460,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr ...@@ -6460,7 +6460,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
if (requiredUniform[step] > 0) if (requiredUniform[step] > 0)
cl.executeKernel(randomKernel, numAtoms); cl.executeKernel(randomKernel, numAtoms);
cl.clearBuffer(*sumBuffer); cl.clearBuffer(*sumBuffer);
cl.executeKernel(kernels[step][0], numAtoms); cl.executeKernel(kernels[step][0], numAtoms, 128);
cl.executeKernel(kernels[step][1], OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize); cl.executeKernel(kernels[step][1], OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize);
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) { if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
double value; double value;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment