Commit 6431f0ac authored by peastman
Browse files

Merge pull request #1056 from peastman/perdof

Optimizations to custom integrator
parents 637de3d3 c744444a
......@@ -149,6 +149,9 @@ void CustomIntegratorUtilities::analyzeComputations(const ContextImpl& context,
}
}
}
for (int step = numSteps-2; step >= 0; step--)
if (forceGroup[step] == -2)
forceGroup[step] = forceGroup[step+1];
// Find the end point of each block.
......
......@@ -5898,7 +5898,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
// Identify steps that can be merged into a single kernel.
for (int step = 1; step < numSteps; step++) {
if (invalidatesForces[step] || ((needsForces[step] || needsEnergy[step]) && forceGroupFlags[step] != forceGroupFlags[step-1]))
if ((needsForces[step] || needsEnergy[step]) && (invalidatesForces[step-1] || forceGroupFlags[step] != forceGroupFlags[step-1]))
continue;
if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof)
merged[step] = true;
......@@ -6179,7 +6179,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
kernelArgs[step][0][10] = &uniformRandoms->getDevicePointer();
if (requiredUniform[step] > 0)
cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms);
cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms, 128);
}
else if (stepType[step] == CustomIntegrator::ComputeGlobal) {
expressionSet.setVariable(uniformVariableIndex, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
......@@ -6196,7 +6196,7 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
if (requiredUniform[step] > 0)
cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
cu.clearBuffer(*sumBuffer);
cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms);
cu.executeKernel(kernels[step][0], &kernelArgs[step][0][0], numAtoms, 128);
cu.executeKernel(kernels[step][1], &kernelArgs[step][1][0], CudaContext::ThreadBlockSize, CudaContext::ThreadBlockSize);
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double value;
......
......@@ -6162,7 +6162,7 @@ void OpenCLIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context
// Identify steps that can be merged into a single kernel.
for (int step = 1; step < numSteps; step++) {
if (invalidatesForces[step] || ((needsForces[step] || needsEnergy[step]) && forceGroupFlags[step] != forceGroupFlags[step-1]))
if ((needsForces[step] || needsEnergy[step]) && (invalidatesForces[step-1] || forceGroupFlags[step] != forceGroupFlags[step-1]))
continue;
if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof)
merged[step] = true;
......@@ -6441,7 +6441,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
kernels[step][0].setArg<cl_float>(11, (cl_float) energy);
if (requiredUniform[step] > 0)
cl.executeKernel(randomKernel, numAtoms);
cl.executeKernel(kernels[step][0], numAtoms);
cl.executeKernel(kernels[step][0], numAtoms, 128);
}
else if (stepType[step] == CustomIntegrator::ComputeGlobal) {
expressionSet.setVariable(uniformVariableIndex, SimTKOpenMMUtilities::getUniformlyDistributedRandomNumber());
......@@ -6460,7 +6460,7 @@ void OpenCLIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegr
if (requiredUniform[step] > 0)
cl.executeKernel(randomKernel, numAtoms);
cl.clearBuffer(*sumBuffer);
cl.executeKernel(kernels[step][0], numAtoms);
cl.executeKernel(kernels[step][0], numAtoms, 128);
cl.executeKernel(kernels[step][1], OpenCLContext::ThreadBlockSize, OpenCLContext::ThreadBlockSize);
if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
double value;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment