Unverified Commit 4ab645ea authored by Evan Pretti's avatar Evan Pretti Committed by GitHub
Browse files

GPU implementation of L-BFGS (#5198)

* Make reference/CPU minimizer into a kernel

* Add per-platform support for GPU minimization

* Initial implementation of GPU minimization

* Fixes

* Increase robustness when initial gradient is huge

* Handle overflow leading to non-finite values gracefully

* Handle large forces in single precision more robustly

* Optimize kernels

* Fix kernel launch size

* Update banner years

* Don't create MinimizeKernel until first minimization requested

* Make some compile-time constants into kernel arguments

* Consolidate scale calculation kernel

* Condense alpha/beta reduction kernels using atomics

* Condense line search dot kernels with reductions

* Remove a download, and download grad norm separately

* Asynchronously check lbfgs convergence condition

* Restructure line search to avoid download waiting

* Start line search preemptively in case CPU evaluation is not needed

* In rare cases, constraint error might not decrease after one optimization round

* Better handling of unsupported 64-bit atomics, use FLT_MAX

* Pick gradient mode based on GPU vs. CPU evaluation

* Rework getDiff/getScale reduction, remove reduceBuffer

* Older CUDA might not like float hex literals

* Fix error in a comment
parent 834b1294
......@@ -455,10 +455,13 @@ int lbfgs(
goto lbfgs_exit;
}
/* Compute the initial step:
step = 1.0 / sqrt(vecdot(d, d, n))
/*
Normalize the initial steepest descent direction and set the initial
step to 1 to avoid underflow/overflow issues with large gradients.
*/
vec2norminv(&step, d, n);
vecscale(d, step, n);
step = 1.0;
k = 1;
end = 0;
......
......@@ -7,7 +7,7 @@
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2008-2025 Stanford University and the Authors. *
* Portions copyright (c) 2008-2026 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -55,6 +55,7 @@
#include "openmm/HarmonicBondForce.h"
#include "openmm/KernelImpl.h"
#include "openmm/LCPOForce.h"
#include "openmm/LocalEnergyMinimizer.h"
#include "openmm/MonteCarloBarostat.h"
#include "openmm/OrientationRestraintForce.h"
#include "openmm/PeriodicTorsionForce.h"
......@@ -300,6 +301,33 @@ public:
virtual void computePositions(ContextImpl& context) = 0;
};
/**
* This kernel performs local energy minimization.
*/
class MinimizeKernel : public KernelImpl {
public:
static std::string Name() {
return "Minimize";
}
MinimizeKernel(std::string name, const Platform& platform) : KernelImpl(name, platform) {
}
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
*/
virtual void initialize(const System& system) = 0;
/**
* Perform local energy minimization.
*
* @param context the context with which to perform the minimization
* @param tolerance limiting root-mean-square value of all force components in kJ/mol/nm for convergence
* @param maxIterations the maximum number of iterations to perform, or 0 to continue until convergence
* @param reporter an optional reporter to invoke after each iteration of minimization
*/
virtual void execute(ContextImpl& context, double tolerance, int maxIterations, MinimizationReporter* reporter) = 0;
};
/**
* This kernel is invoked by HarmonicBondForce to calculate the forces acting on the system and the energy of the system.
*/
......
......@@ -7,7 +7,7 @@
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2008-2021 Stanford University and the Authors. *
* Portions copyright (c) 2008-2026 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -287,6 +287,7 @@ private:
friend class Force;
friend class ForceImpl;
friend class Platform;
friend class LocalEnergyMinimizer;
Context(const System& system, Integrator& integrator, ContextImpl& linked);
ContextImpl& getImpl();
const ContextImpl& getImpl() const;
......
......@@ -7,7 +7,7 @@
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2008-2022 Stanford University and the Authors. *
* Portions copyright (c) 2008-2026 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -31,6 +31,7 @@
* -------------------------------------------------------------------------- */
#include "openmm/Kernel.h"
#include "openmm/LocalEnergyMinimizer.h"
#include "openmm/Platform.h"
#include "openmm/Vec3.h"
#include <iosfwd>
......@@ -295,6 +296,15 @@ public:
* means you shouldn't.
*/
Context* createLinkedContext(const System& system, Integrator& integrator);
/**
* Run local energy minimization on the Context. See LocalEnergyMinimizer
* for details of the parameters.
*
* @param tolerance how precisely the energy minimum must be located
* @param maxIterations the maximum number of iterations to perform
* @param reporter an optional MinimizationReporter to invoke after each iteration
*/
void minimize(double tolerance, int maxIterations, MinimizationReporter* reporter);
private:
friend class Context;
void initialize();
......@@ -304,10 +314,10 @@ private:
std::vector<ForceImpl*> forceImpls;
std::map<std::string, double> parameters;
mutable std::vector<std::vector<int> > molecules;
bool hasInitializedForces, hasSetPositions, integratorIsDeleted;
bool hasInitializedForces, hasSetPositions, integratorIsDeleted, hasMinimizeKernel;
int lastForceGroups;
Platform* platform;
Kernel initializeForcesKernel, updateStateDataKernel, applyConstraintsKernel, virtualSitesKernel;
Kernel initializeForcesKernel, updateStateDataKernel, applyConstraintsKernel, virtualSitesKernel, minimizeKernel;
void* platformData;
};
......
......@@ -4,7 +4,7 @@
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2008-2022 Stanford University and the Authors. *
* Portions copyright (c) 2008-2026 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -52,7 +52,7 @@ const static char CHECKPOINT_MAGIC_BYTES[] = "OpenMM Binary Checkpoint\n";
ContextImpl::ContextImpl(Context& owner, const System& system, Integrator& integrator, Platform* platform, const map<string, string>& properties, ContextImpl* originalContext) :
owner(owner), system(system), integrator(integrator), hasInitializedForces(false), hasSetPositions(false), integratorIsDeleted(false),
owner(owner), system(system), integrator(integrator), hasInitializedForces(false), hasSetPositions(false), integratorIsDeleted(false), hasMinimizeKernel(false),
lastForceGroups(-1), platform(platform), platformData(NULL) {
int numParticles = system.getNumParticles();
if (numParticles == 0)
......@@ -113,6 +113,7 @@ ContextImpl::ContextImpl(Context& owner, const System& system, Integrator& integ
kernelNames.push_back(UpdateStateDataKernel::Name());
kernelNames.push_back(ApplyConstraintsKernel::Name());
kernelNames.push_back(VirtualSitesKernel::Name());
kernelNames.push_back(MinimizeKernel::Name());
for (int i = 0; i < system.getNumForces(); ++i) {
forceImpls.push_back(system.getForce(i).createImpl());
vector<string> forceKernels = forceImpls[forceImpls.size()-1]->getKernelNames();
......@@ -198,6 +199,7 @@ ContextImpl::~ContextImpl() {
updateStateDataKernel = Kernel();
applyConstraintsKernel = Kernel();
virtualSitesKernel = Kernel();
minimizeKernel = Kernel();
if (!integratorIsDeleted) {
// The Context is being deleted before the Integrator, so call cleanup() on it now.
......@@ -507,3 +509,12 @@ void ContextImpl::systemChanged() {
Context* ContextImpl::createLinkedContext(const System& system, Integrator& integrator) {
return new Context(system, integrator, *this);
}
void ContextImpl::minimize(double tolerance, int maxIterations, MinimizationReporter* reporter) {
if (!hasMinimizeKernel) {
minimizeKernel = platform->createKernel(MinimizeKernel::Name(), *this);
minimizeKernel.getAs<MinimizeKernel>().initialize(system);
hasMinimizeKernel = true;
}
minimizeKernel.getAs<MinimizeKernel>().execute(*this, tolerance, maxIterations, reporter);
}
......@@ -4,7 +4,7 @@
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2010-2020 Stanford University and the Authors. *
* Portions copyright (c) 2010-2026 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -28,264 +28,11 @@
* -------------------------------------------------------------------------- */
#include "openmm/LocalEnergyMinimizer.h"
#include "openmm/OpenMMException.h"
#include "openmm/Platform.h"
#include "openmm/VerletIntegrator.h"
#include "lbfgs.h"
#include <cmath>
#include <sstream>
#include <string>
#include <vector>
#include <algorithm>
#include "openmm/internal/ContextImpl.h"
using namespace OpenMM;
using namespace std;
struct MinimizerData {
Context& context;
double k;
MinimizationReporter* reporter;
bool checkLargeForces;
VerletIntegrator cpuIntegrator;
Context* cpuContext;
MinimizerData(Context& context, double k, MinimizationReporter* reporter) :
context(context), k(k), reporter(reporter), cpuIntegrator(1.0), cpuContext(NULL) {
string platformName = context.getPlatform().getName();
checkLargeForces = (platformName == "CUDA" || platformName == "OpenCL" || platformName == "HIP" || platformName == "Metal");
}
~MinimizerData() {
if (cpuContext != NULL)
delete cpuContext;
}
Context& getCpuContext() {
// Get an alternate context that runs on the CPU and doesn't place any limits
// on the magnitude of forces.
if (cpuContext == NULL) {
Platform* cpuPlatform;
try {
cpuPlatform = &Platform::getPlatformByName("CPU");
}
catch (...) {
cpuPlatform = &Platform::getPlatformByName("Reference");
}
cpuContext = new Context(context.getSystem(), cpuIntegrator, *cpuPlatform);
cpuContext->setState(context.getState(State::Positions | State::Velocities | State::Parameters));
}
return *cpuContext;
}
};
static double computeForcesAndEnergy(Context& context, const vector<Vec3>& positions, lbfgsfloatval_t *g) {
context.setPositions(positions);
context.computeVirtualSites();
State state = context.getState(State::Forces | State::Energy, false, context.getIntegrator().getIntegrationForceGroups());
const vector<Vec3>& forces = state.getForces();
const System& system = context.getSystem();
for (int i = 0; i < forces.size(); i++) {
if (system.getParticleMass(i) == 0) {
g[3*i] = 0.0;
g[3*i+1] = 0.0;
g[3*i+2] = 0.0;
}
else {
g[3*i] = -forces[i][0];
g[3*i+1] = -forces[i][1];
g[3*i+2] = -forces[i][2];
}
}
return state.getPotentialEnergy();
}
static lbfgsfloatval_t evaluate(void *instance, const lbfgsfloatval_t *x, lbfgsfloatval_t *g, const int n, const lbfgsfloatval_t step) {
MinimizerData* data = reinterpret_cast<MinimizerData*>(instance);
Context& context = data->context;
const System& system = context.getSystem();
int numParticles = system.getNumParticles();
// Compute the force and energy for this configuration.
vector<Vec3> positions(numParticles);
for (int i = 0; i < numParticles; i++)
positions[i] = Vec3(x[3*i], x[3*i+1], x[3*i+2]);
double energy = computeForcesAndEnergy(context, positions, g);
if (data->checkLargeForces) {
// The CUDA, OpenCL and HIP platforms accumulate forces in fixed point, so they
// can't handle very large forces. Check for problematic forces (very large,
// infinite, or NaN) and if necessary recompute them on the CPU.
for (int i = 0; i < 3*numParticles; i++) {
if (!(fabs(g[i]) < 2e9)) {
energy = computeForcesAndEnergy(data->getCpuContext(), positions, g);
break;
}
}
}
// Add harmonic forces for any constraints.
int numConstraints = system.getNumConstraints();
double k = data->k;
for (int i = 0; i < numConstraints; i++) {
int particle1, particle2;
double distance;
system.getConstraintParameters(i, particle1, particle2, distance);
Vec3 delta = positions[particle2]-positions[particle1];
double r2 = delta.dot(delta);
double r = sqrt(r2);
delta *= 1/r;
double dr = r-distance;
double kdr = k*dr;
energy += 0.5*kdr*dr;
if (system.getParticleMass(particle1) != 0) {
g[3*particle1] -= kdr*delta[0];
g[3*particle1+1] -= kdr*delta[1];
g[3*particle1+2] -= kdr*delta[2];
}
if (system.getParticleMass(particle2) != 0) {
g[3*particle2] += kdr*delta[0];
g[3*particle2+1] += kdr*delta[1];
g[3*particle2+2] += kdr*delta[2];
}
}
return energy;
}
static int report(void *instance, const lbfgsfloatval_t *x, const lbfgsfloatval_t *g, const lbfgsfloatval_t fx,
const lbfgsfloatval_t xnorm, const lbfgsfloatval_t gnorm, const lbfgsfloatval_t step, int n, int iteration, int ls) {
// Copy over the positions and gradients.
vector<double> xout(n), gradout(n);
for (int i = 0; i < n; i++) {
xout[i] = x[i];
gradout[i] = g[i];
}
// Compute the other arguments passed to the reporter.
MinimizerData* data = reinterpret_cast<MinimizerData*>(instance);
Context& context = data->context;
const System& system = context.getSystem();
double restraintEnergy = 0.0, maxError = 0.0;
double k = data->k;
for (int i = 0; i < system.getNumConstraints(); i++) {
int p1, p2;
double distance;
system.getConstraintParameters(i, p1, p2, distance);
Vec3 delta(x[3*p1]-x[3*p2], x[3*p1+1]-x[3*p2+1], x[3*p1+2]-x[3*p2+2]);
double r2 = delta.dot(delta);
double r = sqrt(r2);
double dr = r-distance;
restraintEnergy += 0.5*k*dr*dr;
maxError = max(maxError, fabs(dr)/distance);
}
map<string, double> args;
args["restraint energy"] = restraintEnergy;
args["system energy"] = fx-restraintEnergy;
args["restraint strength"] = k;
args["max constraint error"] = maxError;
// Invoke the reporter.
MinimizationReporter* reporter = reinterpret_cast<MinimizationReporter*>(data->reporter);
if (reporter->report(iteration-1, xout, gradout, args))
return 1;
return 0;
}
void LocalEnergyMinimizer::minimize(Context& context, double tolerance, int maxIterations, MinimizationReporter* reporter) {
const System& system = context.getSystem();
int numParticles = system.getNumParticles();
double constraintTol = context.getIntegrator().getConstraintTolerance();
double workingConstraintTol = std::max(1e-4, constraintTol);
double k = 100/workingConstraintTol;
lbfgsfloatval_t *x = lbfgs_malloc(numParticles*3);
if (x == NULL)
throw OpenMMException("LocalEnergyMinimizer: Failed to allocate memory");
try {
// Initialize the minimizer.
lbfgs_parameter_t param;
lbfgs_parameter_init(&param);
if (!context.getPlatform().supportsDoublePrecision())
param.xtol = 1e-7;
param.max_iterations = maxIterations;
param.linesearch = LBFGS_LINESEARCH_BACKTRACKING_STRONG_WOLFE;
// Make sure the initial configuration satisfies all constraints.
context.applyConstraints(workingConstraintTol);
// Record the initial positions and determine a normalization constant for scaling the tolerance.
vector<Vec3> initialPos = context.getState(State::Positions).getPositions();
double norm = 0.0;
for (int i = 0; i < numParticles; i++) {
x[3*i] = initialPos[i][0];
x[3*i+1] = initialPos[i][1];
x[3*i+2] = initialPos[i][2];
norm += initialPos[i].dot(initialPos[i]);
}
norm /= numParticles;
norm = (norm < 1 ? 1 : sqrt(norm));
param.epsilon = tolerance/norm;
// Repeatedly minimize, steadily increasing the strength of the springs until all constraints are satisfied.
double prevMaxError = 1e10;
MinimizerData data(context, k, reporter);
while (true) {
// Perform the minimization.
lbfgsfloatval_t fx;
lbfgs_progress_t reportFn = (reporter == NULL ? NULL : report);
lbfgs(numParticles*3, x, &fx, evaluate, reportFn, &data, &param);
// Check whether all constraints are satisfied.
vector<Vec3> positions = context.getState(State::Positions).getPositions();
int numConstraints = system.getNumConstraints();
double maxError = 0.0;
for (int i = 0; i < numConstraints; i++) {
int particle1, particle2;
double distance;
system.getConstraintParameters(i, particle1, particle2, distance);
Vec3 delta = positions[particle2]-positions[particle1];
double r = sqrt(delta.dot(delta));
double error = fabs(r-distance)/distance;
if (error > maxError)
maxError = error;
}
if (maxError <= workingConstraintTol)
break; // All constraints are satisfied.
context.setPositions(initialPos);
if (maxError >= prevMaxError)
break; // Further tightening the springs doesn't seem to be helping, so just give up.
prevMaxError = maxError;
data.k *= 10;
if (maxError > 100*workingConstraintTol) {
// We've gotten far enough from a valid state that we might have trouble getting
// back, so reset to the original positions.
for (int i = 0; i < numParticles; i++) {
x[3*i] = initialPos[i][0];
x[3*i+1] = initialPos[i][1];
x[3*i+2] = initialPos[i][2];
}
}
}
}
catch (...) {
lbfgs_free(x);
throw;
}
lbfgs_free(x);
// If necessary, do a final constraint projection to make sure they are satisfied
// to the full precision requested by the user.
if (constraintTol < workingConstraintTol)
context.applyConstraints(workingConstraintTol);
context.getImpl().minimize(tolerance, maxIterations, reporter);
}
#ifndef OPENMM_COMMONMINIMIZEKERNEL_H_
#define OPENMM_COMMONMINIMIZEKERNEL_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2026 Stanford University and the Authors. *
* Authors: Evan Pretti *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "openmm/LocalEnergyMinimizer.h"
#include "openmm/kernels.h"
#include "openmm/common/ComputeContext.h"
namespace OpenMM {
/**
* This kernel performs local energy minimization.
*/
class CommonMinimizeKernel : public MinimizeKernel {
public:
CommonMinimizeKernel(std::string name, const Platform& platform, ComputeContext& cc) : MinimizeKernel(name, platform), cc(cc), isSetup(false), cpuContext(NULL), cpuIntegrator(1) {
}
~CommonMinimizeKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
*/
void initialize(const System& system);
/**
* Perform local energy minimization.
*
* @param context the context with which to perform the minimization
* @param tolerance limiting root-mean-square value of all force components in kJ/mol/nm for convergence
* @param maxIterations the maximum number of iterations to perform, or 0 to continue until convergence
* @param reporter an optional reporter to invoke after each iteration of minimization
*/
void execute(ContextImpl& context, double tolerance, int maxIterations, MinimizationReporter* reporter);
private:
static const double minConstraintTol, kRestraintScale, prevMaxErrorInit, kRestraintScaleUp, constraintTolScale;
static const double fTol, wolfeParam, stepScaleDown, stepScaleUp, minStep, maxStep;
static const int numVectors, maxLineSearchIterations;
void setup(ContextImpl& context);
void lbfgs(ContextImpl& context);
void evaluateGpu(ContextImpl& context);
double evaluateCpu(ContextImpl& context);
bool report(ContextImpl& context, int iteration);
void downloadReturnFlagStart();
void downloadReturnValueStart();
int downloadReturnFlagFinish();
double downloadReturnValueFinish();
double downloadReturnValueSync();
double downloadGradNormSync();
void runLineSearchKernels();
ComputeContext& cc;
int numParticles, numVariables, numConstraints;
std::vector<Vec3> hostPositions;
std::vector<double> hostX;
std::vector<double> hostGrad;
std::vector<mm_int2> hostConstraintIndices;
std::vector<double> hostConstraintDistances;
bool isSetup, mixedIsDouble;
int elementSize, threadBlockSize;
void* pinnedMemory;
int forceGroups;
double constraintTol;
double tolerance;
int maxIterations;
MinimizationReporter* reporter;
double kRestraint, energy;
bool largeGrad;
ComputeArray constraintIndices, constraintDistances;
ComputeArray xInit, x, xPrev, grad, gradPrev, dir;
ComputeArray alpha, scale, xDiff, gradDiff;
ComputeArray returnFlag, returnValue, gradNorm, lineSearchData, lineSearchDataBackup;
ComputeKernel recordInitialPosKernel;
ComputeKernel restorePosKernel;
ComputeKernel convertForcesKernel;
ComputeKernel getConstraintEnergyForcesKernel;
ComputeKernel getConstraintErrorKernel;
ComputeKernel initializeDirKernel;
ComputeKernel gradNormKernel;
ComputeKernel getDiffKernel;
ComputeKernel getScaleKernel;
ComputeKernel reinitializeDirKernel;
ComputeKernel updateDirAlphaKernel;
ComputeKernel scaleDirKernel;
ComputeKernel updateDirBetaKernel;
ComputeKernel updateDirFinalKernel;
ComputeKernel lineSearchSetupKernel;
ComputeKernel lineSearchStepKernel;
ComputeKernel lineSearchDotKernel;
ComputeKernel lineSearchContinueKernel;
ComputeEvent downloadStartEvent;
ComputeEvent downloadFinishEvent;
ComputeQueue downloadQueue;
Context* cpuContext;
VerletIntegrator cpuIntegrator;
};
} // namespace OpenMM
#endif // OPENMM_COMMONMINIMIZEKERNEL_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2026 Stanford University and the Authors. *
* Authors: Evan Pretti *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "openmm/common/CommonMinimizeKernel.h"
#include "openmm/common/ContextSelector.h"
#include "CommonKernelSources.h"
#include <map>
using namespace OpenMM;
using namespace std;
// Constants for constraint handling.
const double CommonMinimizeKernel::minConstraintTol = 1e-4;
const double CommonMinimizeKernel::kRestraintScale = 100;
const double CommonMinimizeKernel::prevMaxErrorInit = 1e10;
const double CommonMinimizeKernel::kRestraintScaleUp = 10;
const double CommonMinimizeKernel::constraintTolScale = 100;
// Constants for L-BFGS.
const double CommonMinimizeKernel::fTol = 1e-4;
const double CommonMinimizeKernel::wolfeParam = 0.9;
const double CommonMinimizeKernel::stepScaleDown = 0.5;
const double CommonMinimizeKernel::stepScaleUp = 2.1;
const double CommonMinimizeKernel::minStep = 1e-20;
const double CommonMinimizeKernel::maxStep = 1e20;
const int CommonMinimizeKernel::numVectors = 6;
const int CommonMinimizeKernel::maxLineSearchIterations = 40;
CommonMinimizeKernel::~CommonMinimizeKernel() {
if (cpuContext != NULL) {
delete cpuContext;
}
}
void CommonMinimizeKernel::initialize(const System& system) {
numParticles = system.getNumParticles();
numVariables = numParticles * 3;
numConstraints = system.getNumConstraints();
hostPositions.resize(numParticles);
hostX.resize(numVariables);
hostGrad.resize(numVariables);
hostConstraintIndices.resize(numConstraints);
hostConstraintDistances.resize(numConstraints);
for (int i = 0; i < numConstraints; i++) {
system.getConstraintParameters(i, hostConstraintIndices[i].x, hostConstraintIndices[i].y, hostConstraintDistances[i]);
}
}
void CommonMinimizeKernel::execute(ContextImpl& context, double tolerance, int maxIterations, MinimizationReporter* reporter) {
ContextSelector selector(cc);
if (!isSetup) {
// Load system and integrator information.
mixedIsDouble = cc.getUseMixedPrecision() || cc.getUseDoublePrecision();
if (mixedIsDouble && !cc.getSupports64BitGlobalAtomics()) {
throw OpenMMException("Double precision is not supported on devices that do not support 64 bit atomic operations");
}
elementSize = mixedIsDouble ? sizeof(double) : sizeof(float);
threadBlockSize = cc.getMaxThreadBlockSize();
pinnedMemory = cc.getPinnedBuffer();
// Initialize all device-side arrays and compile kernels.
setup(context);
isSetup = true;
}
const Integrator& integrator = context.getIntegrator();
forceGroups = integrator.getIntegrationForceGroups();
constraintTol = integrator.getConstraintTolerance();
this->tolerance = tolerance * sqrt((double) numParticles);
this->maxIterations = maxIterations;
this->reporter = reporter;
if (mixedIsDouble) {
getDiffKernel->setArg(11, this->tolerance);
}
else {
getDiffKernel->setArg(11, (float) this->tolerance);
}
double workingConstraintTol = max(minConstraintTol, constraintTol);
kRestraint = kRestraintScale / workingConstraintTol;
context.applyConstraints(workingConstraintTol);
recordInitialPosKernel->execute(numParticles);
double prevMaxError1 = prevMaxErrorInit, prevMaxError2 = prevMaxErrorInit;
while (true) {
lbfgs(context);
if (!numConstraints) {
// There are no constraints, so we are finished.
break;
}
getConstraintErrorKernel->execute(threadBlockSize, threadBlockSize);
double maxError = downloadReturnValueSync();
if (maxError <= workingConstraintTol) {
// All constraints are satisfied.
break;
}
restorePosKernel->setArg(2, xInit);
restorePosKernel->execute(numParticles);
if (maxError >= prevMaxError2) {
// Further tightening the springs doesn't seem to be helping, so just give up.
break;
}
prevMaxError2 = prevMaxError1;
prevMaxError1 = maxError;
kRestraint *= kRestraintScaleUp;
if (maxError > constraintTolScale * workingConstraintTol) {
// We've gotten far enough from a valid state that we might have
// trouble getting back, so reset to the original positions.
xInit.copyTo(x);
}
}
if (constraintTol < workingConstraintTol) {
context.applyConstraints(constraintTol);
}
}
void CommonMinimizeKernel::setup(ContextImpl& context) {
// Initialize arrays.
if (numConstraints) {
constraintIndices.initialize<mm_int2>(cc, numConstraints, "constraintIndices");
constraintDistances.initialize(cc, numConstraints, elementSize, "constraintDistances");
}
xInit.initialize(cc, numVariables, elementSize, "xInit");
x.initialize(cc, numVariables, elementSize, "x");
xPrev.initialize(cc, numVariables, elementSize, "xPrev");
grad.initialize(cc, numVariables, elementSize, "grad");
gradPrev.initialize(cc, numVariables, elementSize, "gradPrev");
dir.initialize(cc, numVariables, elementSize, "dir");
alpha.initialize(cc, numVectors + 1, elementSize, "alpha");
scale.initialize(cc, numVectors, elementSize, "scale");
xDiff.initialize(cc, numVectors * numVariables, elementSize, "xDiff");
gradDiff.initialize(cc, numVectors * numVariables, elementSize, "gradDiff");
returnFlag.initialize<int>(cc, 1, "returnFlag");
returnValue.initialize(cc, 1, elementSize, "returnValue");
gradNorm.initialize(cc, 1, elementSize, "gradNorm");
lineSearchData.initialize(cc, 4, elementSize, "lineSearchData");
lineSearchDataBackup.initialize(cc, 4, elementSize, "lineSearchDataBackup");
// Compile kernels and set arguments.
map<string, string> defines;
defines["THREAD_BLOCK_SIZE"] = cc.intToString(threadBlockSize);
defines["NUM_VECTORS"] = cc.intToString(numVectors);
defines["LBFGS_FTOL"] = cc.doubleToString(fTol);
defines["LBFGS_WOLFE"] = cc.doubleToString(wolfeParam);
defines["LBFGS_SCALE_DOWN"] = cc.doubleToString(stepScaleDown);
defines["LBFGS_SCALE_UP"] = cc.doubleToString(stepScaleUp);
defines["LBFGS_MIN_STEP"] = cc.doubleToString(minStep);
defines["LBFGS_MAX_STEP"] = cc.doubleToString(maxStep);
ComputeProgram program = cc.compileProgram(CommonKernelSources::minimize, defines);
recordInitialPosKernel = program->createKernel("recordInitialPos");
recordInitialPosKernel->addArg(cc.getPosq());
recordInitialPosKernel->addArg(cc.getAtomIndexArray());
recordInitialPosKernel->addArg(xInit);
recordInitialPosKernel->addArg(x);
recordInitialPosKernel->addArg(numParticles);
if (cc.getUseMixedPrecision()) {
recordInitialPosKernel->addArg(cc.getPosqCorrection());
}
restorePosKernel = program->createKernel("restorePos");
restorePosKernel->addArg(cc.getPosq());
restorePosKernel->addArg(cc.getAtomIndexArray());
restorePosKernel->addArg(); // x (could also be launched with xInit)
restorePosKernel->addArg(returnValue);
restorePosKernel->addArg(numParticles);
if (cc.getUseMixedPrecision()) {
restorePosKernel->addArg(cc.getPosqCorrection());
}
convertForcesKernel = program->createKernel("convertForces");
convertForcesKernel->addArg(cc.getVelm());
convertForcesKernel->addArg(cc.getLongForceBuffer());
convertForcesKernel->addArg(cc.getAtomIndexArray());
convertForcesKernel->addArg(grad);
convertForcesKernel->addArg(returnValue);
convertForcesKernel->addArg(numParticles);
convertForcesKernel->addArg(cc.getPaddedNumAtoms());
if (numConstraints) {
getConstraintEnergyForcesKernel = program->createKernel("getConstraintEnergyForces");
getConstraintEnergyForcesKernel->addArg(cc.getLongForceBuffer());
getConstraintEnergyForcesKernel->addArg(cc.getAtomIndexArray());
getConstraintEnergyForcesKernel->addArg(constraintIndices);
getConstraintEnergyForcesKernel->addArg(constraintDistances);
getConstraintEnergyForcesKernel->addArg(x);
getConstraintEnergyForcesKernel->addArg(returnValue);
getConstraintEnergyForcesKernel->addArg(cc.getPaddedNumAtoms());
getConstraintEnergyForcesKernel->addArg(numConstraints);
getConstraintEnergyForcesKernel->addArg(); // kRestraint
getConstraintErrorKernel = program->createKernel("getConstraintError");
getConstraintErrorKernel->addArg(constraintIndices);
getConstraintErrorKernel->addArg(constraintDistances);
getConstraintErrorKernel->addArg(x);
getConstraintErrorKernel->addArg(returnValue);
getConstraintErrorKernel->addArg(numConstraints);
}
initializeDirKernel = program->createKernel("initializeDir");
initializeDirKernel->addArg(grad);
initializeDirKernel->addArg(dir);
initializeDirKernel->addArg(gradNorm);
initializeDirKernel->addArg(lineSearchData);
initializeDirKernel->addArg(numVariables);
gradNormKernel = program->createKernel("gradNorm");
gradNormKernel->addArg(grad);
gradNormKernel->addArg(gradNorm);
gradNormKernel->addArg(numVariables);
getDiffKernel = program->createKernel("getDiff");
getDiffKernel->addArg(x);
getDiffKernel->addArg(xPrev);
getDiffKernel->addArg(grad);
getDiffKernel->addArg(gradPrev);
getDiffKernel->addArg(scale);
getDiffKernel->addArg(xDiff);
getDiffKernel->addArg(gradDiff);
getDiffKernel->addArg(returnFlag);
getDiffKernel->addArg(returnValue);
getDiffKernel->addArg(gradNorm);
getDiffKernel->addArg(numVariables);
getDiffKernel->addArg(); // tolerance
getDiffKernel->addArg(); // end
getDiffKernel->addArg(); // largeGrad
getScaleKernel = program->createKernel("getScale");
getScaleKernel->addArg(alpha);
getScaleKernel->addArg(scale);
getScaleKernel->addArg(xDiff);
getScaleKernel->addArg(gradDiff);
getScaleKernel->addArg(returnFlag);
getScaleKernel->addArg(returnValue);
getScaleKernel->addArg(numVariables);
getScaleKernel->addArg(); // end
getScaleKernel->addArg(); // largeGrad
reinitializeDirKernel = program->createKernel("reinitializeDir");
reinitializeDirKernel->addArg(grad);
reinitializeDirKernel->addArg(dir);
reinitializeDirKernel->addArg(alpha);
reinitializeDirKernel->addArg(scale);
reinitializeDirKernel->addArg(xDiff);
reinitializeDirKernel->addArg(returnFlag);
reinitializeDirKernel->addArg(returnValue);
reinitializeDirKernel->addArg(numVariables);
reinitializeDirKernel->addArg(); // vectorIndex
reinitializeDirKernel->addArg(); // largeGrad
updateDirAlphaKernel = program->createKernel("updateDirAlpha");
updateDirAlphaKernel->addArg(dir);
updateDirAlphaKernel->addArg(alpha);
updateDirAlphaKernel->addArg(scale);
updateDirAlphaKernel->addArg(xDiff);
updateDirAlphaKernel->addArg(gradDiff);
updateDirAlphaKernel->addArg(returnFlag);
updateDirAlphaKernel->addArg(numVariables);
updateDirAlphaKernel->addArg(); // vectorIndex
scaleDirKernel = program->createKernel("scaleDir");
scaleDirKernel->addArg(dir);
scaleDirKernel->addArg(alpha);
scaleDirKernel->addArg(scale);
scaleDirKernel->addArg(gradDiff);
scaleDirKernel->addArg(returnFlag);
scaleDirKernel->addArg(returnValue);
scaleDirKernel->addArg(numVariables);
scaleDirKernel->addArg(); // vectorIndex
updateDirBetaKernel = program->createKernel("updateDirBeta");
updateDirBetaKernel->addArg(dir);
updateDirBetaKernel->addArg(alpha);
updateDirBetaKernel->addArg(scale);
updateDirBetaKernel->addArg(xDiff);
updateDirBetaKernel->addArg(gradDiff);
updateDirBetaKernel->addArg(returnFlag);
updateDirBetaKernel->addArg(numVariables);
updateDirBetaKernel->addArg(); // vectorIndex
updateDirBetaKernel->addArg(); // vectorIndexAlpha
updateDirFinalKernel = program->createKernel("updateDirFinal");
updateDirFinalKernel->addArg(dir);
updateDirFinalKernel->addArg(alpha);
updateDirFinalKernel->addArg(xDiff);
updateDirFinalKernel->addArg(returnFlag);
updateDirFinalKernel->addArg(lineSearchData);
updateDirFinalKernel->addArg(numVariables);
updateDirFinalKernel->addArg(); // vectorIndex
updateDirFinalKernel->addArg(); // vectorIndexAlpha
lineSearchSetupKernel = program->createKernel("lineSearchSetup");
lineSearchSetupKernel->addArg(x);
lineSearchSetupKernel->addArg(xPrev);
lineSearchSetupKernel->addArg(grad);
lineSearchSetupKernel->addArg(gradPrev);
lineSearchSetupKernel->addArg(dir);
lineSearchSetupKernel->addArg(returnFlag);
lineSearchSetupKernel->addArg(gradNorm);
lineSearchSetupKernel->addArg(lineSearchData);
lineSearchSetupKernel->addArg(numVariables);
lineSearchSetupKernel->addArg(); // energyStart
lineSearchStepKernel = program->createKernel("lineSearchStep");
lineSearchStepKernel->addArg(x);
lineSearchStepKernel->addArg(xPrev);
lineSearchStepKernel->addArg(grad);
lineSearchStepKernel->addArg(gradPrev);
lineSearchStepKernel->addArg(dir);
lineSearchStepKernel->addArg(returnFlag);
lineSearchStepKernel->addArg(gradNorm);
lineSearchStepKernel->addArg(lineSearchData);
lineSearchStepKernel->addArg(lineSearchDataBackup);
lineSearchStepKernel->addArg(numVariables);
lineSearchDotKernel = program->createKernel("lineSearchDot");
lineSearchDotKernel->addArg(grad);
lineSearchDotKernel->addArg(dir);
lineSearchDotKernel->addArg(lineSearchData);
lineSearchDotKernel->addArg(returnFlag);
lineSearchDotKernel->addArg(returnValue);
lineSearchDotKernel->addArg(numVariables);
lineSearchDotKernel->addArg(); // energy
lineSearchContinueKernel = program->createKernel("lineSearchContinue");
lineSearchContinueKernel->addArg(returnFlag);
lineSearchContinueKernel->addArg(gradNorm);
lineSearchContinueKernel->addArg(lineSearchData);
downloadStartEvent = cc.createEvent();
downloadFinishEvent = cc.createEvent();
downloadQueue = cc.createQueue();
// Upload constraint data.
if (numConstraints) {
constraintIndices.upload(hostConstraintIndices);
constraintDistances.upload(hostConstraintDistances, true);
}
}
void CommonMinimizeKernel::lbfgs(ContextImpl& context) {
// Evaluate the energy and gradient at the starting point.
evaluateGpu(context);
energy += downloadReturnValueSync();
if (!(fabs(energy) < std::numeric_limits<float>::max())) {
energy = evaluateCpu(context);
}
if (!isfinite(energy)) {
throw OpenMMException("Energy or force at minimization starting point is infinite or NaN.");
}
// Check to see if the starting point is already a minimum. Since we are at
// the start of the run, just use the fallback single-thread-block kernel.
gradNormKernel->execute(threadBlockSize, threadBlockSize);
if (downloadGradNormSync() <= tolerance) {
return;
}
initializeDirKernel->execute(numVariables);
for (int iteration = 1, end = 0;;) {
// Prepare for a line search.
if (mixedIsDouble) {
lineSearchSetupKernel->setArg(9, energy);
}
else {
lineSearchSetupKernel->setArg(9, (float) energy);
}
lineSearchSetupKernel->execute(numVariables);
// Take line search steps.
for (int count = 0;; count++) {
lineSearchStepKernel->execute(numVariables);
if (count) {
int hostReturnFlag = downloadReturnFlagFinish();
if (hostReturnFlag == 1) {
break; // Line search success
}
else if (hostReturnFlag == 0 || count >= maxLineSearchIterations) {
xPrev.copyTo(x);
gradPrev.copyTo(grad);
return; // Line search failure
}
}
// Evaluate the energy and gradient at the new search point, then
// decide if and how to continue the line search.
evaluateGpu(context);
downloadReturnValueStart();
runLineSearchKernels();
energy += downloadReturnValueFinish();
if (!(fabs(energy) < std::numeric_limits<float>::max())) {
// Overflow on the GPU: try the CPU.
energy = evaluateCpu(context);
// We ran the line search kernels with an invalid gradient, so
// we need to reset the state to before they ran and retry.
lineSearchDataBackup.copyTo(lineSearchData);
int hostReturnFlag = 2; // Continue the line search
returnFlag.upload(&hostReturnFlag);
// lineSearchDot will try to read any restraint energy from
// returnValue, but it will be FLT_MAX from the failed GPU run,
// so reset it to 0 (since the restraint energy from the CPU run
// is included in the return value that will get uploaded).
cc.clearBuffer(returnValue);
runLineSearchKernels();
}
downloadReturnFlagStart();
}
// Check for convergence or exceeding the maximum number of steps.
if ((reporter != NULL && report(context, iteration)) || (maxIterations && maxIterations < iteration + 1)) {
return;
}
// Do L-BFGS update of search direction.
if (largeGrad) {
gradNormKernel->execute(threadBlockSize, threadBlockSize);
}
getDiffKernel->setArg(12, end);
getDiffKernel->setArg(13, (int) largeGrad);
getDiffKernel->execute(numVariables);
downloadReturnFlagStart();
getScaleKernel->setArg(7, end);
getScaleKernel->setArg(8, (int) largeGrad);
if (largeGrad) {
getScaleKernel->execute(threadBlockSize, threadBlockSize);
}
else {
getScaleKernel->execute(numVariables);
}
int limit = min(numVectors, iteration++);
int vectorIndex = end;
if (++end >= numVectors) {
end -= numVectors;
}
reinitializeDirKernel->setArg(8, vectorIndex);
reinitializeDirKernel->setArg(9, (int) largeGrad);
reinitializeDirKernel->execute(numVariables);
for (int vector = 0; vector < limit; vector++) {
if (vector && --vectorIndex < 0) {
vectorIndex += numVectors;
}
if (vector < limit - 1) {
updateDirAlphaKernel->setArg(7, vectorIndex);
updateDirAlphaKernel->execute(numVariables);
}
}
scaleDirKernel->setArg(7, vectorIndex);
scaleDirKernel->execute(numVariables);
for (int vector = 0; vector < limit - 1; vector++) {
// scaleDirKernel puts its first result in alpha[numVectors], so for the
// first vector, load the result from here instead of alpha[vectorIndex]
updateDirBetaKernel->setArg(7, vectorIndex);
updateDirBetaKernel->setArg(8, vector ? vectorIndex : numVectors);
updateDirBetaKernel->execute(numVariables);
if (++vectorIndex >= numVectors) {
vectorIndex -= numVectors;
}
}
// If this is the first iteration, limit is 1, we did not go through the loop above,
// and we need to read the output of scaleDirKernel directly from alpha[numVectors]
updateDirFinalKernel->setArg(6, vectorIndex);
updateDirFinalKernel->setArg(7, limit > 1 ? vectorIndex : numVectors);
updateDirFinalKernel->execute(numVariables);
if (downloadReturnFlagFinish()) {
return;
}
}
}
void CommonMinimizeKernel::evaluateGpu(ContextImpl& context) {
largeGrad = false;
// Put the current positions in posq and compute virtual site positions.
restorePosKernel->setArg(2, x);
restorePosKernel->execute(numParticles);
context.computeVirtualSites();
// Evaluate the forces and energy for the desired interactions as well as
// harmonic restraints to emulate the constraints.
energy = context.calcForcesAndEnergy(true, true, forceGroups);
if (numConstraints) {
if (mixedIsDouble) {
getConstraintEnergyForcesKernel->setArg(8, kRestraint);
}
else {
getConstraintEnergyForcesKernel->setArg(8, (float) kRestraint);
}
getConstraintEnergyForcesKernel->execute(numConstraints);
}
// Convert the forces from fixed to floating point format. If they are too
// large, the energy in returnValue will be set to FLT_MAX to signal that the
// results are invalid and we must fall back to computing forces on the CPU.
convertForcesKernel->execute(numParticles);
}
double CommonMinimizeKernel::evaluateCpu(ContextImpl& context) {
largeGrad = true;
// Create a CPU context if one has not already been created.
const System& system = context.getSystem();
if (cpuContext == NULL) {
Platform* cpuPlatform;
try {
cpuPlatform = &Platform::getPlatformByName("CPU");
}
catch (...) {
cpuPlatform = &Platform::getPlatformByName("Reference");
}
cpuContext = new Context(system, cpuIntegrator, *cpuPlatform);
}
// Download positions and evaluate forces on the CPU.
x.download(hostX, true);
for (int i = 0; i < numParticles; i++) {
hostPositions[i] = Vec3(hostX[3 * i], hostX[3 * i + 1], hostX[3 * i + 2]);
}
cpuContext->setState(context.getOwner().getState(State::Parameters));
cpuContext->setPositions(hostPositions);
cpuContext->computeVirtualSites();
State state = cpuContext->getState(State::Energy | State::Forces, false, forceGroups);
double hostEnergy = state.getPotentialEnergy();
const vector<Vec3>& hostForces = state.getForces();
// Prepare the gradient to send back to the optimizer.
for (int i = 0; i < numParticles; i++) {
if (system.getParticleMass(i) != 0) {
hostGrad[3 * i] = -hostForces[i][0];
hostGrad[3 * i + 1] = -hostForces[i][1];
hostGrad[3 * i + 2] = -hostForces[i][2];
}
}
// Apply harmonic forces for constraints.
for (int i = 0; i < numConstraints; i++) {
mm_int2 indices = hostConstraintIndices[i];
double distance = hostConstraintDistances[i];
Vec3 delta = hostPositions[indices.y] - hostPositions[indices.x];
double r2 = delta.dot(delta);
double r = sqrt(r2);
delta *= 1 / r;
double dr = r - distance;
double kdr = kRestraint * dr;
hostEnergy += 0.5 * kdr * dr;
if (system.getParticleMass(indices.x) != 0) {
hostGrad[3 * indices.y] -= kdr * delta[0];
hostGrad[3 * indices.y + 1] -= kdr * delta[1];
hostGrad[3 * indices.y + 2] -= kdr * delta[2];
}
if (system.getParticleMass(indices.y) != 0) {
hostGrad[3 * indices.x] += kdr * delta[0];
hostGrad[3 * indices.x + 1] += kdr * delta[1];
hostGrad[3 * indices.x + 2] += kdr * delta[2];
}
}
// Check for overflow of the forces. We need to check for this here and
// either back up the line search or abort, since the GPU platforms won't
// check for NaN positions, and the optimizer could get stuck otherwise.
if (mixedIsDouble) {
for (int i = 0; i < numVariables; i++) {
if (!isfinite(hostGrad[i])) {
return NAN;
}
}
}
else {
for (int i = 0; i < numVariables; i++) {
if (!isfinite((float) hostGrad[i])) {
return NAN;
}
}
}
// Upload forces and return the final energy.
grad.upload(hostGrad, true);
return hostEnergy;
}
bool CommonMinimizeKernel::report(ContextImpl& context, int iteration) {
x.download(hostX, true);
grad.download(hostGrad, true);
double restraintEnergy = 0.0, maxError = 0.0;
for (int i = 0; i < numConstraints; i++) {
mm_int2 indices = hostConstraintIndices[i];
double distance = hostConstraintDistances[i];
Vec3 delta = Vec3(hostX[3 * indices.y] - hostX[3 * indices.x], hostX[3 * indices.y + 1] - hostX[3 * indices.x + 1], hostX[3 * indices.y + 2] - hostX[3 * indices.x + 2]);
double r2 = delta.dot(delta);
double r = sqrt(r2);
double dr = r - distance;
restraintEnergy += 0.5 * kRestraint * dr * dr;
maxError = max(maxError, fabs(dr) / distance);
}
map<string, double> args;
args["restraint energy"] = restraintEnergy;
args["system energy"] = energy - restraintEnergy;
args["restraint strength"] = kRestraint;
args["max constraint error"] = maxError;
return reporter->report(iteration - 1, hostX, hostGrad, args);
}
void CommonMinimizeKernel::downloadReturnFlagStart() {
downloadStartEvent->enqueue();
cc.setCurrentQueue(downloadQueue);
downloadStartEvent->queueWait(downloadQueue);
returnFlag.download(pinnedMemory, false);
downloadFinishEvent->enqueue();
cc.restoreDefaultQueue();
}
void CommonMinimizeKernel::downloadReturnValueStart() {
downloadStartEvent->enqueue();
cc.setCurrentQueue(downloadQueue);
downloadStartEvent->queueWait(downloadQueue);
returnValue.download(pinnedMemory, false);
downloadFinishEvent->enqueue();
cc.restoreDefaultQueue();
}
int CommonMinimizeKernel::downloadReturnFlagFinish() {
downloadFinishEvent->wait();
return *(int*) pinnedMemory;
}
double CommonMinimizeKernel::downloadReturnValueFinish() {
downloadFinishEvent->wait();
if (mixedIsDouble) {
return *(double*) pinnedMemory;
}
else {
return (double) *(float*) pinnedMemory;
}
}
double CommonMinimizeKernel::downloadReturnValueSync() {
if (mixedIsDouble) {
double hostReturnValue;
returnValue.download(&hostReturnValue);
return hostReturnValue;
}
else {
float hostReturnValue;
returnValue.download(&hostReturnValue);
return (double) hostReturnValue;
}
}
double CommonMinimizeKernel::downloadGradNormSync() {
if (mixedIsDouble) {
double hostGradNorm;
gradNorm.download(&hostGradNorm);
return hostGradNorm;
}
else {
float hostGradNorm;
gradNorm.download(&hostGradNorm);
return (double) hostGradNorm;
}
}
void CommonMinimizeKernel::runLineSearchKernels() {
if (mixedIsDouble) {
lineSearchDotKernel->setArg(6, isfinite(energy) ? energy : (double) std::numeric_limits<float>::max());
}
else {
float hostEnergy = (float) energy;
lineSearchDotKernel->setArg(6, isfinite(hostEnergy) ? hostEnergy : std::numeric_limits<float>::max());
}
lineSearchDotKernel->execute(numVariables);
lineSearchContinueKernel->execute(1);
}
#define LS_DOT_START 0
#define LS_DOT 1
#define LS_ENERGY 2
#define LS_STEP 3
#define LS_FAIL 0
#define LS_SUCCEED 1
#define LS_CONTINUE 2
#define WARP_SIZE 32
#ifdef USE_MIXED_PRECISION
// When mixed is double precision, use double precision sqrt and fabs to
// avoid overflow.
#define SQRT_MIXED sqrt
#define FABS_MIXED fabs
#else
#define SQRT_MIXED SQRT
#define FABS_MIXED FABS
#endif
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
#define WARP_SHUFFLE_DOWN(local, offset) __shfl_down_sync(0xffffffff, local, offset)
#elif defined(USE_HIP)
#define WARP_SHUFFLE_DOWN(local, offset) __shfl_down(local, offset)
#endif
#ifdef WARP_SHUFFLE_DOWN
#define TEMP_SIZE WARP_SIZE
#else
#define TEMP_SIZE THREAD_BLOCK_SIZE
#endif
DEVICE void resetLineSearchData(GLOBAL mixed* RESTRICT lineSearchData) {
if (GLOBAL_ID == 0) {
lineSearchData[LS_DOT_START] = 0;
lineSearchData[LS_STEP] = 1;
}
}
DEVICE mixed reduceAdd(mixed value, LOCAL_ARG volatile mixed* temp) {
const int thread = LOCAL_ID;
SYNC_THREADS;
#ifdef WARP_SHUFFLE_DOWN
const int warpCount = LOCAL_SIZE / WARP_SIZE;
const int warp = thread / WARP_SIZE;
const int lane = thread % WARP_SIZE;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value += WARP_SHUFFLE_DOWN(value, step);
}
if (!lane) {
temp[warp] = value;
}
SYNC_THREADS;
if (!warp) {
value = lane < warpCount ? temp[lane] : 0;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value += WARP_SHUFFLE_DOWN(value, step);
}
if (!lane) {
temp[0] = value;
}
}
SYNC_THREADS;
#else
temp[thread] = value;
SYNC_THREADS;
for (int step = 1; step < WARP_SIZE / 2; step <<= 1) {
if (thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] += temp[thread + step];
}
SYNC_WARPS;
}
for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) {
if (thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] += temp[thread + step];
}
SYNC_THREADS;
}
#endif
return temp[0];
}
DEVICE mixed reduceMax(mixed value, LOCAL_ARG volatile mixed* temp) {
const int thread = LOCAL_ID;
SYNC_THREADS;
#ifdef WARP_SHUFFLE_DOWN
const int warpCount = LOCAL_SIZE / WARP_SIZE;
const int warp = thread / WARP_SIZE;
const int lane = thread % WARP_SIZE;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value = max(value, WARP_SHUFFLE_DOWN(value, step));
}
if (!lane) {
temp[warp] = value;
}
SYNC_THREADS;
if (!warp) {
value = lane < warpCount ? temp[lane] : 0;
for (int step = WARP_SIZE / 2; step > 0; step >>= 1) {
value = max(value, WARP_SHUFFLE_DOWN(value, step));
}
if (!lane) {
temp[0] = value;
}
}
SYNC_THREADS;
#else
temp[thread] = value;
SYNC_THREADS;
for (int step = 1; step < WARP_SIZE / 2; step <<= 1) {
if (thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] = max(temp[thread], temp[thread + step]);
}
SYNC_WARPS;
}
for (int step = WARP_SIZE / 2; step < LOCAL_SIZE; step <<= 1) {
if (thread + step < LOCAL_SIZE && thread % (2 * step) == 0) {
temp[thread] = max(temp[thread], temp[thread + step]);
}
SYNC_THREADS;
}
#endif
return temp[0];
}
DEVICE void atomicAddMixed(GLOBAL mixed* RESTRICT target, const mixed value) {
#if defined(__CUDA_ARCH__) || defined(USE_HIP)
// Do atomic addition of floating-point values directly
atomicAdd(target, value);
#elif defined(__OPENCL_VERSION__)
#if defined(USE_MIXED_PRECISION) || defined(USE_DOUBLE_PRECISION)
// Do 64-bit atomic compare/exchange (requires cl_khr_int64_base_atomics;
// this will be checked on the host before trying to compile this kernel)
unsigned long old = as_ulong(*target);
unsigned long check;
do {
check = old;
old = atom_cmpxchg((GLOBAL unsigned long*)target, check, as_ulong(value + as_double(check)));
} while(check != old);
#else
// Do 32-bit atomic compare/exchange
unsigned int old = as_uint(*target);
unsigned int check;
do {
check = old;
old = atomic_cmpxchg((GLOBAL unsigned int*)target, check, as_uint(value + as_float(check)));
} while(check != old);
#endif
#else
#error "Internal error: atomicAddMixed is missing an implementation for this platform"
#endif
}
KERNEL void recordInitialPos(
GLOBAL const real4* RESTRICT posq,
GLOBAL const int* RESTRICT order,
GLOBAL mixed* RESTRICT xInit,
GLOBAL mixed* RESTRICT x,
const int numParticles
#ifdef USE_MIXED_PRECISION
, GLOBAL const real4* RESTRICT posqCorrection
#endif
) {
for (int i = GLOBAL_ID; i < numParticles; i += GLOBAL_SIZE) {
int offset = 3 * order[i];
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[i];
real4 pos2 = posqCorrection[i];
mixed3 pos = make_mixed3(pos1.x + (mixed) pos2.x, pos1.y + (mixed) pos2.y, pos1.z + (mixed) pos2.z);
#else
real4 pos = posq[i];
#endif
xInit[offset] = x[offset] = pos.x;
xInit[offset + 1] = x[offset + 1] = pos.y;
xInit[offset + 2] = x[offset + 2] = pos.z;
}
}
KERNEL void restorePos(
GLOBAL real4* RESTRICT posq,
GLOBAL const int* RESTRICT order,
GLOBAL const mixed* RESTRICT x,
GLOBAL mixed* RESTRICT returnValue,
const int numParticles
#ifdef USE_MIXED_PRECISION
, GLOBAL real4* RESTRICT posqCorrection
#endif
) {
for (int i = GLOBAL_ID; i < numParticles; i += GLOBAL_SIZE) {
int offset = 3 * order[i];
posq[i].x = x[offset];
posq[i].y = x[offset + 1];
posq[i].z = x[offset + 2];
#ifdef USE_MIXED_PRECISION
posqCorrection[i].x = x[offset] - (real) x[offset];
posqCorrection[i].y = x[offset + 1] - (real) x[offset + 1];
posqCorrection[i].z = x[offset + 2] - (real) x[offset + 2];
#endif
}
// Reset in case we will accumulate constraint energies in returnValue.
if (GLOBAL_ID == 0) {
*returnValue = 0;
}
}
KERNEL void convertForces(
GLOBAL const mixed4* RESTRICT velm,
GLOBAL const mm_long* RESTRICT forceBuffer,
GLOBAL const int* RESTRICT order,
GLOBAL mixed* RESTRICT grad,
GLOBAL mixed* RESTRICT returnValue,
const int numParticles,
const int numPadded
) {
const mm_long limit = 0x4000000000000000;
const mixed scale = -1 / (mixed) 0x100000000;
for (int i = GLOBAL_ID; i < numParticles; i += GLOBAL_SIZE) {
int offset = 3 * order[i];
if (velm[i].w == 0) {
grad[offset] = 0;
grad[offset + 1] = 0;
grad[offset + 2] = 0;
}
else {
mm_long fx = forceBuffer[i];
mm_long fy = forceBuffer[i + numPadded];
mm_long fz = forceBuffer[i + 2 * numPadded];
if (fx < -limit || fx > limit || fy < -limit || fy > limit || fz < -limit || fz > limit) {
*returnValue = FLT_MAX;
}
grad[offset] = scale * fx;
grad[offset + 1] = scale * fy;
grad[offset + 2] = scale * fz;
}
}
}
KERNEL void getConstraintEnergyForces(
GLOBAL mm_ulong* RESTRICT forceBuffer,
GLOBAL const int* RESTRICT order,
GLOBAL const int2* RESTRICT constraintIndices,
GLOBAL const mixed* RESTRICT constraintDistances,
GLOBAL const mixed* RESTRICT x,
GLOBAL mixed* RESTRICT returnValue,
const int numPadded,
const int numConstraints,
const mixed kRestraint
) {
LOCAL volatile mixed temp[TEMP_SIZE];
const mixed scale = 0x100000000;
mixed energy = 0;
for (int i = GLOBAL_ID; i < numConstraints; i += GLOBAL_SIZE) {
int2 indices = constraintIndices[i];
mixed distance = constraintDistances[i];
int offset1 = 3 * order[indices.x];
int offset2 = 3 * order[indices.y];
mixed3 delta = make_mixed3(x[offset2] - x[offset1], x[offset2 + 1] - x[offset1 + 1], x[offset2 + 2] - x[offset1 + 2]);
mixed r2 = dot(delta, delta);
mixed r = SQRT_MIXED(r2);
delta *= 1 / r;
mixed dr = r - distance;
mixed kdr = kRestraint * dr;
energy += (mixed) 0.5 * kdr * dr;
mm_long fx = (mm_long) (kdr * scale * delta.x);
mm_long fy = (mm_long) (kdr * scale * delta.y);
mm_long fz = (mm_long) (kdr * scale * delta.z);
ATOMIC_ADD(&forceBuffer[indices.x], (mm_ulong) fx);
ATOMIC_ADD(&forceBuffer[indices.x + numPadded], (mm_ulong) fy);
ATOMIC_ADD(&forceBuffer[indices.x + 2 * numPadded], (mm_ulong) fz);
ATOMIC_ADD(&forceBuffer[indices.y], (mm_ulong) -fx);
ATOMIC_ADD(&forceBuffer[indices.y + numPadded], (mm_ulong) -fy);
ATOMIC_ADD(&forceBuffer[indices.y + 2 * numPadded], (mm_ulong) -fz);
}
energy = reduceAdd(energy, temp);
if (LOCAL_ID == 0) {
atomicAddMixed(returnValue, energy);
}
}
KERNEL void getConstraintError(
GLOBAL const int2* RESTRICT constraintIndices,
GLOBAL const mixed* RESTRICT constraintDistances,
GLOBAL const mixed* RESTRICT x,
GLOBAL mixed* RESTRICT returnValue,
const int numConstraints
) {
// This kernel is expected to be executed in a single thread block.
LOCAL volatile mixed temp[TEMP_SIZE];
mixed maxError = 0;
for (int i = LOCAL_ID; i < numConstraints; i += LOCAL_SIZE) {
int2 indices = constraintIndices[i];
mixed distance = constraintDistances[i];
mixed3 delta = make_mixed3(x[3 * indices.y] - x[3 * indices.x], x[3 * indices.y + 1] - x[3 * indices.x + 1], x[3 * indices.y + 2] - x[3 * indices.x + 2]);
mixed r = SQRT_MIXED(dot(delta, delta));
maxError = max(maxError, FABS_MIXED(r - distance) / distance);
}
maxError = reduceMax(maxError, temp);
if (LOCAL_ID == 0) {
*returnValue = maxError;
}
}
KERNEL void initializeDir(
GLOBAL const mixed* RESTRICT grad,
GLOBAL mixed* RESTRICT dir,
GLOBAL const mixed* RESTRICT gradNorm,
GLOBAL mixed* RESTRICT lineSearchData,
const int numVariables
) {
const real scale = -1 / *gradNorm;
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
dir[i] = scale * grad[i];
}
// Prepare for the first line search of the optimization.
resetLineSearchData(lineSearchData);
}
KERNEL void gradNorm(
GLOBAL const mixed* RESTRICT grad,
GLOBAL mixed* RESTRICT gradNorm,
const int numVariables
) {
// This kernel is expected to be executed in a single thread block.
// This is a slow path for when the gradient could overflow.
LOCAL volatile mixed temp[TEMP_SIZE];
mixed gradScale = 1;
for (int i = LOCAL_ID; i < numVariables; i += LOCAL_SIZE) {
gradScale = max(gradScale, FABS_MIXED(grad[i]));
}
gradScale = reduceMax(gradScale, temp);
mixed gradScaleInv = 1 / gradScale;
mixed scaledNorm = 0;
for (int i = LOCAL_ID; i < numVariables; i += LOCAL_SIZE) {
mixed gradScaled = gradScaleInv * grad[i];
scaledNorm += gradScaled * gradScaled;
}
scaledNorm = reduceAdd(scaledNorm, temp);
if (LOCAL_ID == 0) {
*gradNorm = gradScale * SQRT_MIXED(scaledNorm);
}
}
KERNEL void getDiff(
GLOBAL const mixed* RESTRICT x,
GLOBAL const mixed* RESTRICT xPrev,
GLOBAL const mixed* RESTRICT grad,
GLOBAL const mixed* RESTRICT gradPrev,
GLOBAL mixed* RESTRICT scale,
GLOBAL mixed* RESTRICT xDiff,
GLOBAL mixed* RESTRICT gradDiff,
GLOBAL int* RESTRICT returnFlag,
GLOBAL mixed* RESTRICT returnValue,
GLOBAL const mixed* RESTRICT gradNorm,
const int numVariables,
const mixed tolerance,
const int end,
const int largeGrad
) {
LOCAL volatile mixed temp[TEMP_SIZE];
// If the convergence condition is satisfied, update returnFlag. After this
// kernel has run, it is safe to start downloading returnFlag.
bool converged;
if (largeGrad) {
// The slow path for overflow was run: gradNorm is the norm.
converged = (*gradNorm <= tolerance);
}
else {
// The fast path using atomics was run: gradNorm is the square norm.
converged = (*gradNorm <= tolerance * tolerance);
}
if (GLOBAL_ID == 0) {
*returnFlag = (int) converged;
}
if (converged) {
return;
}
const int endOffset = numVariables * end;
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
xDiff[endOffset + i] = x[i] - xPrev[i];
gradDiff[endOffset + i] = grad[i] - gradPrev[i];
}
if (GLOBAL_ID == 0 && !largeGrad) {
// The next kernel will use atomics to do its reduction, so clear the
// destination variables first.
scale[end] = 0;
*returnValue = 0;
}
}
KERNEL void getScale(
GLOBAL mixed* RESTRICT alpha,
GLOBAL mixed* RESTRICT scale,
GLOBAL const mixed* RESTRICT xDiff,
GLOBAL const mixed* RESTRICT gradDiff,
GLOBAL const int* RESTRICT returnFlag,
GLOBAL mixed* RESTRICT returnValue,
const int numVariables,
const int end,
const int largeGrad
) {
// This kernel is expected to be executed in a single thread block if
// largeGrad is non-zero (if we are using the fallback for large values).
LOCAL volatile mixed temp[TEMP_SIZE];
if (*returnFlag) {
return;
}
const int endOffset = numVariables * end;
if (largeGrad) {
mixed gradScale = 1;
for (int i = LOCAL_ID; i < numVariables; i += LOCAL_SIZE) {
gradScale = max(gradScale, FABS_MIXED(gradDiff[endOffset + i]));
}
gradScale = reduceMax(gradScale, temp);
mixed gradScaleInv = 1 / gradScale;
mixed xGradScaled = 0;
mixed gradGradScaled = 0;
for (int i = LOCAL_ID; i < numVariables; i += LOCAL_SIZE) {
mixed gradDiffScaled = gradScaleInv * gradDiff[endOffset + i];
xGradScaled += xDiff[endOffset + i] * gradDiffScaled;
gradGradScaled += gradDiffScaled * gradDiffScaled;
}
xGradScaled = reduceAdd(xGradScaled, temp);
gradGradScaled = reduceAdd(gradGradScaled, temp);
if (LOCAL_ID == 0) {
scale[end] = xGradScaled * gradScale;
*returnValue = xGradScaled * gradScaleInv / gradGradScaled;
}
}
else {
mixed xGrad = 0;
mixed gradGrad = 0;
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
xGrad += xDiff[endOffset + i] * gradDiff[endOffset + i];
gradGrad += gradDiff[endOffset + i] * gradDiff[endOffset + i];
}
xGrad = reduceAdd(xGrad, temp);
gradGrad = reduceAdd(gradGrad, temp);
if (LOCAL_ID == 0) {
atomicAddMixed(&scale[end], xGrad);
atomicAddMixed(returnValue, gradGrad);
}
}
// Clear all values of alpha (plus the extra slot at alpha[NUM_VECTORS])
// to prepare them for use in reductions by subsequent kernels.
for (int i = LOCAL_ID; i <= NUM_VECTORS; i += LOCAL_SIZE) {
alpha[i] = 0;
}
}
KERNEL void reinitializeDir(
GLOBAL const mixed* RESTRICT grad,
GLOBAL mixed* RESTRICT dir,
GLOBAL mixed* RESTRICT alpha,
GLOBAL const mixed* RESTRICT scale,
GLOBAL const mixed* RESTRICT xDiff,
GLOBAL const int* RESTRICT returnFlag,
GLOBAL mixed* RESTRICT returnValue,
const int numVariables,
const int vectorIndex,
const int largeGrad
) {
LOCAL volatile mixed temp[TEMP_SIZE];
if (*returnFlag) {
return;
}
if (GLOBAL_ID == 0 && !largeGrad) {
// The last kernel used atomics to do its reduction. returnValue has
// gradGrad, so finish up by replacing it with xGrad / gradGrad.
*returnValue = scale[vectorIndex] / *returnValue;
}
const int indexOffset = numVariables * vectorIndex;
mixed vectorAlpha = 0;
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
dir[i] = -grad[i];
vectorAlpha -= grad[i] * xDiff[indexOffset + i];
}
vectorAlpha = reduceAdd(vectorAlpha, temp);
if (LOCAL_ID == 0) {
atomicAddMixed(&alpha[vectorIndex], vectorAlpha / scale[vectorIndex]);
}
}
KERNEL void updateDirAlpha(
GLOBAL mixed* dir,
GLOBAL mixed* RESTRICT alpha,
GLOBAL const mixed* RESTRICT scale,
GLOBAL const mixed* RESTRICT xDiff,
GLOBAL const mixed* RESTRICT gradDiff,
GLOBAL const int* RESTRICT returnFlag,
const int numVariables,
const int vectorIndex1
) {
LOCAL volatile mixed temp[TEMP_SIZE];
if (*returnFlag) {
return;
}
const int vectorIndex2 = (vectorIndex1 ? vectorIndex1 : NUM_VECTORS) - 1;
const int indexOffset1 = numVariables * vectorIndex1;
const int indexOffset2 = numVariables * vectorIndex2;
const mixed vectorScale = alpha[vectorIndex1];
mixed vectorAlpha = 0;
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
dir[i] -= vectorScale * gradDiff[indexOffset1 + i];
vectorAlpha += dir[i] * xDiff[indexOffset2 + i];
}
vectorAlpha = reduceAdd(vectorAlpha, temp);
if (LOCAL_ID == 0) {
atomicAddMixed(&alpha[vectorIndex2], vectorAlpha / scale[vectorIndex2]);
}
}
KERNEL void scaleDir(
GLOBAL mixed* dir,
GLOBAL mixed* RESTRICT alpha,
GLOBAL const mixed* RESTRICT scale,
GLOBAL const mixed* RESTRICT gradDiff,
GLOBAL const int* RESTRICT returnFlag,
GLOBAL const mixed* RESTRICT returnValue,
const int numVariables,
const int vectorIndex
) {
LOCAL volatile mixed temp[TEMP_SIZE];
if (*returnFlag) {
return;
}
const int indexOffset = numVariables * vectorIndex;
const mixed innerScale = alpha[vectorIndex];
const mixed outerScale = *returnValue;
mixed vectorBeta = 0;
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
dir[i] = (dir[i] - innerScale * gradDiff[indexOffset + i]) * outerScale;
vectorBeta += dir[i] * gradDiff[indexOffset + i];
}
vectorBeta = reduceAdd(vectorBeta, temp);
if (LOCAL_ID == 0) {
// Store the result in an extra slot alpha[NUM_VECTORS] instead of using
// alpha[vectorIndex] since other blocks might still need to read it.
atomicAddMixed(&alpha[NUM_VECTORS], (GLOBAL_ID == 0 ? innerScale : 0) - vectorBeta / scale[vectorIndex]);
}
}
KERNEL void updateDirBeta(
GLOBAL mixed* dir,
GLOBAL mixed* RESTRICT alpha,
GLOBAL const mixed* RESTRICT scale,
GLOBAL const mixed* RESTRICT xDiff,
GLOBAL const mixed* RESTRICT gradDiff,
GLOBAL const int* RESTRICT returnFlag,
const int numVariables,
const int vectorIndex1,
const int vectorIndexAlpha
) {
LOCAL volatile mixed temp[TEMP_SIZE];
if (*returnFlag) {
return;
}
const int vectorIndex2 = vectorIndex1 == NUM_VECTORS - 1 ? 0 : vectorIndex1 + 1;
const int indexOffset1 = numVariables * vectorIndex1;
const int indexOffset2 = numVariables * vectorIndex2;
const mixed vectorScale = alpha[vectorIndexAlpha];
mixed vectorBeta = 0;
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
dir[i] += vectorScale * xDiff[indexOffset1 + i];
vectorBeta += dir[i] * gradDiff[indexOffset2 + i];
}
vectorBeta = reduceAdd(vectorBeta, temp);
if (LOCAL_ID == 0) {
atomicAddMixed(&alpha[vectorIndex2], -vectorBeta / scale[vectorIndex2]);
}
}
KERNEL void updateDirFinal(
GLOBAL mixed* dir,
GLOBAL const mixed* RESTRICT alpha,
GLOBAL const mixed* RESTRICT xDiff,
GLOBAL const int* RESTRICT returnFlag,
GLOBAL mixed* RESTRICT lineSearchData,
const int numVariables,
const int vectorIndex,
const int vectorIndexAlpha
) {
LOCAL volatile mixed temp[TEMP_SIZE];
if (*returnFlag) {
return;
}
const int indexOffset = numVariables * vectorIndex;
const mixed vectorScale = alpha[vectorIndexAlpha];
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
dir[i] += vectorScale * xDiff[indexOffset + i];
}
// Prepare for a line search in the next iteration.
resetLineSearchData(lineSearchData);
}
KERNEL void lineSearchSetup(
GLOBAL const mixed* RESTRICT x,
GLOBAL mixed* RESTRICT xPrev,
GLOBAL const mixed* RESTRICT grad,
GLOBAL mixed* RESTRICT gradPrev,
GLOBAL const mixed* RESTRICT dir,
GLOBAL int* RESTRICT returnFlag,
GLOBAL mixed* RESTRICT gradNorm,
GLOBAL mixed* RESTRICT lineSearchData,
const int numVariables,
const mixed energyStart
) {
LOCAL volatile mixed temp[TEMP_SIZE];
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
xPrev[i] = x[i];
}
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
gradPrev[i] = grad[i];
}
mixed result = 0;
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
result += grad[i] * dir[i];
}
result = reduceAdd(result, temp);
if (LOCAL_ID == 0) {
atomicAddMixed(&lineSearchData[LS_DOT_START], result);
}
if (GLOBAL_ID == 0) {
*returnFlag = LS_CONTINUE;
*gradNorm = 0;
lineSearchData[LS_ENERGY] = energyStart;
}
}
KERNEL void lineSearchStep(
GLOBAL mixed* RESTRICT x,
GLOBAL const mixed* RESTRICT xPrev,
GLOBAL mixed* RESTRICT grad,
GLOBAL const mixed* RESTRICT gradPrev,
GLOBAL const mixed* RESTRICT dir,
GLOBAL int* RESTRICT returnFlag,
GLOBAL mixed* RESTRICT gradNorm,
GLOBAL mixed* RESTRICT lineSearchData,
GLOBAL mixed* RESTRICT lineSearchDataBackup,
const int numVariables
) {
LOCAL volatile mixed temp[TEMP_SIZE];
if (*returnFlag == LS_FAIL) {
return;
}
if (*returnFlag == LS_SUCCEED) {
// The strong Wolfe condition was satisfied on the last iteration.
// Instead of taking another step, start evaluating the gradient. If
// we evaluated on the CPU, it might overflow, but we will throw out
// this result and recompute it in that case.
mixed norm = 0;
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
norm += grad[i] * grad[i];
}
norm = reduceAdd(norm, temp);
if (LOCAL_ID == 0) {
atomicAddMixed(gradNorm, norm);
}
return;
}
// We are either getting started or going around for another normal line
// search iteration, so check to see whether the initial search direction is
// bad, or the updated step is bad.
if (lineSearchData[LS_DOT_START] > 0 || lineSearchData[LS_STEP] < LBFGS_MIN_STEP || lineSearchData[LS_STEP] > LBFGS_MAX_STEP) {
if (GLOBAL_ID == 0) {
*returnFlag = LS_FAIL;
}
return;
}
const mixed step = lineSearchData[LS_STEP];
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
x[i] = xPrev[i] + step * dir[i];
}
if (GLOBAL_ID == 0) {
// Back up the line search data in case we need to restore it after
// reevaluating energy and forces on the CPU, and reset LS_DOT.
lineSearchDataBackup[LS_DOT_START] = lineSearchData[LS_DOT_START];
lineSearchDataBackup[LS_DOT] = lineSearchData[LS_DOT] = 0;
lineSearchDataBackup[LS_ENERGY] = lineSearchData[LS_ENERGY];
lineSearchDataBackup[LS_STEP] = lineSearchData[LS_STEP];
}
}
KERNEL void lineSearchDot(
GLOBAL const mixed* RESTRICT grad,
GLOBAL const mixed* RESTRICT dir,
GLOBAL mixed* RESTRICT lineSearchData,
GLOBAL int* RESTRICT returnFlag,
GLOBAL const mixed* RESTRICT returnValue,
const int numVariables,
mixed energy
) {
LOCAL volatile mixed temp[TEMP_SIZE];
if (*returnFlag == LS_FAIL) {
return;
}
// Any restraint energy in returnValue hasn't been downloaded yet to be
// passed back up in the energy parameter, so add it in here.
energy += *returnValue;
// The energy may be such that we don't need to do a dot product and can
// immediately decide to scale the step, so mark this case with LS_SUCCEED.
// This will be checked in the following kernel.
if (!(FABS_MIXED(energy) < FLT_MAX) || energy > lineSearchData[LS_ENERGY] + lineSearchData[LS_STEP] * LBFGS_FTOL * lineSearchData[LS_DOT_START]) {
if (GLOBAL_ID == 0) {
*returnFlag = LS_SUCCEED;
}
return;
}
mixed result = 0;
for (int i = GLOBAL_ID; i < numVariables; i += GLOBAL_SIZE) {
result += grad[i] * dir[i];
}
result = reduceAdd(result, temp);
if (LOCAL_ID == 0) {
atomicAddMixed(&lineSearchData[LS_DOT], result);
}
}
KERNEL void lineSearchContinue(
GLOBAL int* RESTRICT returnFlag,
GLOBAL mixed* RESTRICT gradNorm,
GLOBAL mixed* RESTRICT lineSearchData
) {
// This kernel should run a single thread and just lets us update the step
// and decide how to continue the line search without returning to the CPU.
if (GLOBAL_ID != 0 || *returnFlag == LS_FAIL) {
return;
}
*gradNorm = 0;
// If the last kernel set LS_SUCCEED, the energy was out of range and we
// should scale the step back and go around again.
if (*returnFlag == LS_SUCCEED) {
lineSearchData[LS_STEP] *= LBFGS_SCALE_DOWN;
*returnFlag = LS_CONTINUE;
return;
}
// If we are here, we should be reading LS_CONTINUE that was set by
// lineSearchSetup, so either scale the step or set LS_SUCCEED if the line
// search is successful and we should exit.
if (lineSearchData[LS_DOT] < LBFGS_WOLFE * lineSearchData[LS_DOT_START]) {
lineSearchData[LS_STEP] *= LBFGS_SCALE_UP;
}
else if(lineSearchData[LS_DOT] > -LBFGS_WOLFE * lineSearchData[LS_DOT_START]) {
lineSearchData[LS_STEP] *= LBFGS_SCALE_DOWN;
}
else {
*returnFlag = LS_SUCCEED;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2026 Stanford University and the Authors. *
* Authors: Evan Pretti *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CpuTests.h"
#include "TestLocalEnergyMinimizer.h"
void runPlatformTests() {
}
......@@ -291,6 +291,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
compilationDefines["FMA"] = useDoublePrecision ? "fma" : "fmaf";
compilationDefines["FABS"] = useDoublePrecision ? "fabs" : "fabsf";
// Set defines for applying periodic boundary conditions.
......
......@@ -4,7 +4,7 @@
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2008-2025 Stanford University and the Authors. *
* Portions copyright (c) 2008-2026 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -35,6 +35,7 @@
#include "openmm/common/CommonIntegrateCustomStepKernel.h"
#include "openmm/common/CommonIntegrateNoseHooverStepKernel.h"
#include "openmm/common/CommonIntegrateQTBStepKernel.h"
#include "openmm/common/CommonMinimizeKernel.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
......@@ -83,6 +84,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
return new CommonApplyConstraintsKernel(name, platform, cu);
if (name == VirtualSitesKernel::Name())
return new CommonVirtualSitesKernel(name, platform, cu);
if (name == MinimizeKernel::Name())
return new CommonMinimizeKernel(name, platform, cu);
if (name == CalcHarmonicBondForceKernel::Name())
return new CommonCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomBondForceKernel::Name())
......
......@@ -73,6 +73,7 @@ CudaPlatform::CudaPlatform() {
registerKernelFactory(UpdateStateDataKernel::Name(), factory);
registerKernelFactory(ApplyConstraintsKernel::Name(), factory);
registerKernelFactory(VirtualSitesKernel::Name(), factory);
registerKernelFactory(MinimizeKernel::Name(), factory);
registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
......
......@@ -18,6 +18,7 @@
#define SYNC_THREADS __syncthreads();
#define MEM_FENCE __threadfence_block();
#define ATOMIC_ADD(dest, value) atomicAdd(dest, value)
#define FLT_MAX 3.40282347e+38f
typedef long long mm_long;
typedef unsigned long long mm_ulong;
......
......@@ -4,7 +4,7 @@
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2009-2025 Stanford University and the Authors. *
* Portions copyright (c) 2009-2026 Stanford University and the Authors. *
* Portions copyright (c) 2020-2023 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
......@@ -294,6 +294,7 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
compilationDefines["FMA"] = useDoublePrecision ? "fma" : "fmaf";
compilationDefines["FABS"] = useDoublePrecision ? "fabs" : "fabsf";
// Set defines for applying periodic boundary conditions.
......
......@@ -4,7 +4,7 @@
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2008-2025 Stanford University and the Authors. *
* Portions copyright (c) 2008-2026 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
......@@ -36,6 +36,7 @@
#include "openmm/common/CommonIntegrateCustomStepKernel.h"
#include "openmm/common/CommonIntegrateNoseHooverStepKernel.h"
#include "openmm/common/CommonIntegrateQTBStepKernel.h"
#include "openmm/common/CommonMinimizeKernel.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
......@@ -84,6 +85,8 @@ KernelImpl* HipKernelFactory::createKernelImpl(std::string name, const Platform&
return new CommonApplyConstraintsKernel(name, platform, cu);
if (name == VirtualSitesKernel::Name())
return new CommonVirtualSitesKernel(name, platform, cu);
if (name == MinimizeKernel::Name())
return new CommonMinimizeKernel(name, platform, cu);
if (name == CalcHarmonicBondForceKernel::Name())
return new CommonCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomBondForceKernel::Name())
......
......@@ -74,6 +74,7 @@ HipPlatform::HipPlatform() {
registerKernelFactory(UpdateStateDataKernel::Name(), factory);
registerKernelFactory(ApplyConstraintsKernel::Name(), factory);
registerKernelFactory(VirtualSitesKernel::Name(), factory);
registerKernelFactory(MinimizeKernel::Name(), factory);
registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
......
......@@ -19,6 +19,7 @@
#define SYNC_WARPS {__builtin_amdgcn_wave_barrier(); __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "wavefront");}
#define MEM_FENCE __threadfence_block();
#define ATOMIC_ADD(dest, value) atomicAdd(dest, value)
#define FLT_MAX 3.40282347e+38f
typedef long long mm_long;
typedef unsigned long long mm_ulong;
......
......@@ -4,7 +4,7 @@
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2009-2025 Stanford University and the Authors. *
* Portions copyright (c) 2009-2026 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -433,6 +433,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines["ERF"] = "erf";
compilationDefines["ERFC"] = "erfc";
compilationDefines["FMA"] = "fma";
compilationDefines["FABS"] = "fabs";
// Set defines for applying periodic boundary conditions.
......
......@@ -4,7 +4,7 @@
* This is part of the OpenMM molecular simulation toolkit. *
* See https://openmm.org/development. *
* *
* Portions copyright (c) 2008-2025 Stanford University and the Authors. *
* Portions copyright (c) 2008-2026 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -33,6 +33,7 @@
#include "openmm/common/CommonIntegrateCustomStepKernel.h"
#include "openmm/common/CommonIntegrateNoseHooverStepKernel.h"
#include "openmm/common/CommonIntegrateQTBStepKernel.h"
#include "openmm/common/CommonMinimizeKernel.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
......@@ -81,6 +82,8 @@ KernelImpl* OpenCLKernelFactory::createKernelImpl(std::string name, const Platfo
return new CommonApplyConstraintsKernel(name, platform, cl);
if (name == VirtualSitesKernel::Name())
return new CommonVirtualSitesKernel(name, platform, cl);
if (name == MinimizeKernel::Name())
return new CommonMinimizeKernel(name, platform, cl);
if (name == CalcHarmonicBondForceKernel::Name())
return new CommonCalcHarmonicBondForceKernel(name, platform, cl, context.getSystem());
if (name == CalcCustomBondForceKernel::Name())
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment