Unverified Commit 6307eb0d authored by Andy Simmonett's avatar Andy Simmonett
Browse files

Nose-Hoover working on CUDA

parent 799d4b1f
......@@ -64,11 +64,10 @@ int VelocityVerletIntegrator::addNoseHooverChainThermostat(System& system, doubl
if (context) {
throw OpenMMException("addNoseHooverChainThermostat cannot be called after binding this integrator to a context.");
}
std::vector<int> mask, parents;
int nDOF = 0;
int numForces = system.getNumForces();
vector<int> thermostatedParticles;
vector<int> parentParticles;
std::vector<int> thermostatedParticles;
std::vector<int> parentParticles;
for(int particle = 0; particle < system.getNumParticles(); ++particle) {
if(system.getParticleMass(particle) > 0) {
nDOF += 3;
......
......@@ -1762,9 +1762,14 @@ public:
private:
CudaContext& cu;
CudaArray scaleFactor, chainMasses, chainForces, heatBathEnergy;
CudaArray scaleFactorBuffer, kineticEnergyBuffer, chainMasses, chainForces, heatBathEnergy;
std::vector<CudaArray> masks;
std::map<int, CUfunction> propagateKernels;
CUfunction reduceEnergyKernel;
CUfunction computeHeatBathEnergyKernel;
CUfunction computeMaskedKineticEnergyKernel;
CUfunction scaleVelocitiesKernel;
CUfunction zeroEnergyBuffersKernel;
};
/**
......
......@@ -57,6 +57,7 @@
#include <cmath>
#include <iterator>
#include <set>
#include <assert.h>
using namespace OpenMM;
using namespace std;
......@@ -8365,6 +8366,10 @@ void CudaNoseHooverChainKernel::initialize() {
propagateKernels[5] = cu.getKernel(module, "propagateNoseHooverChain");
computeHeatBathEnergyKernel = cu.getKernel(module, "computeHeatBathEnergy");
computeMaskedKineticEnergyKernel = cu.getKernel(module, "computeMaskedKineticEnergy");
scaleVelocitiesKernel = cu.getKernel(module, "scaleVelocities");
CUmodule utilities = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
reduceEnergyKernel = cu.getKernel(utilities, "reduceEnergy");
}
double CudaNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, double kineticEnergy, double timeStep) {
......@@ -8386,24 +8391,24 @@ double CudaNoseHooverChainKernel::propagateChain(ContextImpl& context, const Nos
// We need to upload the CUDA array
if(useDouble){
chainState[chainID].initialize<double2>(cu, chainLength, "chainState" + std::to_string(chainID));
std::vector<double2> zeros(chainLength, {0.0, 0.0});
chainState[chainID].upload(zeros);
std::vector<double2> zeros(chainLength, make_double2(0, 0));
chainState[chainID].upload(zeros.data());
} else {
chainState[chainID].initialize<float2>(cu, chainLength, "chainState" + std::to_string(chainID));
std::vector<float2> zeros(chainLength, {0.0, 0.0});
chainState[chainID].upload(zeros);
std::vector<float2> zeros(chainLength, make_float2(0, 0));
chainState[chainID].upload(zeros.data());
}
}
if (!scaleFactor.isInitialized() ||scaleFactor.getSize() == 0) {
if (!scaleFactorBuffer.isInitialized() ||scaleFactorBuffer.getSize() == 0) {
if(useDouble){
std::vector<double> one(1);
scaleFactor.initialize<double>(cu, 1, "scaleFactor");
scaleFactor.upload(one);
scaleFactorBuffer.initialize<double>(cu, 1, "scaleFactorBuffer");
scaleFactorBuffer.upload(one);
} else {
std::vector<float> one(1);
scaleFactor.initialize<float>(cu, 1, "scaleFactor");
scaleFactor.upload(one);
scaleFactorBuffer.initialize<float>(cu, 1, "scaleFactorBuffer");
scaleFactorBuffer.upload(one);
}
}
......@@ -8422,23 +8427,24 @@ double CudaNoseHooverChainKernel::propagateChain(ContextImpl& context, const Nos
chainForces.upload(zeros);
}
}
double kT = BOLTZ * temperature;
float kTfloat = (float) kT;
float timeStepFloat = (float) timeStep;
float frequencyFloat = (float) frequency;
float kineticEnergyFloat = (float) kineticEnergy;
void *args[] = {&scaleFactor.getDevicePointer(),
// N.B. We ignore the incoming kineticEnergy and grab it from the device buffer instead
void *args[] = {
&chainState[chainID].getDevicePointer(),
&kineticEnergyBuffer.getDevicePointer(),
&scaleFactorBuffer.getDevicePointer(),
&chainMasses.getDevicePointer(),
&chainForces.getDevicePointer(),
&chainLength, &numMTS, &numDOFs,
&timeStepFloat,
useDouble ? (void*) &kT : (void*) &kTfloat,
&frequencyFloat,
useDouble ? (void*) &kineticEnergy : (void*) &kineticEnergyFloat, &chainState[chainID].getDevicePointer()};
&frequencyFloat
};
if (numYS == 1 || numYS == 3 || numYS == 5) {
cu.executeKernel(propagateKernels[numYS], args, 1);
cu.executeKernel(propagateKernels[numYS], args, 1, 1);
} else {
throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, or 5.");
}
......@@ -8481,25 +8487,87 @@ double CudaNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, co
useDouble ? (void*) &kT : (void*) & kTfloat,
&frequencyFloat, &chainState[chainID].getDevicePointer()};
cu.executeKernel(computeHeatBathEnergyKernel, args, 1);
cu.executeKernel(computeHeatBathEnergyKernel, args, 1, 1);
void * pinnedBuffer = cu.getPinnedBuffer();
heatBathEnergy.download(pinnedBuffer);
if (useDouble){
std::vector<double> energy(1);
heatBathEnergy.download(energy);
return energy[0];
return *((double*) pinnedBuffer);
} else {
std::vector<float> energy(1);
heatBathEnergy.download(energy);
return (double) energy[0];
return *((float*) pinnedBuffer);
}
}
double CudaNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &noseHooverChain) {
double CudaNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
cu.setAsCurrent();
return 1;
bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
int chainID = nhc.getDefaultChainID();
const auto & parents = nhc.getParentAtoms();
const auto & thermostatedAtoms = nhc.getThermostatedAtoms();
if (chainID >= masks.size()) masks.resize(chainID+1);
auto nAtoms = cu.getPaddedNumAtoms();
if (!masks[chainID].isInitialized()) {
// We need to upload the CUDA array
std::vector<int> maskVec(nAtoms);
for (int i = 0; i < nAtoms; ++i) maskVec[i] = i;
if (parents.size() ) {
assert(parents.size() == thermostatedAtoms.size());
for ( size_t i = 0; i < parents.size(); ++i) {
int atom = thermostatedAtoms[i];
maskVec[atom] = parents[i];
}
} else {
for ( const auto & atom : thermostatedAtoms) {
maskVec[atom] = -1L;
}
}
// Account for padding in the paddedNumAtoms loops
for(int i = thermostatedAtoms.size(); i < nAtoms; ++i){
maskVec[i] = i;
}
masks[chainID].initialize<int>(cu, nAtoms, "mask" + std::to_string(chainID));
masks[chainID].upload(maskVec);
}
if (masks[chainID].getSize() != nAtoms) {
throw OpenMMException("Number of atoms changed. Cannot be handled by the same Nose-Hoover thermostat.");
}
if (!kineticEnergyBuffer.isInitialized() || kineticEnergyBuffer.getSize() == 0) {
if(useDouble){
std::vector<double> one(1);
kineticEnergyBuffer.initialize<double>(cu, 1, "kineticEnergyBuffer");
kineticEnergyBuffer.upload(one);
} else {
std::vector<float> one(1);
kineticEnergyBuffer.initialize<float>(cu, 1, "kineticEnergyBuffer");
kineticEnergyBuffer.upload(one);
}
}
cu.clearBuffer(cu.getEnergyBuffer());
void *args[] = {&cu.getEnergyBuffer().getDevicePointer(),&nAtoms, &cu.getVelm().getDevicePointer(), &masks[chainID].getDevicePointer()};
cu.executeKernel(computeMaskedKineticEnergyKernel, args, nAtoms);
//taken from CudaContext::reduceEnergy(); the final kinetic energy will live in the kineticEnergy buffer
int bufferSize = cu.getEnergyBuffer().getSize();
int workGroupSize = 512;
void* args2[] = {&cu.getEnergyBuffer().getDevicePointer(), &kineticEnergyBuffer.getDevicePointer(), &bufferSize, &workGroupSize};
cu.executeKernel(reduceEnergyKernel, args2, workGroupSize, workGroupSize, workGroupSize*cu.getEnergyBuffer().getElementSize());
return 0;
}
void CudaNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &noseHooverChain, double scaleFactor) {
void CudaNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, double scaleFactor) {
// For now we assume that the mask info is valid, because computeMaskedKineticEnergy must have been
// called before this kernel. If that ever ceases to be true, some sanity checks are needed here.
cu.setAsCurrent();
auto nAtoms = cu.getPaddedNumAtoms();
int chainID = nhc.getDefaultChainID();
void *args[] = {&scaleFactorBuffer.getDevicePointer(),
&nAtoms, &cu.getVelm().getDevicePointer(), &masks[chainID].getDevicePointer()};
cu.executeKernel(scaleVelocitiesKernel, args, nAtoms);
}
void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {
......
......@@ -4,67 +4,101 @@
/**
* Propagate the Nose-Hoover chain with one yoshida-suzuki term
*/
extern "C" __global__ void propagateNoseHooverChain(real* __restrict__ scaleFactor, real* __restrict__ chainMasses, real* __restrict__ chainForces,
extern "C" __global__ void propagateNoseHooverChain(mixed2* __restrict__ chainData, const mixed * __restrict__ energySum, mixed* __restrict__ scaleFactor,
mixed* __restrict__ chainMasses, mixed* __restrict__ chainForces,
int chainLength, int numMTS, int numDOFs, float timeStep,
real kT, float frequency, real kineticEnergy, real2* __restrict__ chainData){
real &scale = scaleFactor[0];
scale = (real) 1;
for (int bead = 0; bead < chainLength; ++bead) chainMasses[bead] = kT / (frequency * frequency);
chainMasses[0] *= numDOFs;
real KE2 = 2.0f * kineticEnergy;
real timeOverMTS = timeStep / numMTS;
chainForces[0] = (KE2 - numDOFs * kT) / chainMasses[0];
for (int bead = 0; bead < chainLength - 1; ++bead) {
chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
}
for (int mts = 0; mts < numMTS; ++mts) {
BEGIN_YS_LOOP
real wdt = ys * timeOverMTS;
chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
for (int bead = chainLength - 2; bead >= 0; --bead) {
real aa = exp(-0.125f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]);
}
// update particle velocities
real aa = exp(-0.5f * wdt * chainData[0].y);
scale *= aa;
// update the thermostat positions
for (int bead = 0; bead < chainLength; ++bead) {
chainData[bead].x += 0.5f * chainData[bead].y * wdt;
}
// update the forces
chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
// update thermostat velocities
for (int bead = 0; bead < chainLength - 1; ++bead) {
real aa = exp(-0.125f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
}
chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
END_YS_LOOP
} // MTS loop
//printf("SCALE %f\n", scale);
mixed kT, float frequency){
mixed &scale = scaleFactor[0];
scale = (mixed) 1;
const mixed & kineticEnergy = energySum[0];
if(kineticEnergy < 1e-8) return;
for (int bead = 0; bead < chainLength; ++bead) chainMasses[bead] = kT / (frequency * frequency);
chainMasses[0] *= numDOFs;
mixed KE2 = 2.0f * kineticEnergy;
mixed timeOverMTS = timeStep / numMTS;
chainForces[0] = (KE2 - numDOFs * kT) / chainMasses[0];
for (int bead = 0; bead < chainLength - 1; ++bead) {
chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
}
for (int mts = 0; mts < numMTS; ++mts) {
BEGIN_YS_LOOP
mixed wdt = ys * timeOverMTS;
chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
for (int bead = chainLength - 2; bead >= 0; --bead) {
mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]);
}
// update particle velocities
mixed aa = EXP(-0.5f * wdt * chainData[0].y);
scale *= aa;
// update the thermostat positions
for (int bead = 0; bead < chainLength; ++bead) {
chainData[bead].x += 0.5f * chainData[bead].y * wdt;
}
// update the forces
chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
// update thermostat velocities
for (int bead = 0; bead < chainLength - 1; ++bead) {
mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
}
chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
END_YS_LOOP
} // MTS loop
}
/**
* Compute total (potential + kinetic) energy of the Nose-Hoover beads
*/
extern "C" __global__ void computeHeatBathEnergy(real* __restrict__ heatBathEnergy, int chainLength, int numDOFs,
real kT, float frequency, real2* __restrict__ chainData){
extern "C" __global__ void computeHeatBathEnergy(mixed* __restrict__ heatBathEnergy, int chainLength, int numDOFs,
mixed kT, float frequency, const mixed2* __restrict__ chainData){
real &energy = heatBathEnergy[0];
energy = (real) 0;
mixed &energy = heatBathEnergy[0];
energy = (mixed) 0;
for(int i = 0; i < chainLength; ++i) {
real prefac = i ? 1 : numDOFs;
real mass = prefac * kT / (frequency * frequency);
real velocity = chainData[i].y;
mixed prefac = i ? 1 : numDOFs;
mixed mass = prefac * kT / (frequency * frequency);
mixed velocity = chainData[i].y;
// The kinetic energy of this bead
energy += 0.5f * mass * velocity * velocity;
// The potential energy of this bead
real position = chainData[i].x;
mixed position = chainData[i].x;
energy += prefac * kT * position;
}
}
extern "C" __global__ void computeMaskedKineticEnergy(mixed * __restrict__ energyBuffer, int paddedNumAtoms,
const mixed4* __restrict__ velm, const int *__restrict__ mask){
mixed energy = 0;
//energy = 1; return;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < paddedNumAtoms; index += blockDim.x*gridDim.x) {
mixed4 v = velm[index];
mixed mass = v.w == 0 ? 0 : 1 / v.w;
if (mask[index] >= 0){
const mixed4& vparent = velm[mask[index]];
mixed massp = vparent.w == 0 ? 0 : 1/vparent.w;
mass = (massp + mass) == 0 ? 0 : (massp * mass) / ( massp + mass );
v.x -= vparent.x;
v.y -= vparent.y;
v.z -= vparent.z;
}
energy += 0.5f * mass * (v.x*v.x + v.y*v.y + v.z*v.z);
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] = energy;
}
extern "C" __global__ void scaleVelocities(mixed * __restrict__ scaleFactor, int paddedNumAtoms,
mixed4* __restrict__ velm, const int *__restrict__ mask){
const mixed &scale = scaleFactor[0];
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < paddedNumAtoms; index += blockDim.x*gridDim.x) {
mixed4 &v = velm[index];
mixed4 vparent = mask[index] >= 0 ? velm[mask[index]] : make_mixed4(0.0f, 0.0f, 0.0f, 0.0f);
mixed maskedScale = mask[index] == index ? 1 : scale;
v.x = vparent.x + maskedScale * (v.x - vparent.x);
v.y = vparent.y + maskedScale * (v.y - vparent.y);
v.z = vparent.z + maskedScale * (v.z - vparent.z);
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Authors: Andreas Krämer and Andrew C. Simmonett *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CudaTests.h"
#include "TestNoseHooverThermostat.h"
void runPlatformTests() {
}
......@@ -33,4 +33,6 @@
#include "TestNoseHooverThermostat.h"
void runPlatformTests() {
testNHCPropagation();
testPropagateChainConsistentWithPythonReference();
}
......@@ -265,8 +265,6 @@ void runPlatformTests();
int main(int argc, char* argv[]) {
try {
initializeTests(argc, argv);
testNHCPropagation();
testPropagateChainConsistentWithPythonReference();
testHarmonicOscillator();
bool constrain;
constrain = false; testDimerBox(constrain);
......
......@@ -184,7 +184,7 @@ def buildKeywordDictionary(major_version_num=MAJOR_VERSION_NUM,
if not openmm_lib_path:
reportError("Set OPENMM_LIB_PATH to point to the lib directory for OpenMM")
extra_compile_args=[]
extra_compile_args=['-std=c++11']
extra_link_args=[]
if platform.system() == "Windows":
define_macros.append( ('WIN32', None) )
......@@ -193,7 +193,7 @@ def buildKeywordDictionary(major_version_num=MAJOR_VERSION_NUM,
extra_compile_args.append('/EHsc')
else:
if platform.system() == 'Darwin':
extra_compile_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7']
extra_compile_args += ['-std=c++11 -stdlib=libc++', '-mmacosx-version-min=10.7']
extra_link_args += ['-stdlib=libc++', '-mmacosx-version-min=10.7', '-Wl', '-rpath', openmm_lib_path]
# Hard-code CC and CXX to clang, since gcc/g++ will *not* work with
# Anaconda, despite the fact that distutils will try to use them.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment