CudaKernels.cpp 110 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2008-2019 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

#include "CudaKernels.h"
#include "CudaForceInfo.h"
#include "openmm/Context.h"
#include "openmm/internal/AndersenThermostatImpl.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/CustomCompoundBondForceImpl.h"
#include "openmm/internal/CustomHbondForceImpl.h"
#include "openmm/internal/NonbondedForceImpl.h"
35
#include "openmm/internal/OSRngSeed.h"
36
37
38
#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaIntegrationUtilities.h"
39
#include "CudaNonbondedUtilities.h"
40
#include "CudaKernelSources.h"
41
#include "lepton/CustomFunction.h"
42
43
44
45
#include "lepton/ExpressionTreeNode.h"
#include "lepton/Operation.h"
#include "lepton/Parser.h"
#include "lepton/ParsedExpression.h"
46
#include "ReferenceTabulatedFunction.h"
47
48
#include "SimTKOpenMMRealType.h"
#include "SimTKOpenMMUtilities.h"
49
#include <algorithm>
50
#include <cmath>
51
#include <iterator>
52
#include <set>
Andy Simmonett's avatar
Andy Simmonett committed
53
#include <assert.h>
54
55
56

using namespace OpenMM;
using namespace std;
57
using namespace Lepton;
58

59
60
61
62
63
64
65
#define CHECK_RESULT(result, prefix) \
    if (result != CUDA_SUCCESS) { \
        std::stringstream m; \
        m<<prefix<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
        throw OpenMMException(m.str());\
    }

66
67
68
69
70
71
72
73
74
75
76
static bool isZeroExpression(const Lepton::ParsedExpression& expression) {
    const Lepton::Operation& op = expression.getRootNode().getOperation();
    if (op.getId() != Lepton::Operation::CONSTANT)
        return false;
    return (dynamic_cast<const Lepton::Operation::Constant&>(op).getValue() == 0.0);
}

static bool usesVariable(const Lepton::ExpressionTreeNode& node, const string& variable) {
    const Lepton::Operation& op = node.getOperation();
    if (op.getId() == Lepton::Operation::VARIABLE && op.getName() == variable)
        return true;
peastman's avatar
peastman committed
77
78
    for (auto& child : node.getChildren())
        if (usesVariable(child, variable))
79
80
81
82
83
84
85
86
87
88
89
90
            return true;
    return false;
}

static bool usesVariable(const Lepton::ParsedExpression& expression, const string& variable) {
    return usesVariable(expression.getRootNode(), variable);
}

static pair<ExpressionTreeNode, string> makeVariable(const string& name, const string& value) {
    return make_pair(ExpressionTreeNode(new Operation::Variable(name)), value);
}

91
92
93
94
95
96
97
98
99
static void replaceFunctionsInExpression(map<string, CustomFunction*>& functions, ExpressionProgram& expression) {
    for (int i = 0; i < expression.getNumOperations(); i++) {
        if (expression.getOperation(i).getId() == Operation::CUSTOM) {
            const Operation::Custom& op = dynamic_cast<const Operation::Custom&>(expression.getOperation(i));
            expression.setOperation(i, new Operation::Custom(op.getName(), functions[op.getName()]->clone(), op.getDerivOrder()));
        }
    }
}

100
101
102
103
void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
}

void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
104
    cu.setForcesValid(true);
105
    cu.setAsCurrent();
106
    cu.clearAutoclearBuffers();
peastman's avatar
peastman committed
107
108
    for (auto computation : cu.getPreComputations())
        computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
109
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
110
    cu.setComputeForceCount(cu.getComputeForceCount()+1);
111
    nb.prepareInteractions(groups);
112
    map<string, double>& derivs = cu.getEnergyParamDerivWorkspace();
peastman's avatar
peastman committed
113
114
    for (auto& param : context.getParameters())
        derivs[param.first] = 0;
115
116
}

117
double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups, bool& valid) {
118
    cu.setAsCurrent();
119
    cu.getBondedUtilities().computeInteractions(groups);
120
    cu.getNonbondedUtilities().computeInteractions(groups, includeForces, includeEnergy);
121
    double sum = 0.0;
peastman's avatar
peastman committed
122
123
    for (auto computation : cu.getPostComputations())
        sum += computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
124
    cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
Peter Eastman's avatar
Peter Eastman committed
125
126
    if (includeEnergy)
        sum += cu.reduceEnergy();
127
128
    if (!cu.getForcesValid())
        valid = false;
129
130
131
132
133
134
135
136
137
138
139
140
    return sum;
}

void CudaUpdateStateDataKernel::initialize(const System& system) {
}

double CudaUpdateStateDataKernel::getTime(const ContextImpl& context) const {
    return cu.getTime();
}

void CudaUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
    vector<CudaContext*>& contexts = cu.getPlatformData().contexts;
peastman's avatar
peastman committed
141
142
    for (auto ctx : contexts)
        ctx->setTime(time);
143
144
}

peastman's avatar
peastman committed
145
146
147
148
149
150
151
152
void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
    cu.setAsCurrent();
    int numParticles = context.getSystem().getNumParticles();
    positions.resize(numParticles);
    vector<float4> posCorrection;
    if (cu.getUseDoublePrecision()) {
        double4* posq = (double4*) cu.getPinnedBuffer();
        cu.getPosq().download(posq);
153
    }
peastman's avatar
peastman committed
154
155
156
157
158
    else if (cu.getUseMixedPrecision()) {
        float4* posq = (float4*) cu.getPinnedBuffer();
        cu.getPosq().download(posq, false);
        posCorrection.resize(numParticles);
        cu.getPosqCorrection().download(posCorrection);
159
    }
peastman's avatar
peastman committed
160
161
162
163
164
165
166
167
    else {
        float4* posq = (float4*) cu.getPinnedBuffer();
        cu.getPosq().download(posq);
    }
    
    // Filling in the output array is done in parallel for speed.
    
    cu.getPlatformData().threads.execute([&] (ThreadPool& threads, int threadIndex) {
168
169
170
171
172
173
174
175
176
177
178
179
180
        // Compute the position of each particle to return to the user.  This is done in parallel for speed.
        
        const vector<int>& order = cu.getAtomIndex();
        int numParticles = cu.getNumAtoms();
        Vec3 boxVectors[3];
        cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
        int numThreads = threads.getNumThreads();
        int start = threadIndex*numParticles/numThreads;
        int end = (threadIndex+1)*numParticles/numThreads;
        if (cu.getUseDoublePrecision()) {
            double4* posq = (double4*) cu.getPinnedBuffer();
            for (int i = start; i < end; ++i) {
                double4 pos = posq[i];
181
                mm_int4 offset = cu.getPosCellOffsets()[i];
182
183
184
185
186
187
188
189
                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
            }
        }
        else if (cu.getUseMixedPrecision()) {
            float4* posq = (float4*) cu.getPinnedBuffer();
            for (int i = start; i < end; ++i) {
                float4 pos1 = posq[i];
                float4 pos2 = posCorrection[i];
190
                mm_int4 offset = cu.getPosCellOffsets()[i];
191
192
193
194
195
196
197
                positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x, (double)pos1.y+(double)pos2.y, (double)pos1.z+(double)pos2.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
            }
        }
        else {
            float4* posq = (float4*) cu.getPinnedBuffer();
            for (int i = start; i < end; ++i) {
                float4 pos = posq[i];
198
                mm_int4 offset = cu.getPosCellOffsets()[i];
199
200
201
                positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
            }
        }
peastman's avatar
peastman committed
202
    });
203
    cu.getPlatformData().threads.waitForThreads();
204
205
206
}

void CudaUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
207
    cu.setAsCurrent();
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    if (cu.getUseDoublePrecision()) {
        double4* posq = (double4*) cu.getPinnedBuffer();
        cu.getPosq().download(posq);
        for (int i = 0; i < numParticles; ++i) {
            double4& pos = posq[i];
            const Vec3& p = positions[order[i]];
            pos.x = p[0];
            pos.y = p[1];
            pos.z = p[2];
        }
        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
            posq[i] = make_double4(0.0, 0.0, 0.0, 0.0);
        cu.getPosq().upload(posq);
    }
    else {
        float4* posq = (float4*) cu.getPinnedBuffer();
        cu.getPosq().download(posq);
        for (int i = 0; i < numParticles; ++i) {
            float4& pos = posq[i];
            const Vec3& p = positions[order[i]];
230
231
232
            pos.x = (float) p[0];
            pos.y = (float) p[1];
            pos.z = (float) p[2];
233
234
        }
        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
Peter Eastman's avatar
Peter Eastman committed
235
            posq[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
236
237
        cu.getPosq().upload(posq);
    }
238
239
240
241
242
243
244
245
246
247
248
    if (cu.getUseMixedPrecision()) {
        float4* posCorrection = (float4*) cu.getPinnedBuffer();
        for (int i = 0; i < numParticles; ++i) {
            float4& c = posCorrection[i];
            const Vec3& p = positions[order[i]];
            c.x = (float) (p[0]-(float)p[0]);
            c.y = (float) (p[1]-(float)p[1]);
            c.z = (float) (p[2]-(float)p[2]);
            c.w = 0;
        }
        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
Peter Eastman's avatar
Peter Eastman committed
249
            posCorrection[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
250
251
        cu.getPosqCorrection().upload(posCorrection);
    }
peastman's avatar
peastman committed
252
    for (auto& offset : cu.getPosCellOffsets())
253
        offset = mm_int4(0, 0, 0, 0);
254
    cu.reorderAtoms();
255
256
257
}

void CudaUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>& velocities) {
258
    cu.setAsCurrent();
259
260
261
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    velocities.resize(numParticles);
262
    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
263
264
265
266
        double4* velm = (double4*) cu.getPinnedBuffer();
        cu.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
            double4 vel = velm[i];
267
            mm_int4 offset = cu.getPosCellOffsets()[i];
268
269
270
271
272
273
274
275
            velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
        }
    }
    else {
        float4* velm = (float4*) cu.getPinnedBuffer();
        cu.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
            float4 vel = velm[i];
276
            mm_int4 offset = cu.getPosCellOffsets()[i];
277
278
279
280
281
282
            velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
        }
    }
}

void CudaUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector<Vec3>& velocities) {
283
    cu.setAsCurrent();
284
285
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
286
    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
        double4* velm = (double4*) cu.getPinnedBuffer();
        cu.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
            double4& vel = velm[i];
            const Vec3& p = velocities[order[i]];
            vel.x = p[0];
            vel.y = p[1];
            vel.z = p[2];
        }
        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
            velm[i] = make_double4(0.0, 0.0, 0.0, 0.0);
        cu.getVelm().upload(velm);
    }
    else {
        float4* velm = (float4*) cu.getPinnedBuffer();
        cu.getVelm().download(velm);
        for (int i = 0; i < numParticles; ++i) {
            float4& vel = velm[i];
            const Vec3& p = velocities[order[i]];
            vel.x = p[0];
            vel.y = p[1];
            vel.z = p[2];
        }
        for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
Peter Eastman's avatar
Peter Eastman committed
311
            velm[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
312
313
314
315
316
        cu.getVelm().upload(velm);
    }
}

void CudaUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& forces) {
317
    cu.setAsCurrent();
318
319
320
321
322
323
    long long* force = (long long*) cu.getPinnedBuffer();
    cu.getForce().download(force);
    const vector<int>& order = cu.getAtomIndex();
    int numParticles = context.getSystem().getNumParticles();
    int paddedNumParticles = cu.getPaddedNumAtoms();
    forces.resize(numParticles);
324
    double scale = 1.0/(double) 0x100000000LL;
325
326
327
328
    for (int i = 0; i < numParticles; ++i)
        forces[order[i]] = Vec3(scale*force[i], scale*force[i+paddedNumParticles], scale*force[i+paddedNumParticles*2]);
}

329
void CudaUpdateStateDataKernel::getEnergyParameterDerivatives(ContextImpl& context, map<string, double>& derivs) {
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
    const vector<string>& paramDerivNames = cu.getEnergyParamDerivNames();
    int numDerivs = paramDerivNames.size();
    if (numDerivs == 0)
        return;
    derivs = cu.getEnergyParamDerivWorkspace();
    CudaArray& derivArray = cu.getEnergyParamDerivBuffer();
    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
        vector<double> derivBuffers;
        derivArray.download(derivBuffers);
        for (int i = numDerivs; i < derivArray.getSize(); i += numDerivs)
            for (int j = 0; j < numDerivs; j++)
                derivBuffers[j] += derivBuffers[i+j];
        for (int i = 0; i < numDerivs; i++)
            derivs[paramDerivNames[i]] += derivBuffers[i];
    }
    else {
        vector<float> derivBuffers;
        derivArray.download(derivBuffers);
        for (int i = numDerivs; i < derivArray.getSize(); i += numDerivs)
            for (int j = 0; j < numDerivs; j++)
                derivBuffers[j] += derivBuffers[i+j];
        for (int i = 0; i < numDerivs; i++)
            derivs[paramDerivNames[i]] += derivBuffers[i];
    }
354
355
}

356
void CudaUpdateStateDataKernel::getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const {
357
    cu.getPeriodicBoxVectors(a, b, c);
358
359
}

360
void CudaUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) {
361
    vector<CudaContext*>& contexts = cu.getPlatformData().contexts;
362
363
364
365
366

    // If any particles have been wrapped to the first periodic box, we need to unwrap them
    // to avoid changing their positions.

    vector<Vec3> positions;
peastman's avatar
peastman committed
367
    for (auto& offset : cu.getPosCellOffsets()) {
368
369
370
371
372
373
374
375
        if (offset.x != 0 || offset.y != 0 || offset.z != 0) {
            getPositions(context, positions);
            break;
        }
    }
    
    // Update the vectors.

peastman's avatar
peastman committed
376
377
    for (auto ctx : contexts)
        ctx->setPeriodicBoxVectors(a, b, c);
378
379
    if (positions.size() > 0)
        setPositions(context, positions);
380
381
382
}

void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
383
    cu.setAsCurrent();
384
    int version = 3;
385
    stream.write((char*) &version, sizeof(int));
Peter Eastman's avatar
Peter Eastman committed
386
387
    int precision = (cu.getUseDoublePrecision() ? 2 : cu.getUseMixedPrecision() ? 1 : 0);
    stream.write((char*) &precision, sizeof(int));
388
389
390
391
    double time = cu.getTime();
    stream.write((char*) &time, sizeof(double));
    int stepCount = cu.getStepCount();
    stream.write((char*) &stepCount, sizeof(int));
392
393
    int stepsSinceReorder = cu.getStepsSinceReorder();
    stream.write((char*) &stepsSinceReorder, sizeof(int));
394
395
    char* buffer = (char*) cu.getPinnedBuffer();
    cu.getPosq().download(buffer);
396
    stream.write(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
Peter Eastman's avatar
Peter Eastman committed
397
398
399
400
    if (cu.getUseMixedPrecision()) {
        cu.getPosqCorrection().download(buffer);
        stream.write(buffer, cu.getPosqCorrection().getSize()*cu.getPosqCorrection().getElementSize());
    }
401
    cu.getVelm().download(buffer);
402
    stream.write(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
403
404
    stream.write((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
    stream.write((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
405
406
407
    Vec3 boxVectors[3];
    cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
    stream.write((char*) boxVectors, 3*sizeof(Vec3));
408
409
    cu.getIntegrationUtilities().createCheckpoint(stream);
    SimTKOpenMMUtilities::createCheckpoint(stream);
410
411
412
}

void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
413
    cu.setAsCurrent();
414
415
    int version;
    stream.read((char*) &version, sizeof(int));
416
    if (version != 3)
417
        throw OpenMMException("Checkpoint was created with a different version of OpenMM");
Peter Eastman's avatar
Peter Eastman committed
418
419
420
421
422
    int precision;
    stream.read((char*) &precision, sizeof(int));
    int expectedPrecision = (cu.getUseDoublePrecision() ? 2 : cu.getUseMixedPrecision() ? 1 : 0);
    if (precision != expectedPrecision)
        throw OpenMMException("Checkpoint was created with a different numeric precision");
423
424
    double time;
    stream.read((char*) &time, sizeof(double));
425
    int stepCount, stepsSinceReorder;
426
    stream.read((char*) &stepCount, sizeof(int));
427
    stream.read((char*) &stepsSinceReorder, sizeof(int));
428
    vector<CudaContext*>& contexts = cu.getPlatformData().contexts;
peastman's avatar
peastman committed
429
430
431
432
    for (auto ctx : contexts) {
        ctx->setTime(time);
        ctx->setStepCount(stepCount);
        ctx->setStepsSinceReorder(stepsSinceReorder);
433
434
    }
    char* buffer = (char*) cu.getPinnedBuffer();
435
    stream.read(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
436
    cu.getPosq().upload(buffer);
Peter Eastman's avatar
Peter Eastman committed
437
438
439
440
    if (cu.getUseMixedPrecision()) {
        stream.read(buffer, cu.getPosqCorrection().getSize()*cu.getPosqCorrection().getElementSize());
        cu.getPosqCorrection().upload(buffer);
    }
441
    stream.read(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
442
443
444
445
    cu.getVelm().upload(buffer);
    stream.read((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
    cu.getAtomIndexArray().upload(cu.getAtomIndex());
    stream.read((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
446
447
    Vec3 boxVectors[3];
    stream.read((char*) &boxVectors, 3*sizeof(Vec3));
peastman's avatar
peastman committed
448
449
    for (auto ctx : contexts)
        ctx->setPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
450
451
    cu.getIntegrationUtilities().loadCheckpoint(stream);
    SimTKOpenMMUtilities::loadCheckpoint(stream);
peastman's avatar
peastman committed
452
453
    for (auto listener : cu.getReorderListeners())
        listener->execute();
454
455
456
457
458
459
}

void CudaApplyConstraintsKernel::initialize(const System& system) {
}

void CudaApplyConstraintsKernel::apply(ContextImpl& context, double tol) {
460
    cu.setAsCurrent();
461
462
463
464
465
466
467
468
469
    if (!hasInitializedKernel) {
        hasInitializedKernel = true;
        map<string, string> defines;
        CUmodule module = cu.createModule(CudaKernelSources::constraints, defines);
        applyDeltasKernel = cu.getKernel(module, "applyPositionDeltas");
    }
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
    cu.clearBuffer(integration.getPosDelta());
    integration.applyConstraints(tol);
470
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
471
472
    int numAtoms = cu.getNumAtoms();
    void* args[] = {&numAtoms, &cu.getPosq().getDevicePointer(), &posCorrection, &cu.getIntegrationUtilities().getPosDelta().getDevicePointer()};
473
474
    cu.executeKernel(applyDeltasKernel, args, cu.getNumAtoms());
    integration.computeVirtualSites();
475
476
}

477
478
479
480
void CudaApplyConstraintsKernel::applyToVelocities(ContextImpl& context, double tol) {
    cu.getIntegrationUtilities().applyVelocityConstraints(tol);
}

481
482
483
484
485
486
487
void CudaVirtualSitesKernel::initialize(const System& system) {
}

void CudaVirtualSitesKernel::computePositions(ContextImpl& context) {
    cu.getIntegrationUtilities().computeVirtualSites();
}

488
class CudaCalcNonbondedForceKernel::ForceInfo : public CudaForceInfo {
489
public:
490
491
492
493
494
495
496
    ForceInfo(const NonbondedForce& force) : force(force) {
    }
    bool areParticlesIdentical(int particle1, int particle2) {
        double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2;
        force.getParticleParameters(particle1, charge1, sigma1, epsilon1);
        force.getParticleParameters(particle2, charge2, sigma2, epsilon2);
        return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2);
497
498
    }
    int getNumParticleGroups() {
499
        return force.getNumExceptions();
500
501
502
    }
    void getParticlesInGroup(int index, vector<int>& particles) {
        int particle1, particle2;
503
504
        double chargeProd, sigma, epsilon;
        force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon);
505
506
507
508
509
510
        particles.resize(2);
        particles[0] = particle1;
        particles[1] = particle2;
    }
    bool areGroupsIdentical(int group1, int group2) {
        int particle1, particle2;
511
512
513
514
        double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2;
        force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1);
        force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2);
        return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2);
515
516
    }
private:
517
    const NonbondedForce& force;
518
519
};

520
class CudaCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
521
public:
522
523
    PmeIO(CudaContext& cu, CUfunction addForcesKernel) : cu(cu), addForcesKernel(addForcesKernel) {
        forceTemp.initialize<float4>(cu, cu.getNumAtoms(), "PmeForce");
524
    }
525
526
527
528
    float* getPosq() {
        cu.setAsCurrent();
        cu.getPosq().download(posq);
        return (float*) &posq[0];
529
    }
530
531
532
533
    void setForce(float* force) {
        forceTemp.upload(force);
        void* args[] = {&forceTemp.getDevicePointer(), &cu.getForce().getDevicePointer()};
        cu.executeKernel(addForcesKernel, args, cu.getNumAtoms());
534
535
    }
private:
536
537
538
539
    CudaContext& cu;
    vector<float4> posq;
    CudaArray forceTemp;
    CUfunction addForcesKernel;
540
541
};

542
543
544
class CudaCalcNonbondedForceKernel::PmePreComputation : public CudaContext::ForcePreComputation {
public:
    PmePreComputation(CudaContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
545
    }
546
547
548
    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        Vec3 boxVectors[3] = {Vec3(cu.getPeriodicBoxSize().x, 0, 0), Vec3(0, cu.getPeriodicBoxSize().y, 0), Vec3(0, 0, cu.getPeriodicBoxSize().z)};
        pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxVectors, includeEnergy);
549
    }
550
551
552
553
554
private:
    CudaContext& cu;
    Kernel pme;
    CalcPmeReciprocalForceKernel::IO& io;
};
555

556
class CudaCalcNonbondedForceKernel::PmePostComputation : public CudaContext::ForcePostComputation {
557
public:
558
    PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
559
    }
560
561
    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
562
563
    }
private:
564
565
    Kernel pme;
    CalcPmeReciprocalForceKernel::IO& io;
566
567
};

568
569
570
class CudaCalcNonbondedForceKernel::SyncStreamPreComputation : public CudaContext::ForcePreComputation {
public:
    SyncStreamPreComputation(CudaContext& cu, CUstream stream, CUevent event, int forceGroup) : cu(cu), stream(stream), event(event), forceGroup(forceGroup) {
571
    }
572
573
574
575
576
    void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        if ((groups&(1<<forceGroup)) != 0) {
            cuEventRecord(event, cu.getCurrentStream());
            cuStreamWaitEvent(stream, event, 0);
        }
577
    }
578
579
580
581
582
583
private:
    CudaContext& cu;
    CUstream stream;
    CUevent event;
    int forceGroup;
};
584

585
class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
586
public:
587
588
    SyncStreamPostComputation(CudaContext& cu, CUevent event, CUfunction addEnergyKernel, CudaArray& pmeEnergyBuffer, int forceGroup) : cu(cu), event(event),
            addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
589
    }
590
591
592
593
594
595
596
597
598
599
    double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
        if ((groups&(1<<forceGroup)) != 0) {
            cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
            if (includeEnergy) {
                int bufferSize = pmeEnergyBuffer.getSize();
                void* args[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
                cu.executeKernel(addEnergyKernel, args, bufferSize);
            }
        }
        return 0.0;
600
601
    }
private:
602
603
604
605
606
    CudaContext& cu;
    CUevent event;
    CUfunction addEnergyKernel;
    CudaArray& pmeEnergyBuffer;
    int forceGroup;
607
608
};

609
CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
610
    cu.setAsCurrent();
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
    if (sort != NULL)
        delete sort;
    if (fft != NULL)
        delete fft;
    if (dispersionFft != NULL)
        delete dispersionFft;
    if (pmeio != NULL)
        delete pmeio;
    if (hasInitializedFFT) {
        if (useCudaFFT) {
            cufftDestroy(fftForward);
            cufftDestroy(fftBackward);
            if (doLJPME) {
                cufftDestroy(dispersionFftForward);
                cufftDestroy(dispersionFftBackward);                
            }
        }
        if (usePmeStream) {
            cuStreamDestroy(pmeStream);
            cuEventDestroy(pmeSyncEvent);
            cuEventDestroy(paramsSyncEvent);
        }
    }
634
635
}

636
void CudaCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
637
    cu.setAsCurrent();
638
639
640
641
    int forceIndex;
    for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
        ;
    string prefix = "nonbonded"+cu.intToString(forceIndex)+"_";
642

643
    // Identify which exceptions are 1-4 interactions.
644

645
646
647
648
649
650
651
    set<int> exceptionsWithOffsets;
    for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
        string param;
        int exception;
        double charge, sigma, epsilon;
        force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
        exceptionsWithOffsets.insert(exception);
652
    }
653
654
655
656
657
658
659
660
661
662
663
    vector<pair<int, int> > exclusions;
    vector<int> exceptions;
    map<int, int> exceptionIndex;
    for (int i = 0; i < force.getNumExceptions(); i++) {
        int particle1, particle2;
        double chargeProd, sigma, epsilon;
        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
        exclusions.push_back(pair<int, int>(particle1, particle2));
        if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end()) {
            exceptionIndex[i] = exceptions.size();
            exceptions.push_back(i);
664
665
666
        }
    }

667
    // Initialize nonbonded interactions.
668

669
670
671
672
673
674
675
676
677
678
679
680
681
682
    int numParticles = force.getNumParticles();
    vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
    vector<vector<int> > exclusionList(numParticles);
    hasCoulomb = false;
    hasLJ = false;
    for (int i = 0; i < numParticles; i++) {
        double charge, sigma, epsilon;
        force.getParticleParameters(i, charge, sigma, epsilon);
        baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
        exclusionList[i].push_back(i);
        if (charge != 0.0)
            hasCoulomb = true;
        if (epsilon != 0.0)
            hasLJ = true;
683
    }
684
685
686
687
688
689
690
691
692
    for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
        string param;
        int particle;
        double charge, sigma, epsilon;
        force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
        if (charge != 0.0)
            hasCoulomb = true;
        if (epsilon != 0.0)
            hasLJ = true;
693
    }
694
695
696
    for (auto exclusion : exclusions) {
        exclusionList[exclusion.first].push_back(exclusion.second);
        exclusionList[exclusion.second].push_back(exclusion.first);
697
    }
698
699
700
701
702
    nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
    bool useCutoff = (nonbondedMethod != NoCutoff);
    bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
    doLJPME = (nonbondedMethod == LJPME && hasLJ);
    usePosqCharges = hasCoulomb ? cu.requestPosqCharges() : false;
703

704
705
706
707
708
709
    map<string, string> defines;
    defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
    defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
    defines["USE_LJ_SWITCH"] = (useCutoff && force.getUseSwitchingFunction() ? "1" : "0");
    if (useCutoff) {
        // Compute the reaction field constants.
710

711
712
713
714
715
716
717
718
719
720
721
722
        double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
        double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
        defines["REACTION_FIELD_K"] = cu.doubleToString(reactionFieldK);
        defines["REACTION_FIELD_C"] = cu.doubleToString(reactionFieldC);
        
        // Compute the switching coefficients.
        
        if (force.getUseSwitchingFunction()) {
            defines["LJ_SWITCH_CUTOFF"] = cu.doubleToString(force.getSwitchingDistance());
            defines["LJ_SWITCH_C3"] = cu.doubleToString(10/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 3.0));
            defines["LJ_SWITCH_C4"] = cu.doubleToString(15/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 4.0));
            defines["LJ_SWITCH_C5"] = cu.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
723
724
        }
    }
725
726
727
728
729
730
731
732
733
734
735
736
737
738
    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && !doLJPME)
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
    else
        dispersionCoefficient = 0.0;
    alpha = 0;
    ewaldSelfEnergy = 0.0;
    map<string, string> paramsDefines;
    hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
    if (hasOffsets)
        paramsDefines["HAS_OFFSETS"] = "1";
    if (usePosqCharges)
        paramsDefines["USE_POSQ_CHARGES"] = "1";
    if (nonbondedMethod == Ewald) {
        // Compute the Ewald parameters.
739

740
741
742
743
744
745
746
747
748
749
        int kmaxx, kmaxy, kmaxz;
        NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
        if (cu.getContextIndex() == 0) {
            paramsDefines["INCLUDE_EWALD"] = "1";
            paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
            for (int i = 0; i < numParticles; i++)
                ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
750

751
            // Create the reciprocal space kernels.
752

753
754
755
756
757
758
759
760
761
762
763
764
765
766
            map<string, string> replacements;
            replacements["NUM_ATOMS"] = cu.intToString(numParticles);
            replacements["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
            replacements["KMAX_X"] = cu.intToString(kmaxx);
            replacements["KMAX_Y"] = cu.intToString(kmaxy);
            replacements["KMAX_Z"] = cu.intToString(kmaxz);
            replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
            replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
            replacements["M_PI"] = cu.doubleToString(M_PI);
            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::ewald, replacements);
            ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
            ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
            int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
            cosSinSums.initialize(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
767
768
        }
    }
769
770
    else if (((nonbondedMethod == PME || nonbondedMethod == LJPME) && hasCoulomb) || doLJPME) {
        // Compute the PME parameters.
771

772
773
774
775
776
777
778
779
780
781
        NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
        gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
        gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
        gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);
        if (doLJPME) {
            NonbondedForceImpl::calcPMEParameters(system, force, dispersionAlpha, dispersionGridSizeX,
                                                  dispersionGridSizeY, dispersionGridSizeZ, true);
            dispersionGridSizeX = CudaFFT3D::findLegalDimension(dispersionGridSizeX);
            dispersionGridSizeY = CudaFFT3D::findLegalDimension(dispersionGridSizeY);
            dispersionGridSizeZ = CudaFFT3D::findLegalDimension(dispersionGridSizeZ);
782
783
        }

784
785
786
        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
        defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
        defines["USE_EWALD"] = "1";
787
788
789
        defines["DO_LJPME"] = doLJPME ? "1" : "0";
        if (doLJPME)
            defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
790
        if (cu.getContextIndex() == 0) {
791
792
793
794
795
796
797
798
799
800
            paramsDefines["INCLUDE_EWALD"] = "1";
            paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
            for (int i = 0; i < numParticles; i++)
                ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
            if (doLJPME) {
                paramsDefines["INCLUDE_LJPME"] = "1";
                paramsDefines["LJPME_SELF_ENERGY_SCALE"] = cu.doubleToString(pow(dispersionAlpha, 6)/3.0);
                for (int i = 0; i < numParticles; i++)
                    ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
            }
Peter Eastman's avatar
Peter Eastman committed
801
802
            char deviceName[100];
            cuDeviceGetName(deviceName, 100, cu.getDevice());
803
            usePmeStream = (!cu.getPlatformData().disablePmeStream && string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
804
            map<string, string> pmeDefines;
805
806
807
808
809
810
811
812
813
814
815
            pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
            pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
            pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
            pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
            pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
            pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
            pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
            pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
            pmeDefines["M_PI"] = cu.doubleToString(M_PI);
            if (cu.getUseDoublePrecision())
                pmeDefines["USE_DOUBLE_PRECISION"] = "1";
Peter Eastman's avatar
Peter Eastman committed
816
817
            if (usePmeStream)
                pmeDefines["USE_PME_STREAM"] = "1";
818
819
            if (cu.getPlatformData().deterministicForces)
                pmeDefines["USE_DETERMINISTIC_FORCES"] = "1";
820
821
822
            map<string, string> replacements;
            replacements["CHARGE"] = (usePosqCharges ? "pos.w" : "charges[atom]");
            CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::pme, replacements), pmeDefines);
823
            if (cu.getPlatformData().useCpuPme && !doLJPME && usePosqCharges) {
824
825
                // Create the CPU PME kernel.

826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
                try {
                    cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
                    cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha, cu.getPlatformData().deterministicForces);
                    CUfunction addForcesKernel = cu.getKernel(module, "addForces");
                    pmeio = new PmeIO(cu, addForcesKernel);
                    cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
                    cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
                }
                catch (OpenMMException& ex) {
                    // The CPU PME plugin isn't available.
                }
            }
            if (pmeio == NULL) {
                pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
                pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
                pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
                pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
                pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
                pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
                cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
                cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
                if (doLJPME) {
                    pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dispersionAlpha);
                    pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
                    pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
                    pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
                    pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
                    pmeDefines["USE_LJPME"] = "1";
                    pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
                    double invRCut6 = pow(force.getCutoffDistance(), -6);
                    double dalphaR = dispersionAlpha * force.getCutoffDistance();
                    double dar2 = dalphaR*dalphaR;
                    double dar4 = dar2*dar2;
                    double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
                    defines["INVCUT6"] = cu.doubleToString(invRCut6);
                    defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
                    module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
                    pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
                    pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
                    pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
                    pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
                    pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
                    pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateForce");
                    cuFuncSetCacheConfig(pmeDispersionSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
                }
peastman's avatar
peastman committed
871

872
                // Create required data structures.
peastman's avatar
peastman committed
873

874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
                int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
                int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
                int gridElements = gridSizeX*gridSizeY*roundedZSize;
                if (doLJPME) {
                    roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
                    gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
                }
                pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
                pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
                cu.addAutoclearBuffer(pmeGrid2);
                pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
                pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
                pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
                if (doLJPME) {
                    pmeDispersionBsplineModuliX.initialize(cu, dispersionGridSizeX, elementSize, "pmeDispersionBsplineModuliX");
                    pmeDispersionBsplineModuliY.initialize(cu, dispersionGridSizeY, elementSize, "pmeDispersionBsplineModuliY");
                    pmeDispersionBsplineModuliZ.initialize(cu, dispersionGridSizeZ, elementSize, "pmeDispersionBsplineModuliZ");
                }
                pmeAtomGridIndex.initialize<int2>(cu, numParticles, "pmeAtomGridIndex");
                int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
                pmeEnergyBuffer.initialize(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
                cu.clearBuffer(pmeEnergyBuffer);
                sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
                int cufftVersion;
                cufftGetVersion(&cufftVersion);
                useCudaFFT = (cufftVersion >= 7050); // There was a critical bug in version 7.0
                if (useCudaFFT) {
                    cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
                    if (result != CUFFT_SUCCESS)
                        throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
                    result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
                    if (result != CUFFT_SUCCESS)
                        throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
                    if (doLJPME) {
                        result = cufftPlan3d(&dispersionFftForward, dispersionGridSizeX, dispersionGridSizeY, 
                                                dispersionGridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
                        if (result != CUFFT_SUCCESS)
                            throw OpenMMException("Error initializing disperison FFT: "+cu.intToString(result));
                        result = cufftPlan3d(&dispersionFftBackward, dispersionGridSizeX, dispersionGridSizeY,
                                             dispersionGridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
                        if (result != CUFFT_SUCCESS)
                            throw OpenMMException("Error initializing disperison FFT: "+cu.intToString(result));
                    }
                }
                else {
                    fft = new CudaFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
                    if (doLJPME)
                        dispersionFft = new CudaFFT3D(cu, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
                }
923

924
                // Prepare for doing PME on its own stream.
925

926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
                if (usePmeStream) {
                    cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
                    if (useCudaFFT) {
                        cufftSetStream(fftForward, pmeStream);
                        cufftSetStream(fftBackward, pmeStream);
                        if (doLJPME) {
                            cufftSetStream(dispersionFftForward, pmeStream);
                            cufftSetStream(dispersionFftBackward, pmeStream);
                        }
                    }
                    CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
                    CHECK_RESULT(cuEventCreate(&paramsSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
                    int recipForceGroup = force.getReciprocalSpaceForceGroup();
                    if (recipForceGroup < 0)
                        recipForceGroup = force.getForceGroup();
                    cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
                    cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), pmeEnergyBuffer, recipForceGroup));
                }
                hasInitializedFFT = true;
945

946
                // Initialize the b-spline moduli.
947

948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
                for (int grid = 0; grid < 2; grid++) {
                    int xsize, ysize, zsize;
                    CudaArray *xmoduli, *ymoduli, *zmoduli;
                    if (grid == 0) {
                        xsize = gridSizeX;
                        ysize = gridSizeY;
                        zsize = gridSizeZ;
                        xmoduli = &pmeBsplineModuliX;
                        ymoduli = &pmeBsplineModuliY;
                        zmoduli = &pmeBsplineModuliZ;
                    }
                    else {
                        if (!doLJPME)
                            continue;
                        xsize = dispersionGridSizeX;
                        ysize = dispersionGridSizeY;
                        zsize = dispersionGridSizeZ;
                        xmoduli = &pmeDispersionBsplineModuliX;
                        ymoduli = &pmeDispersionBsplineModuliY;
                        zmoduli = &pmeDispersionBsplineModuliZ;
                    }
                    int maxSize = max(max(xsize, ysize), zsize);
                    vector<double> data(PmeOrder);
                    vector<double> ddata(PmeOrder);
                    vector<double> bsplines_data(maxSize);
                    data[PmeOrder-1] = 0.0;
                    data[1] = 0.0;
                    data[0] = 1.0;
                    for (int i = 3; i < PmeOrder; i++) {
                        double div = 1.0/(i-1.0);
                        data[i-1] = 0.0;
                        for (int j = 1; j < (i-1); j++)
                            data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
                        data[0] = div*data[0];
                    }
983

984
                    // Differentiate.
985

986
987
988
989
990
991
992
993
994
995
996
997
                    ddata[0] = -data[0];
                    for (int i = 1; i < PmeOrder; i++)
                        ddata[i] = data[i-1]-data[i];
                    double div = 1.0/(PmeOrder-1);
                    data[PmeOrder-1] = 0.0;
                    for (int i = 1; i < (PmeOrder-1); i++)
                        data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
                    data[0] = div*data[0];
                    for (int i = 0; i < maxSize; i++)
                        bsplines_data[i] = 0.0;
                    for (int i = 1; i <= PmeOrder; i++)
                        bsplines_data[i] = data[i-1];
998

999
                    // Evaluate the actual bspline moduli for X/Y/Z.
1000

1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
                    for (int dim = 0; dim < 3; dim++) {
                        int ndata = (dim == 0 ? xsize : dim == 1 ? ysize : zsize);
                        vector<double> moduli(ndata);
                        for (int i = 0; i < ndata; i++) {
                            double sc = 0.0;
                            double ss = 0.0;
                            for (int j = 0; j < ndata; j++) {
                                double arg = (2.0*M_PI*i*j)/ndata;
                                sc += bsplines_data[j]*cos(arg);
                                ss += bsplines_data[j]*sin(arg);
                            }
                            moduli[i] = sc*sc+ss*ss;
                        }
                        for (int i = 0; i < ndata; i++)
                            if (moduli[i] < 1.0e-7)
                                moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
                        if (dim == 0)
                            xmoduli->upload(moduli, true);
                        else if (dim == 1)
                            ymoduli->upload(moduli, true);
                        else
                            zmoduli->upload(moduli, true);
                    }
                }
            }
        }
    }
1028

1029
    // Add code to subtract off the reciprocal part of excluded interactions.
1030

1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
    if ((nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) && pmeio == NULL) {
        int numContexts = cu.getPlatformData().contexts.size();
        int startIndex = cu.getContextIndex()*force.getNumExceptions()/numContexts;
        int endIndex = (cu.getContextIndex()+1)*force.getNumExceptions()/numContexts;
        int numExclusions = endIndex-startIndex;
        if (numExclusions > 0) {
            paramsDefines["HAS_EXCLUSIONS"] = "1";
            vector<vector<int> > atoms(numExclusions, vector<int>(2));
            exclusionAtoms.initialize<int2>(cu, numExclusions, "exclusionAtoms");
            exclusionParams.initialize<float4>(cu, numExclusions, "exclusionParams");
            vector<int2> exclusionAtomsVec(numExclusions);
            for (int i = 0; i < numExclusions; i++) {
                int j = i+startIndex;
                exclusionAtomsVec[i] = make_int2(exclusions[j].first, exclusions[j].second);
                atoms[i][0] = exclusions[j].first;
                atoms[i][1] = exclusions[j].second;
            }
            exclusionAtoms.upload(exclusionAtomsVec);
            map<string, string> replacements;
            replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exclusionParams.getDevicePointer(), "float4");
            replacements["EWALD_ALPHA"] = cu.doubleToString(alpha);
            replacements["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
            replacements["DO_LJPME"] = doLJPME ? "1" : "0";
            if (doLJPME)
                replacements["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
            cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::pmeExclusions, replacements), force.getForceGroup());
        }
1058
    }
1059

1060
    // Add the interaction to the default nonbonded kernel.
1061

1062
1063
1064
1065
1066
1067
1068
1069
    string source = cu.replaceStrings(CudaKernelSources::coulombLennardJones, defines);
    charges.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges");
    baseParticleParams.initialize<float4>(cu, cu.getPaddedNumAtoms(), "baseParticleParams");
    baseParticleParams.upload(baseParticleParamVec);
    map<string, string> replacements;
    if (usePosqCharges) {
        replacements["CHARGE1"] = "posq1.w";
        replacements["CHARGE2"] = "posq2.w";
1070
    }
1071
1072
1073
    else {
        replacements["CHARGE1"] = prefix+"charge1";
        replacements["CHARGE2"] = prefix+"charge2";
1074
    }
1075
1076
1077
1078
1079
1080
1081
    if (hasCoulomb)
        cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"charge", "real", 1, charges.getElementSize(), charges.getDevicePointer()));
    sigmaEpsilon.initialize<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
    if (hasLJ) {
        replacements["SIGMA_EPSILON1"] = prefix+"sigmaEpsilon1";
        replacements["SIGMA_EPSILON2"] = prefix+"sigmaEpsilon2";
        cu.getNonbondedUtilities().addParameter(CudaNonbondedUtilities::ParameterInfo(prefix+"sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon.getDevicePointer()));
1082
    }
1083
1084
    source = cu.replaceStrings(source, replacements);
    cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
1085

1086
    // Initialize the exceptions.
1087

1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
    int numExceptions = endIndex-startIndex;
    if (numExceptions > 0) {
        paramsDefines["HAS_EXCEPTIONS"] = "1";
        exceptionAtoms.resize(numExceptions);
        vector<vector<int> > atoms(numExceptions, vector<int>(2));
        exceptionParams.initialize<float4>(cu, numExceptions, "exceptionParams");
        baseExceptionParams.initialize<float4>(cu, numExceptions, "baseExceptionParams");
        vector<float4> baseExceptionParamsVec(numExceptions);
        for (int i = 0; i < numExceptions; i++) {
            double chargeProd, sigma, epsilon;
            force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
            baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
            exceptionAtoms[i] = make_pair(atoms[i][0], atoms[i][1]);
        }
        baseExceptionParams.upload(baseExceptionParamsVec);
        map<string, string> replacements;
        replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams.getDevicePointer(), "float4");
        cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
    }
    
    // Initialize parameter offsets.
1112

1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
    vector<vector<float4> > particleOffsetVec(force.getNumParticles());
    vector<vector<float4> > exceptionOffsetVec(force.getNumExceptions());
    for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
        string param;
        int particle;
        double charge, sigma, epsilon;
        force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
        auto paramPos = find(paramNames.begin(), paramNames.end(), param);
        int paramIndex;
        if (paramPos == paramNames.end()) {
            paramIndex = paramNames.size();
            paramNames.push_back(param);
        }
        else
            paramIndex = paramPos-paramNames.begin();
        particleOffsetVec[particle].push_back(make_float4(charge, sigma, epsilon, paramIndex));
    }
    for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
        string param;
        int exception;
        double charge, sigma, epsilon;
        force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
        auto paramPos = find(paramNames.begin(), paramNames.end(), param);
        int paramIndex;
        if (paramPos == paramNames.end()) {
            paramIndex = paramNames.size();
            paramNames.push_back(param);
        }
        else
            paramIndex = paramPos-paramNames.begin();
        exceptionOffsetVec[exceptionIndex[exception]].push_back(make_float4(charge, sigma, epsilon, paramIndex));
1144
    }
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
    paramValues.resize(paramNames.size(), 0.0);
    particleParamOffsets.initialize<float4>(cu, max(force.getNumParticleParameterOffsets(), 1), "particleParamOffsets");
    exceptionParamOffsets.initialize<float4>(cu, max(force.getNumExceptionParameterOffsets(), 1), "exceptionParamOffsets");
    particleOffsetIndices.initialize<int>(cu, cu.getPaddedNumAtoms()+1, "particleOffsetIndices");
    exceptionOffsetIndices.initialize<int>(cu, force.getNumExceptions()+1, "exceptionOffsetIndices");
    vector<int> particleOffsetIndicesVec, exceptionOffsetIndicesVec;
    vector<float4> p, e;
    for (int i = 0; i < particleOffsetVec.size(); i++) {
        particleOffsetIndicesVec.push_back(p.size());
        for (int j = 0; j < particleOffsetVec[i].size(); j++)
            p.push_back(particleOffsetVec[i][j]);
1156
    }
1157
1158
1159
1160
1161
1162
    while (particleOffsetIndicesVec.size() < particleOffsetIndices.getSize())
        particleOffsetIndicesVec.push_back(p.size());
    for (int i = 0; i < exceptionOffsetVec.size(); i++) {
        exceptionOffsetIndicesVec.push_back(e.size());
        for (int j = 0; j < exceptionOffsetVec[i].size(); j++)
            e.push_back(exceptionOffsetVec[i][j]);
1163
    }
1164
1165
1166
1167
    exceptionOffsetIndicesVec.push_back(e.size());
    if (force.getNumParticleParameterOffsets() > 0) {
        particleParamOffsets.upload(p);
        particleOffsetIndices.upload(particleOffsetIndicesVec);
1168
    }
1169
1170
1171
    if (force.getNumExceptionParameterOffsets() > 0) {
        exceptionParamOffsets.upload(e);
        exceptionOffsetIndices.upload(exceptionOffsetIndicesVec);
1172
    }
1173
1174
    globalParams.initialize(cu, max((int) paramValues.size(), 1), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "globalParams");
    recomputeParams = true;
1175
    
1176
    // Initialize the kernel for updating parameters.
1177
    
1178
1179
1180
1181
1182
    CUmodule module = cu.createModule(CudaKernelSources::nonbondedParameters, paramsDefines);
    computeParamsKernel = cu.getKernel(module, "computeParameters");
    computeExclusionParamsKernel = cu.getKernel(module, "computeExclusionParameters");
    info = new ForceInfo(force);
    cu.addForce(info);
1183
1184
}

1185
1186
double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
    // Update particle and exception parameters.
1187

1188
1189
1190
1191
1192
1193
    bool paramChanged = false;
    for (int i = 0; i < paramNames.size(); i++) {
        double value = context.getParameter(paramNames[i]);
        if (value != paramValues[i]) {
            paramValues[i] = value;;
            paramChanged = true;
1194
        }
1195
    }
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
    if (paramChanged) {
        recomputeParams = true;
        globalParams.upload(paramValues, true);
    }
    double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
    if (recomputeParams || hasOffsets) {
        bool computeSelfEnergy = (includeEnergy && includeReciprocal);
        int numAtoms = cu.getPaddedNumAtoms();
        vector<void*> paramsArgs = {&cu.getEnergyBuffer().getDevicePointer(), &computeSelfEnergy, &globalParams.getDevicePointer(), &numAtoms,
                &baseParticleParams.getDevicePointer(), &cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
                &particleParamOffsets.getDevicePointer(), &particleOffsetIndices.getDevicePointer()};
        int numExceptions;
        if (exceptionParams.isInitialized()) {
            numExceptions = exceptionParams.getSize();
            paramsArgs.push_back(&numExceptions);
            paramsArgs.push_back(&baseExceptionParams.getDevicePointer());
            paramsArgs.push_back(&exceptionParams.getDevicePointer());
            paramsArgs.push_back(&exceptionParamOffsets.getDevicePointer());
            paramsArgs.push_back(&exceptionOffsetIndices.getDevicePointer());
1215
        }
1216
1217
1218
1219
1220
1221
        cu.executeKernel(computeParamsKernel, &paramsArgs[0], cu.getPaddedNumAtoms());
        if (exclusionParams.isInitialized()) {
            int numExclusions = exclusionParams.getSize();
            vector<void*> exclusionParamsArgs = {&cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
                    &numExclusions, &exclusionAtoms.getDevicePointer(), &exclusionParams.getDevicePointer()};
            cu.executeKernel(computeExclusionParamsKernel, &exclusionParamsArgs[0], numExclusions);
1222
        }
1223
1224
1225
1226
1227
1228
1229
        if (usePmeStream) {
            cuEventRecord(paramsSyncEvent, cu.getCurrentStream());
            cuStreamWaitEvent(pmeStream, paramsSyncEvent, 0);
        }
        if (hasOffsets)
            energy = 0.0; // The Ewald self energy was computed in the kernel.
        recomputeParams = false;
1230
    }
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
    
    // Do reciprocal space calculations.
    
    if (cosSinSums.isInitialized() && includeReciprocal) {
        void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
        cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums.getSize());
        void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
        cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
    }
    if (pmeGrid1.isInitialized() && includeReciprocal) {
        if (usePmeStream)
            cu.setCurrentStream(pmeStream);
1243

1244
        // Invert the periodic box vectors.
1245

1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
        Vec3 boxVectors[3];
        cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
        double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
        double scale = 1.0/determinant;
        double3 recipBoxVectors[3];
        recipBoxVectors[0] = make_double3(boxVectors[1][1]*boxVectors[2][2]*scale, 0, 0);
        recipBoxVectors[1] = make_double3(-boxVectors[1][0]*boxVectors[2][2]*scale, boxVectors[0][0]*boxVectors[2][2]*scale, 0);
        recipBoxVectors[2] = make_double3((boxVectors[1][0]*boxVectors[2][1]-boxVectors[1][1]*boxVectors[2][0])*scale, -boxVectors[0][0]*boxVectors[2][1]*scale, boxVectors[0][0]*boxVectors[1][1]*scale);
        float3 recipBoxVectorsFloat[3];
        void* recipBoxVectorPointer[3];
        if (cu.getUseDoublePrecision()) {
            recipBoxVectorPointer[0] = &recipBoxVectors[0];
            recipBoxVectorPointer[1] = &recipBoxVectors[1];
            recipBoxVectorPointer[2] = &recipBoxVectors[2];
        }
        else {
            recipBoxVectorsFloat[0] = make_float3((float) recipBoxVectors[0].x, 0, 0);
            recipBoxVectorsFloat[1] = make_float3((float) recipBoxVectors[1].x, (float) recipBoxVectors[1].y, 0);
            recipBoxVectorsFloat[2] = make_float3((float) recipBoxVectors[2].x, (float) recipBoxVectors[2].y, (float) recipBoxVectors[2].z);
            recipBoxVectorPointer[0] = &recipBoxVectorsFloat[0];
            recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
            recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
        }
1269

1270
        // Execute the reciprocal space kernels.
1271

1272
1273
1274
1275
1276
        if (hasCoulomb) {
            void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
            cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
1277

1278
            sort->sort(pmeAtomGridIndex);
1279

1280
1281
1282
1283
1284
            void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
                    &charges.getDevicePointer()};
            cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
1285

1286
1287
            void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
1288

1289
1290
1291
1292
1293
1294
1295
1296
1297
            if (useCudaFFT) {
                if (cu.getUseDoublePrecision())
                    cufftExecD2Z(fftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
                else
                    cufftExecR2C(fftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
            }
            else {
                fft->execFFT(pmeGrid1, pmeGrid2, true);
            }
1298

1299
1300
1301
1302
1303
1304
            if (includeEnergy) {
                void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
                        &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
                        cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
                cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
            }
1305

1306
1307
1308
1309
            void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
                    &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
                    cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
            cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
1310

1311
1312
1313
1314
1315
1316
1317
1318
1319
            if (useCudaFFT) {
                if (cu.getUseDoublePrecision())
                    cufftExecZ2D(fftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
                else
                    cufftExecC2R(fftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*)  pmeGrid1.getDevicePointer());
            }
            else {
                fft->execFFT(pmeGrid2, pmeGrid1, false);
            }
1320

1321
1322
1323
1324
1325
1326
            void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
                    &charges.getDevicePointer()};
            cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
        }
1327

1328
1329
1330
1331
1332
1333
        if (doLJPME && hasLJ) {
            if (!hasCoulomb) {
                void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
                        cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
                        recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
                cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
1334

1335
1336
1337
                sort->sort(pmeAtomGridIndex);
                cu.clearBuffer(pmeEnergyBuffer);
            }
1338

1339
1340
1341
1342
1343
1344
            cu.clearBuffer(pmeGrid2);
            void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
                    &sigmaEpsilon.getDevicePointer()};
            cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
1345

1346
1347
            void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
            cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
1348

1349
1350
1351
1352
1353
1354
1355
1356
1357
            if (useCudaFFT) {
                if (cu.getUseDoublePrecision())
                    cufftExecD2Z(dispersionFftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
                else
                    cufftExecR2C(dispersionFftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
            }
            else {
                dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
            }
1358

1359
1360
1361
1362
1363
1364
            if (includeEnergy) {
                void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
                        &pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
                        cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
                cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
            }
1365

1366
1367
1368
1369
            void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
                    &pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
                    cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
            cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
1370

1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
            if (useCudaFFT) {
                if (cu.getUseDoublePrecision())
                    cufftExecZ2D(dispersionFftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
                else
                    cufftExecC2R(dispersionFftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*)  pmeGrid1.getDevicePointer());
            }
            else {
                dispersionFft->execFFT(pmeGrid2, pmeGrid1, false);
            }

            void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
                    cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
                    recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
                    &sigmaEpsilon.getDevicePointer()};
            cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
        }
        if (usePmeStream) {
            cuEventRecord(pmeSyncEvent, pmeStream);
            cu.restoreDefaultStream();
        }
    }
1392

1393
1394
1395
1396
1397
    if (dispersionCoefficient != 0.0 && includeDirect) {
        double4 boxSize = cu.getPeriodicBoxSize();
        energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
    }
    return energy;
1398
1399
}

1400
1401
1402
void CudaCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
    // Make sure the new parameters are acceptable.
    
1403
    cu.setAsCurrent();
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
    if (force.getNumParticles() != cu.getNumAtoms())
        throw OpenMMException("updateParametersInContext: The number of particles has changed");
    if (!hasCoulomb || !hasLJ) {
        for (int i = 0; i < force.getNumParticles(); i++) {
            double charge, sigma, epsilon;
            force.getParticleParameters(i, charge, sigma, epsilon);
            if (!hasCoulomb && charge != 0.0)
                throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Coulomb interactions, because all charges were originally 0");
            if (!hasLJ && epsilon != 0.0)
                throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0");
        }
1415
    }
1416
1417
1418
1419
1420
1421
1422
1423
1424
    vector<int> exceptions;
    for (int i = 0; i < force.getNumExceptions(); i++) {
        int particle1, particle2;
        double chargeProd, sigma, epsilon;
        force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
        if (exceptionAtoms.size() > exceptions.size() && make_pair(particle1, particle2) == exceptionAtoms[exceptions.size()])
            exceptions.push_back(i);
        else if (chargeProd != 0.0 || epsilon != 0.0)
            throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
1425
    }
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
    int numExceptions = endIndex-startIndex;
    
    // Record the per-particle parameters.
    
    vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
    const vector<int>& order = cu.getAtomIndex();
    for (int i = 0; i < force.getNumParticles(); i++) {
        double charge, sigma, epsilon;
        force.getParticleParameters(i, charge, sigma, epsilon);
        baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
1439
    }
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
    baseParticleParams.upload(baseParticleParamVec);
    
    // Record the exceptions.
    
    if (numExceptions > 0) {
        vector<vector<int> > atoms(numExceptions, vector<int>(2));
        vector<float4> baseExceptionParamsVec(numExceptions);
        for (int i = 0; i < numExceptions; i++) {
            double chargeProd, sigma, epsilon;
            force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
            baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
        }
        baseExceptionParams.upload(baseExceptionParamsVec);
1453
1454
    }
    
1455
    // Compute other values.
1456
    
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
    ewaldSelfEnergy = 0.0;
    if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
        if (cu.getContextIndex() == 0) {
            for (int i = 0; i < force.getNumParticles(); i++) {
                ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
                if (doLJPME)
                    ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
            }
        }
    }
    if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME))
        dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
    cu.invalidateMolecules();
    recomputeParams = true;
1471
1472
}

1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
void CudaCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    if (nonbondedMethod != PME)
        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
    if (cu.getPlatformData().useCpuPme)
        cpuPme.getAs<CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
    else {
        alpha = this->alpha;
        nx = gridSizeX;
        ny = gridSizeY;
        nz = gridSizeZ;
    }
1484
1485
}

1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
void CudaCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    if (!doLJPME)
        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
    if (cu.getPlatformData().useCpuPme)
        //cpuPme.getAs<CalcPmeReciprocalForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
        throw OpenMMException("getPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
    else {
        alpha = this->dispersionAlpha;
        nx = dispersionGridSizeX;
        ny = dispersionGridSizeY;
        nz = dispersionGridSizeZ;
    }
1498
}
1499
void CudaIntegrateVelocityVerletStepKernel::initialize(const System& system, const NoseHooverIntegrator& integrator) {
1500
    cu.getPlatformData().initializeContexts(system);
1501
    cu.setAsCurrent();
1502
    map<string, string> defines;
1503
1504
1505
1506
1507
1508
1509
1510
    defines["BOLTZ"] = cu.doubleToString(BOLTZ);
    CUmodule module = cu.createModule(CudaKernelSources::velocityVerlet, defines, "");
    kernel1 = cu.getKernel(module, "integrateVelocityVerletPart1");
    kernel2 = cu.getKernel(module, "integrateVelocityVerletPart2");
    kernel3 = cu.getKernel(module, "integrateVelocityVerletPart3");
    kernelHardWall = cu.getKernel(module, "integrateVelocityVerletHardWall");
    prevMaxPairDistance = -1.0;
    maxPairDistanceBuffer.initialize<float>(cu, 1, "maxPairDistanceBuffer");
1511
1512
}

1513
void CudaIntegrateVelocityVerletStepKernel::execute(ContextImpl& context, const NoseHooverIntegrator& integrator, bool &forcesAreValid) {
1514
    cu.setAsCurrent();
1515
    CudaIntegrationUtilities& integration = cu.getIntegrationUtilities();
1516
    int paddedNumAtoms = cu.getPaddedNumAtoms();
1517
1518
    double dt = integrator.getStepSize();
    cu.getIntegrationUtilities().setNextStepSize(dt);
1519

1520
    if (!forcesAreValid) context.calcForcesAndEnergy(true, false);
1521

1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
    const auto& atomList = integrator.getAllThermostatedIndividualParticles();
    const auto& pairList = integrator.getAllThermostatedPairs();
    int numAtoms = atomList.size();
    int numPairs = pairList.size();
    int numParticles = numAtoms + 2*numPairs;
    float maxPairDistance = integrator.getMaximumPairDistance();
    // Make sure atom and pair metadata is uploaded and has the correct dimensions
    if (prevMaxPairDistance != maxPairDistance) {
        std::vector<float> tmp(1, maxPairDistance);
        maxPairDistanceBuffer.upload(tmp);
        prevMaxPairDistance = maxPairDistance;
    }
    if (numAtoms !=0 && (!atomListBuffer.isInitialized() || atomListBuffer.getSize() != numAtoms)) {
        if (atomListBuffer.isInitialized())
            atomListBuffer.resize(atomList.size());
        else
            atomListBuffer.initialize<int>(cu, atomList.size(), "atomListBuffer");
        atomListBuffer.upload(atomList);
    }
    if (numPairs !=0 && (!pairListBuffer.isInitialized() || pairListBuffer.getSize() != numPairs)) {
        if (pairListBuffer.isInitialized()) {
            pairListBuffer.resize(pairList.size());
            pairTemperatureBuffer.resize(pairList.size());
        }
        else {
            pairListBuffer.initialize<int2>(cu, pairList.size(), "pairListBuffer");
            pairTemperatureBuffer.initialize<float>(cu, pairList.size(), "pairTemperatureBuffer");
        }
        std::vector<int2> tmp;
        std::vector<float> tmp2;
        for(const auto &pair : pairList) {
            tmp.push_back(make_int2(std::get<0>(pair), std::get<1>(pair)));
            tmp2.push_back(std::get<2>(pair));
        }
        pairListBuffer.upload(tmp);
        pairTemperatureBuffer.upload(tmp2);
    }
    //// Call the first integration kernel.
1560

1561
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
1562
1563
1564
1565
    void* args1[] = {&numAtoms, &numPairs, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
            &atomListBuffer.getDevicePointer(), &pairListBuffer.getDevicePointer()};
    cu.executeKernel(kernel1, args1, std::max(numAtoms,numPairs), 128);
1566

1567
    //// Apply constraints.
1568
1569
1570

    integration.applyConstraints(integrator.getConstraintTolerance());

1571
1572
    //// Call the second integration kernel.
    void* args2[] = {&numParticles, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
1573
            &cu.getVelm().getDevicePointer(), &integration.getPosDelta().getDevicePointer()};
1574
    cu.executeKernel(kernel2, args2, numParticles, 128);
1575

1576
1577
1578
1579
1580
1581
1582
    if (numPairs > 0) {
        //// Enforce hard wall constraint
        void* argsHardWall[] = {&numPairs, &maxPairDistanceBuffer.getDevicePointer(), 
                 &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
                 &cu.getVelm().getDevicePointer(), &pairListBuffer.getDevicePointer(), 
                 &pairTemperatureBuffer.getDevicePointer()}; 
        cu.executeKernel(kernelHardWall, argsHardWall, numPairs, 128);
1583
1584
    }

1585
    integration.computeVirtualSites();
1586

1587
1588
1589
    //// Update forces
    context.calcForcesAndEnergy(true, false);
    forcesAreValid = true;
1590

1591
1592
1593
1594
1595
    //// Call the third integration kernel.
    void* args3[] = {&numAtoms, &numPairs, &paddedNumAtoms, &cu.getIntegrationUtilities().getStepSize().getDevicePointer(), &cu.getPosq().getDevicePointer(), &posCorrection,
            &cu.getVelm().getDevicePointer(), &cu.getForce().getDevicePointer(), &integration.getPosDelta().getDevicePointer(),
            &atomListBuffer.getDevicePointer(), &pairListBuffer.getDevicePointer()};
    cu.executeKernel(kernel3, args3, std::max(numAtoms,numPairs), 128);
1596

1597
1598
1599
    // TODO: Figure out if this is really needed.  The constraint velocities are accounted for
    // in a finite difference sense in the step 3 kernel, when the velocities are updated.
    integration.applyVelocityConstraints(integrator.getConstraintTolerance());
1600

1601
    //// Update the time and step count.
1602

1603
    cu.setTime(cu.getTime()+dt);
1604
    cu.setStepCount(cu.getStepCount()+1);
1605
    cu.reorderAtoms();
1606
1607
}

1608
1609
double CudaIntegrateVelocityVerletStepKernel::computeKineticEnergy(ContextImpl& context, const NoseHooverIntegrator& integrator) {
    return cu.getIntegrationUtilities().computeKineticEnergy(0);
1610
1611
}

1612
class CudaCalcCustomCVForceKernel::ForceInfo : public CudaForceInfo {
1613
public:
1614
    ForceInfo(ComputeForceInfo& force) : force(force) {
1615
    }
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
    bool areParticlesIdentical(int particle1, int particle2) {
        return force.areParticlesIdentical(particle1, particle2);
    }
    int getNumParticleGroups() {
        return force.getNumParticleGroups();
    }
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        force.getParticlesInGroup(index, particles);
    }
    bool areGroupsIdentical(int group1, int group2) {
        return force.areGroupsIdentical(group1, group2);
1627
1628
    }
private:
1629
    ComputeForceInfo& force;
1630
1631
};

1632
class CudaCalcCustomCVForceKernel::ReorderListener : public CudaContext::ReorderListener {
1633
public:
1634
    ReorderListener(CudaContext& cu, CudaArray& invAtomOrder) : cu(cu), invAtomOrder(invAtomOrder) {
1635
    }
1636
1637
1638
1639
1640
1641
    void execute() {
        vector<int> invOrder(cu.getPaddedNumAtoms());
        const vector<int>& order = cu.getAtomIndex();
        for (int i = 0; i < order.size(); i++)
            invOrder[order[i]] = i;
        invAtomOrder.upload(invOrder);
1642
1643
    }
private:
1644
1645
    CudaContext& cu;
    CudaArray& invAtomOrder;
1646
1647
};

1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
void CudaCalcCustomCVForceKernel::initialize(const System& system, const CustomCVForce& force, ContextImpl& innerContext) {
    int numCVs = force.getNumCollectiveVariables();
    for (int i = 0; i < force.getNumGlobalParameters(); i++)
        globalParameterNames.push_back(force.getGlobalParameterName(i));
    for (int i = 0; i < numCVs; i++)
        variableNames.push_back(force.getCollectiveVariableName(i));
    for (int i = 0; i < force.getNumEnergyParameterDerivatives(); i++) {
        string name = force.getEnergyParameterDerivativeName(i);
        paramDerivNames.push_back(name);
        cu.addEnergyParameterDerivative(name);
1658
    }
1659

1660
    // Create custom functions for the tabulated functions.
1661

1662
1663
1664
    map<string, Lepton::CustomFunction*> functions;
    for (int i = 0; i < (int) force.getNumTabulatedFunctions(); i++)
        functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
1665

1666
    // Create the expressions.
1667

1668
1669
1670
1671
1672
1673
1674
1675
    Lepton::ParsedExpression energyExpr = Lepton::Parser::parse(force.getEnergyFunction(), functions);
    energyExpression = energyExpr.createProgram();
    variableDerivExpressions.clear();
    for (auto& name : variableNames)
        variableDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
    paramDerivExpressions.clear();
    for (auto& name : paramDerivNames)
        paramDerivExpressions.push_back(energyExpr.differentiate(name).optimize().createProgram());
1676

1677
    // Delete the custom functions.
1678

1679
1680
    for (auto& function : functions)
        delete function.second;
1681

1682
    // Copy parameter derivatives from the inner context.
1683

1684
1685
1686
    CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
    for (auto& param : cu2.getEnergyParamDerivNames())
        cu.addEnergyParameterDerivative(param);
1687
    
1688
    // Create arrays for storing information.
1689
    
1690
1691
1692
1693
1694
1695
    int elementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
    cvForces.resize(numCVs);
    for (int i = 0; i < numCVs; i++)
        cvForces[i].initialize<long long>(cu, 3*cu.getPaddedNumAtoms(), "cvForce");
    invAtomOrder.initialize<int>(cu, cu.getPaddedNumAtoms(), "invAtomOrder");
    innerInvAtomOrder.initialize<int>(cu, cu.getPaddedNumAtoms(), "innerInvAtomOrder");
1696
    
1697
    // Create the kernels.
1698
    
1699
1700
1701
1702
    stringstream args, add;
    for (int i = 0; i < numCVs; i++) {
        args << ", long long* __restrict__ force" << i << ", real dEdV" << i;
        add << "forces[i] += (long long) (force" << i << "[i]*dEdV" << i << ");\n";
1703
    }
1704
1705
1706
1707
1708
1709
1710
    map<string, string> replacements;
    replacements["PARAMETER_ARGUMENTS"] = args.str();
    replacements["ADD_FORCES"] = add.str();
    CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaKernelSources::customCVForce, replacements));
    copyStateKernel = cu.getKernel(module, "copyState");
    copyForcesKernel = cu.getKernel(module, "copyForces");
    addForcesKernel = cu.getKernel(module, "addForces");
1711

1712
    // This context needs to respect all forces in the inner context when reordering atoms.
1713

1714
1715
    for (auto* info : cu2.getForceInfos())
        cu.addForce(new ForceInfo(*info));
1716
1717
}

1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
double CudaCalcCustomCVForceKernel::execute(ContextImpl& context, ContextImpl& innerContext, bool includeForces, bool includeEnergy) {
    copyState(context, innerContext);
    int numCVs = variableNames.size();
    int numAtoms = cu.getNumAtoms();
    int paddedNumAtoms = cu.getPaddedNumAtoms();
    CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
    vector<double> cvValues;
    vector<map<string, double> > cvDerivs(numCVs);
    void* copyForcesArgs[] = {NULL, &invAtomOrder.getDevicePointer(), &cu2.getForce().getDevicePointer(), &cu2.getAtomIndexArray().getDevicePointer(), &numAtoms, &paddedNumAtoms};
    for (int i = 0; i < numCVs; i++) {
        cvValues.push_back(innerContext.calcForcesAndEnergy(true, true, 1<<i));
        copyForcesArgs[0] = &cvForces[i].getDevicePointer();
        cu.executeKernel(copyForcesKernel, copyForcesArgs, numAtoms);
        innerContext.getEnergyParameterDerivatives(cvDerivs[i]);
1732
    }
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
    
    // Compute the energy and forces.
    
    map<string, double> variables;
    for (auto& name : globalParameterNames)
        variables[name] = context.getParameter(name);
    for (int i = 0; i < numCVs; i++)
        variables[variableNames[i]] = cvValues[i];
    double energy = energyExpression.evaluate(variables);
    int bufferSize = cu.getForce().getSize();
    vector<void*> addForcesArgs;
    addForcesArgs.push_back(&cu.getForce().getDevicePointer());
    addForcesArgs.push_back(&bufferSize);
    vector<double> dEdV(numCVs);
    vector<float> dEdVFloat(numCVs);
    for (int i = 0; i < numCVs; i++) {
        dEdV[i] = variableDerivExpressions[i].evaluate(variables);
        dEdVFloat[i] = (float) dEdV[i];
        addForcesArgs.push_back(&cvForces[i].getDevicePointer());
        if (cu.getUseDoublePrecision())
            addForcesArgs.push_back(&dEdV[i]);
        else
            addForcesArgs.push_back(&dEdVFloat[i]);
1756
    }
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
    cu.executeKernel(addForcesKernel, &addForcesArgs[0], numAtoms);
    
    // Compute the energy parameter derivatives.
    
    map<string, double>& energyParamDerivs = cu.getEnergyParamDerivWorkspace();
    for (int i = 0; i < paramDerivExpressions.size(); i++)
        energyParamDerivs[paramDerivNames[i]] += paramDerivExpressions[i].evaluate(variables);
    for (int i = 0; i < numCVs; i++) {
        double dEdV = variableDerivExpressions[i].evaluate(variables);
        for (auto& deriv : cvDerivs[i])
            energyParamDerivs[deriv.first] += dEdV*deriv.second;
1768
    }
1769
    return energy;
1770
1771
}

1772
1773
1774
1775
1776
void CudaCalcCustomCVForceKernel::copyState(ContextImpl& context, ContextImpl& innerContext) {
    int numAtoms = cu.getNumAtoms();
    CudaContext& cu2 = *reinterpret_cast<CudaPlatform::PlatformData*>(innerContext.getPlatformData())->contexts[0];
    if (!hasInitializedListeners) {
        hasInitializedListeners = true;
1777
        
1778
        // Initialize the listeners.
1779
        
1780
1781
1782
1783
1784
1785
        ReorderListener* listener1 = new ReorderListener(cu, invAtomOrder);
        ReorderListener* listener2 = new ReorderListener(cu2, innerInvAtomOrder);
        cu.addReorderListener(listener1);
        cu2.addReorderListener(listener2);
        listener1->execute();
        listener2->execute();
1786
    }
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
    CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
    CUdeviceptr posCorrection2 = (cu2.getUseMixedPrecision() ? cu2.getPosqCorrection().getDevicePointer() : 0);
    void* copyStateArgs[] = {&cu.getPosq().getDevicePointer(), &posCorrection, &cu.getVelm().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(),
        &cu2.getPosq().getDevicePointer(), &posCorrection2,& cu2.getVelm().getDevicePointer(), &innerInvAtomOrder.getDevicePointer(), &numAtoms};
    cu.executeKernel(copyStateKernel, copyStateArgs, numAtoms);
    Vec3 a, b, c;
    context.getPeriodicBoxVectors(a, b, c);
    innerContext.setPeriodicBoxVectors(a, b, c);
    innerContext.setTime(context.getTime());
    map<string, double> innerParameters = innerContext.getParameters();
    for (auto& param : innerParameters)
        innerContext.setParameter(param.first, context.getParameter(param.first));
1799
1800
}

1801
1802
void CudaCalcCustomCVForceKernel::copyParametersToContext(ContextImpl& context, const CustomCVForce& force) {
    // Create custom functions for the tabulated functions.
1803

1804
1805
1806
    map<string, CustomFunction*> functions;
    for (int i = 0; i < (int) force.getNumTabulatedFunctions(); i++)
        functions[force.getTabulatedFunctionName(i)] = createReferenceTabulatedFunction(force.getTabulatedFunction(i));
1807

1808
    // Replace tabulated functions in the expressions.
1809

1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
    replaceFunctionsInExpression(functions, energyExpression);
    for (auto& expression : variableDerivExpressions)
        replaceFunctionsInExpression(functions, expression);
    for (auto& expression : paramDerivExpressions)
        replaceFunctionsInExpression(functions, expression);

    // Delete the custom functions.

    for (auto& function : functions)
        delete function.second;
1820
1821
}

1822
void CudaNoseHooverChainKernel::initialize() {
1823
    cu.setAsCurrent();
1824
    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();
1825
    map<string, string> defines;
1826
1827
    sumWorkGroupSize = 512;
    defines["WORK_GROUP_SIZE"] = cu.intToString(sumWorkGroupSize);
1828
    defines["MIXEDEXP"] = useDouble ? "exp" : "expf";
1829
    defines["BEGIN_YS_LOOP"] = "for(const real & ys : {1}) {";
1830
    defines["END_YS_LOOP"] = "}";
1831
    CUmodule module = cu.createModule(CudaKernelSources::noseHooverChain, defines, "-std=c++11");
1832
    propagateKernels[1] = cu.getKernel(module, "propagateNoseHooverChain");
1833
    defines["BEGIN_YS_LOOP"] = "for(const real & ys : {0.828981543588751, -0.657963087177502, 0.828981543588751}) {";
1834
    module = cu.createModule(CudaKernelSources::noseHooverChain, defines, "-std=c++11");
1835
    propagateKernels[3] = cu.getKernel(module, "propagateNoseHooverChain");
1836
    defines["BEGIN_YS_LOOP"] = "for(const real & ys : {0.2967324292201065, 0.2967324292201065, -0.186929716880426, 0.2967324292201065, 0.2967324292201065}) {";
1837
    module = cu.createModule(CudaKernelSources::noseHooverChain, defines, "-std=c++11");
1838
    propagateKernels[5] = cu.getKernel(module, "propagateNoseHooverChain");
1839
1840
    module = cu.createModule(CudaKernelSources::noseHooverChain, defines, "-std=c++11");
    reduceEnergyKernel = cu.getKernel(module, "reduceEnergyPair");
1841

1842
    computeHeatBathEnergyKernel = cu.getKernel(module, "computeHeatBathEnergy");
1843
1844
1845
1846
    computeAtomsKineticEnergyKernel = cu.getKernel(module, "computeAtomsKineticEnergy");
    computePairsKineticEnergyKernel = cu.getKernel(module, "computePairsKineticEnergy");
    scaleAtomsVelocitiesKernel = cu.getKernel(module, "scaleAtomsVelocities");
    scalePairsVelocitiesKernel = cu.getKernel(module, "scalePairsVelocities");
1847
1848

    int energyBufferSize = cu.getEnergyBuffer().getSize();
1849
    if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision())
1850
        energyBuffer.initialize<double2>(cu, energyBufferSize, "energyBuffer");
1851
    else
1852
        energyBuffer.initialize<float2>(cu, energyBufferSize, "energyBuffer");
1853
1854
}

1855
std::pair<double, double> CudaNoseHooverChainKernel::propagateChain(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> kineticEnergies, double timeStep) {
1856
1857
    cu.setAsCurrent();

1858
1859
    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();

1860
    int chainID = nhc.getChainID();
1861
1862
    int nAtoms = nhc.getThermostatedAtoms().size();
    int nPairs = nhc.getThermostatedPairs().size();
1863
1864
1865
1866
1867
1868
1869
1870
    int chainLength = nhc.getChainLength();
    int numYS = nhc.getNumYoshidaSuzukiTimeSteps();
    int numMTS = nhc.getNumMultiTimeSteps();
    int numDOFs = nhc.getNumDegreesOfFreedom();
    double temperature = nhc.getTemperature();
    double frequency = nhc.getCollisionFrequency();
    double relativeTemperature = nhc.getRelativeTemperature();
    double relativeFrequency = nhc.getRelativeCollisionFrequency();
1871

1872
1873
    if (numYS != 1 && numYS != 3 && numYS != 5) {
        throw OpenMMException("Number of Yoshida Suzuki time steps has to be 1, 3, or 5.");
1874
1875
    }

1876
1877
    auto & chainState = cu.getIntegrationUtilities().getNoseHooverChainState();

1878
1879
    if (!scaleFactorBuffer.isInitialized() || scaleFactorBuffer.getSize() == 0) {
        if (useDouble) {
1880
            std::vector<double2> zeros{{0,0}};
1881
            if (scaleFactorBuffer.isInitialized())
1882
                scaleFactorBuffer.resize(1);
1883
            else
1884
                scaleFactorBuffer.initialize<double2>(cu, 1, "scaleFactorBuffer");
1885
            scaleFactorBuffer.upload(zeros);
1886
1887
        }
        else {
1888
            std::vector<float2> zeros{{0,0}};
1889
            if (scaleFactorBuffer.isInitialized())
1890
                scaleFactorBuffer.resize(1);
1891
            else
1892
                scaleFactorBuffer.initialize<float2>(cu, 1, "scaleFactorBuffer");
1893
            scaleFactorBuffer.upload(zeros);
1894
1895
        }
    }
1896
    std::vector<double> zeros(chainLength,0);
1897
1898
    if (!chainForces.isInitialized() || !chainMasses.isInitialized()) {
        if (useDouble) {
1899
1900
1901
            if (chainForces.isInitialized()) {
                chainMasses.resize(chainLength);
                chainForces.resize(chainLength);
1902
1903
            }
            else {
1904
1905
1906
                chainMasses.initialize<double>(cu, chainLength, "chainMasses");
                chainForces.initialize<double>(cu, chainLength, "chainForces");
            }
1907
1908
            chainMasses.upload(zeros);
            chainForces.upload(zeros);
1909
1910
        }
        else {
1911
1912
1913
            if (chainForces.isInitialized()) {
                chainMasses.resize(chainLength);
                chainForces.resize(chainLength);
1914
1915
            }
            else {
1916
1917
1918
                chainMasses.initialize<float>(cu, chainLength, "chainMasses");
                chainForces.initialize<float>(cu, chainLength, "chainForces");
            }
1919
1920
            chainMasses.upload(zeros, true);
            chainForces.upload(zeros, true);
1921
1922
        }
    }
1923
1924
1925
1926
    if (chainForces.getSize() < chainLength)
        chainMasses.resize(chainLength);
    if (chainMasses.getSize() < chainLength)
        chainMasses.resize(chainLength);
1927

1928
    float timeStepFloat = (float) timeStep;
Andy Simmonett's avatar
Andy Simmonett committed
1929
    // N.B. We ignore the incoming kineticEnergy and grab it from the device buffer instead
1930
    if (nAtoms) {
1931
1932
1933
        if (!chainState.count(2*chainID))
            chainState[2*chainID] = ComputeArray();
        if (!chainState.at(2*chainID).isInitialized() || chainState.at(2*chainID).getSize() != chainLength) {
1934
            // We need to upload the CUDA array
1935
1936
            if (useDouble) {
                if (chainState.at(2*chainID).isInitialized())
1937
                    chainState.at(2*chainID).resize(chainLength);
1938
                else
1939
                    chainState.at(2*chainID).initialize<double2>(cu, chainLength, "chainState" + std::to_string(2*chainID));
1940
1941
                std::vector<double2> zeros(chainLength, make_double2(0, 0));
                chainState.at(2*chainID).upload(zeros.data());
1942
1943
1944
            }
            else {
                if (chainState.at(2*chainID).isInitialized())
1945
                    chainState.at(2*chainID).resize(chainLength);
1946
                else
1947
                    chainState.at(2*chainID).initialize<float2>(cu, chainLength, "chainState" + std::to_string(2*chainID));
1948
1949
1950
1951
1952
1953
1954
1955
1956
                std::vector<float2> zeros(chainLength, make_float2(0, 0));
                chainState.at(2*chainID).upload(zeros.data());
            }
        }
        int chainType = 0;
        double kT = BOLTZ * temperature;
        float kTfloat = (float) kT;
        float frequencyFloat = (float) frequency;
        void *args[] = {
1957
                    &cu.unwrap(chainState[2*chainID]).getDevicePointer(),
Andy Simmonett's avatar
Andy Simmonett committed
1958
1959
                    &kineticEnergyBuffer.getDevicePointer(),
                    &scaleFactorBuffer.getDevicePointer(),
1960
1961
                    &chainMasses.getDevicePointer(),
                    &chainForces.getDevicePointer(),
1962
                    &chainType, &chainLength, &numMTS, &numDOFs,
1963
1964
                    &timeStepFloat,
                    useDouble ? (void*) &kT : (void*) &kTfloat, 
Andy Simmonett's avatar
Andy Simmonett committed
1965
1966
1967
                    &frequencyFloat
                    };
        cu.executeKernel(propagateKernels[numYS], args, 1, 1);
1968
    }
1969
    if (nPairs) {
1970
1971
1972
        if (!chainState.count(2*chainID+1))
            chainState[2*chainID+1] = ComputeArray();
        if (!chainState.at(2*chainID+1).isInitialized() || chainState.at(2*chainID+1).getSize() != chainLength) {
1973
            // We need to upload the CUDA array
1974
1975
            if (useDouble) {
                if (chainState.at(2*chainID+1).isInitialized())
1976
                    chainState.at(2*chainID+1).resize(chainLength);
1977
                else
1978
                    chainState.at(2*chainID+1).initialize<double2>(cu, chainLength, "chainState" + std::to_string(2*chainID+1));
1979
1980
                std::vector<double2> zeros(chainLength, make_double2(0, 0));
                chainState.at(2*chainID+1).upload(zeros.data());
1981
1982
1983
            }
            else {
                if (chainState.at(2*chainID+1).isInitialized())
1984
                    chainState.at(2*chainID+1).resize(chainLength);
1985
                else
1986
                    chainState.at(2*chainID+1).initialize<float2>(cu, chainLength, "chainState" + std::to_string(2*chainID+1));
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
                std::vector<float2> zeros(chainLength, make_float2(0, 0));
                chainState.at(2*chainID+1).upload(zeros.data());
            }
        }
        int chainType = 1;
        double kT = BOLTZ * relativeTemperature;
        int ndf = 3*nPairs;
        float kTfloat = (float) kT;
        float frequencyFloat = (float) relativeFrequency;
        void *args[] = {
1997
                    &cu.unwrap(chainState[2*chainID+1]).getDevicePointer(),
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
                    &kineticEnergyBuffer.getDevicePointer(),
                    &scaleFactorBuffer.getDevicePointer(),
                    &chainMasses.getDevicePointer(),
                    &chainForces.getDevicePointer(),
                    &chainType, &chainLength, &numMTS, &ndf,
                    &timeStepFloat,
                    useDouble ? (void*) &kT : (void*) &kTfloat, 
                    &frequencyFloat
                    };
        cu.executeKernel(propagateKernels[numYS], args, 1, 1);
    }
    return {0, 0};
2010
2011
2012
}

double CudaNoseHooverChainKernel::computeHeatBathEnergy(ContextImpl& context, const NoseHooverChain &nhc) {
2013
    cu.setAsCurrent();
2014
2015
2016

    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();

2017
2018
    int chainID = nhc.getChainID();
    int chainLength = nhc.getChainLength();
2019
2020

    auto & chainState = cu.getIntegrationUtilities().getNoseHooverChainState();
2021
2022
2023
2024
2025
2026
2027
2028
2029

    bool absChainIsValid = chainState.count(2*chainID) != 0 &&
                           chainState[2*chainID].isInitialized() &&
                           chainState[2*chainID].getSize() == chainLength;
    bool relChainIsValid = chainState.count(2*chainID+1) != 0 &&
                           chainState[2*chainID+1].isInitialized() &&
                           chainState[2*chainID+1].getSize() == chainLength;

    if (!absChainIsValid && !relChainIsValid) return 0.0;
2030
2031

    if (!heatBathEnergy.isInitialized() || heatBathEnergy.getSize() == 0) {
2032
        if (useDouble) {
2033
2034
2035
            std::vector<double> one(1);
            heatBathEnergy.initialize<double>(cu, 1, "heatBathEnergy");
            heatBathEnergy.upload(one);
2036
2037
        }
        else {
2038
2039
2040
2041
2042
2043
            std::vector<float> one(1);
            heatBathEnergy.initialize<float>(cu, 1, "heatBathEnergy");
            heatBathEnergy.upload(one);
        }
    }

2044
2045
2046
    cu.clearBuffer(heatBathEnergy);

    if (absChainIsValid) {
2047
2048
2049
        int numDOFs = nhc.getNumDegreesOfFreedom();
        double temperature = nhc.getTemperature();
        double frequency = nhc.getCollisionFrequency();
2050
2051
2052
        double kT = BOLTZ * temperature;
        float kTfloat = (float) kT;
        float frequencyFloat = (float) frequency;
2053

2054
2055
        void * args[] = {&heatBathEnergy.getDevicePointer(), &chainLength, &numDOFs, 
                         useDouble ? (void*) &kT : (void*) & kTfloat,
2056
                         &frequencyFloat, &cu.unwrap(chainState[2*chainID]).getDevicePointer()};
2057
2058
2059
2060
        cu.executeKernel(computeHeatBathEnergyKernel, args, 1, 1);
    }
    if (relChainIsValid) {
        int numDOFs = 3 * nhc.getThermostatedPairs().size();
2061
2062
        double temperature = nhc.getRelativeTemperature();
        double frequency = nhc.getRelativeCollisionFrequency();
2063
2064
2065
2066
2067
2068
        double kT = BOLTZ * temperature;
        float kTfloat = (float) kT;
        float frequencyFloat = (float) frequency;

        void * args[] = {&heatBathEnergy.getDevicePointer(), &chainLength, &numDOFs, 
                         useDouble ? (void*) &kT : (void*) & kTfloat,
2069
                         &frequencyFloat, &cu.unwrap(chainState[2*chainID+1]).getDevicePointer()};
2070
2071
        cu.executeKernel(computeHeatBathEnergyKernel, args, 1, 1);
    }
2072
2073


Andy Simmonett's avatar
Andy Simmonett committed
2074
2075
    void * pinnedBuffer = cu.getPinnedBuffer();
    heatBathEnergy.download(pinnedBuffer);
2076
    if (useDouble)
Andy Simmonett's avatar
Andy Simmonett committed
2077
        return *((double*) pinnedBuffer);
2078
    else
Andy Simmonett's avatar
Andy Simmonett committed
2079
        return *((float*) pinnedBuffer);
2080
2081
}

2082
std::pair<double, double> CudaNoseHooverChainKernel::computeMaskedKineticEnergy(ContextImpl& context, const NoseHooverChain &nhc, bool downloadValue) {
2083
    cu.setAsCurrent();
Andy Simmonett's avatar
Andy Simmonett committed
2084
2085
2086

    bool useDouble = cu.getUseDoublePrecision() || cu.getUseMixedPrecision();

2087
    int chainID = nhc.getChainID();
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
    const auto & nhcAtoms = nhc.getThermostatedAtoms();
    const auto & nhcPairs = nhc.getThermostatedPairs();
    auto nAtoms = nhcAtoms.size();
    auto nPairs = nhcPairs.size();
    if (nAtoms) {
        if (!atomlists.count(chainID)) { 
            // We need to upload the CUDA array
            atomlists[chainID] = CudaArray();
            atomlists[chainID].initialize<int>(cu, nAtoms, "atomlist" + std::to_string(chainID));
            atomlists[chainID].upload(nhcAtoms);
        }
        if (atomlists[chainID].getSize() != nAtoms) {
            throw OpenMMException("Number of atoms changed. Cannot be handled by the same Nose-Hoover thermostat.");
        }
    }
    if (nPairs) {
        if (!pairlists.count(chainID)) { 
            // We need to upload the CUDA array
            pairlists[chainID] = CudaArray();
            pairlists[chainID].initialize<int2>(cu, nPairs, "pairlist" + std::to_string(chainID));
            std::vector<int2> int2vec;
            for(const auto &p : nhcPairs) int2vec.push_back(make_int2(p.first, p.second));
            pairlists[chainID].upload(int2vec);
        }
        if (pairlists[chainID].getSize() != nPairs) {
            throw OpenMMException("Number of thermostated pairs changed. Cannot be handled by the same Nose-Hoover thermostat.");
Andy Simmonett's avatar
Andy Simmonett committed
2114
2115
2116
        }
    }
    if (!kineticEnergyBuffer.isInitialized() || kineticEnergyBuffer.getSize() == 0) {
2117
        if (useDouble) {
2118
2119
2120
            std::vector<double2> zeros{{0,0}};
            kineticEnergyBuffer.initialize<double2>(cu, 1, "kineticEnergyBuffer");
            kineticEnergyBuffer.upload(zeros);
2121
2122
        }
        else {
2123
2124
2125
            std::vector<float2> zeros{{0,0}};
            kineticEnergyBuffer.initialize<float2>(cu, 1, "kineticEnergyBuffer");
            kineticEnergyBuffer.upload(zeros);
Andy Simmonett's avatar
Andy Simmonett committed
2126
2127
        }
    }
2128
    cu.clearBuffer(energyBuffer);
2129
    if (nAtoms) {
2130
        void *args[] = {&energyBuffer.getDevicePointer(),&nAtoms, &cu.getVelm().getDevicePointer(), &atomlists[chainID].getDevicePointer()};
2131
2132
2133
        cu.executeKernel(computeAtomsKineticEnergyKernel, args, nAtoms);
    }
    if (nPairs) {
2134
        void *args[] = {&energyBuffer.getDevicePointer(),&nPairs, &cu.getVelm().getDevicePointer(), &pairlists[chainID].getDevicePointer()};
2135
2136
        cu.executeKernel(computePairsKineticEnergyKernel, args, nPairs);
    }
Andy Simmonett's avatar
Andy Simmonett committed
2137
2138

    //taken from CudaContext::reduceEnergy(); the final kinetic energy will live in the kineticEnergy buffer
2139
2140
2141
    int bufferSize = energyBuffer.getSize();
    void* args2[] = {&energyBuffer.getDevicePointer(), &kineticEnergyBuffer.getDevicePointer(), &bufferSize, &sumWorkGroupSize};
    cu.executeKernel(reduceEnergyKernel, args2, sumWorkGroupSize, sumWorkGroupSize, sumWorkGroupSize*energyBuffer.getElementSize());
Andy Simmonett's avatar
Andy Simmonett committed
2142

2143
    std::pair<double, double> KEs = {0, 0};
2144
2145
2146
    if (downloadValue) {
        void * pinnedBuffer = cu.getPinnedBuffer();
        kineticEnergyBuffer.download(pinnedBuffer);
2147
2148
        KEs.first = useDouble ? *((double*) pinnedBuffer) : *((float*) pinnedBuffer);
        KEs.second = useDouble ? *((double*) pinnedBuffer + 1) : *((float*) pinnedBuffer + 1);
2149
    }
2150
    return KEs;
2151
2152
}

2153
2154
void CudaNoseHooverChainKernel::scaleVelocities(ContextImpl& context, const NoseHooverChain &nhc, std::pair<double, double> scaleFactor) {
    // For now we assume that the atoms and pairs info is valid, because compute{Atoms|Pairs}KineticEnergy must have been
Andy Simmonett's avatar
Andy Simmonett committed
2155
2156
2157
    // called before this kernel.  If that ever ceases to be true, some sanity checks are needed here.
    cu.setAsCurrent();

2158
    int chainID = nhc.getChainID();
2159
2160
    auto nAtoms = nhc.getThermostatedAtoms().size();
    auto nPairs = nhc.getThermostatedPairs().size();
2161
    if (nAtoms) {
2162
2163
2164
2165
        void *args[] = {&scaleFactorBuffer.getDevicePointer(),
                        &nAtoms, &cu.getVelm().getDevicePointer(), &atomlists[chainID].getDevicePointer()};
        cu.executeKernel(scaleAtomsVelocitiesKernel, args, nAtoms);
    }
2166
    if (nPairs) {
2167
2168
2169
2170
        void *args[] = {&scaleFactorBuffer.getDevicePointer(),
                        &nPairs, &cu.getVelm().getDevicePointer(), &pairlists[chainID].getDevicePointer()};
        cu.executeKernel(scalePairsVelocitiesKernel, args, nPairs);
    }
2171
2172
}

2173
void CudaApplyMonteCarloBarostatKernel::initialize(const System& system, const Force& thermostat) {
2174
    cu.setAsCurrent();
2175
2176
    savedPositions.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4), "savedPositions");
    savedForces.initialize<long long>(cu, cu.getPaddedNumAtoms()*3, "savedForces");
2177
    CUmodule module = cu.createModule(CudaKernelSources::monteCarloBarostat);
2178
    kernel = cu.getKernel(module, "scalePositions");
2179
2180
}

2181
void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, double scaleX, double scaleY, double scaleZ) {
2182
2183
2184
2185
2186
2187
2188
2189
    cu.setAsCurrent();
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;

        // Create the arrays with the molecule definitions.

        vector<vector<int> > molecules = context.getMolecules();
        numMolecules = molecules.size();
2190
2191
2192
2193
        moleculeAtoms.initialize<int>(cu, cu.getNumAtoms(), "moleculeAtoms");
        moleculeStartIndex.initialize<int>(cu, numMolecules+1, "moleculeStartIndex");
        vector<int> atoms(moleculeAtoms.getSize());
        vector<int> startIndex(moleculeStartIndex.getSize());
2194
2195
2196
        int index = 0;
        for (int i = 0; i < numMolecules; i++) {
            startIndex[i] = index;
peastman's avatar
peastman committed
2197
2198
            for (int molecule : molecules[i])
                atoms[index++] = molecule;
2199
2200
        }
        startIndex[numMolecules] = index;
2201
2202
        moleculeAtoms.upload(atoms);
        moleculeStartIndex.upload(startIndex);
2203
2204
2205
2206
2207

        // Initialize the kernel arguments.
        
    }
    int bytesToCopy = cu.getPosq().getSize()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
2208
    CUresult result = cuMemcpyDtoD(savedPositions.getDevicePointer(), cu.getPosq().getDevicePointer(), bytesToCopy);
2209
2210
2211
2212
    if (result != CUDA_SUCCESS) {
        std::stringstream m;
        m<<"Error saving positions for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(m.str());
2213
    }
2214
    result = cuMemcpyDtoD(savedForces.getDevicePointer(), cu.getForce().getDevicePointer(), savedForces.getSize()*savedForces.getElementSize());
2215
2216
2217
2218
2219
    if (result != CUDA_SUCCESS) {
        std::stringstream m;
        m<<"Error saving forces for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(m.str());
    }
2220
2221
2222
    float scalefX = (float) scaleX;
    float scalefY = (float) scaleY;
    float scalefZ = (float) scaleZ;
2223
2224
    void* args[] = {&scalefX, &scalefY, &scalefZ, &numMolecules, cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(),
                    cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
2225
		    &cu.getPosq().getDevicePointer(), &moleculeAtoms.getDevicePointer(), &moleculeStartIndex.getDevicePointer()};
2226
    cu.executeKernel(kernel, args, cu.getNumAtoms());
peastman's avatar
peastman committed
2227
    for (auto& offset : cu.getPosCellOffsets())
2228
        offset = mm_int4(0, 0, 0, 0);
2229
    lastAtomOrder = cu.getAtomIndex();
2230
2231
2232
}

void CudaApplyMonteCarloBarostatKernel::restoreCoordinates(ContextImpl& context) {
2233
    cu.setAsCurrent();
2234
    int bytesToCopy = cu.getPosq().getSize()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
2235
    CUresult result = cuMemcpyDtoD(cu.getPosq().getDevicePointer(), savedPositions.getDevicePointer(), bytesToCopy);
2236
2237
2238
2239
    if (result != CUDA_SUCCESS) {
        std::stringstream m;
        m<<"Error restoring positions for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(m.str());
2240
    }
2241
    result = cuMemcpyDtoD(cu.getForce().getDevicePointer(), savedForces.getDevicePointer(), savedForces.getSize()*savedForces.getElementSize());
2242
2243
2244
2245
2246
    if (result != CUDA_SUCCESS) {
        std::stringstream m;
        m<<"Error restoring forces for MC barostat: "<<cu.getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(m.str());
    }
2247
}