CudaParallelKernels.cpp 40.4 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2011-2024 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

#include "CudaParallelKernels.h"
#include "CudaKernelSources.h"
29
#include "openmm/common/ContextSelector.h"
30
31
32
33
34

using namespace OpenMM;
using namespace std;


35
#define CHECK_RESULT(result, prefix) \
36
37
if (result != CUDA_SUCCESS) { \
    std::stringstream m; \
38
    m<<prefix<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
39
40
41
42
43
44
45
46
47
48
    throw OpenMMException(m.str());\
}

/**
 * Get the current clock time, measured in microseconds.
 */
#ifdef _MSC_VER
    #include <Windows.h>
    static long long getTime() {
        FILETIME ft;
49
        GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
        ULARGE_INTEGER result;
        result.LowPart = ft.dwLowDateTime;
        result.HighPart = ft.dwHighDateTime;
        return result.QuadPart/10;
    }
#else
    #include <sys/time.h> 
    static long long getTime() {
        struct timeval tod;
        gettimeofday(&tod, 0);
        return 1000000*tod.tv_sec+tod.tv_usec;
    }
#endif

class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask {
public:
    BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
67
68
            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event, int2& interactionCount) : context(context), cu(cu), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), interactionCount(interactionCount) {
69
70
71
72
    }
    void execute() {
        // Copy coordinates over to this device and execute the kernel.

73
        ContextSelector selector(cu);
74
        if (cu.getContextIndex() > 0) {
75
76
            cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
            if (!cu.getPlatformData().peerAccessSupported)
77
78
                cu.getPosq().upload(pinnedMemory, false);
        }
79
        kernel.beginComputation(context, includeForce, includeEnergy, groups);
80
        if (cu.getNonbondedUtilities().getUsePeriodic())
81
            cu.getNonbondedUtilities().getInteractionCount().download(&interactionCount, false);
82
83
84
85
86
87
88
89
    }
private:
    ContextImpl& context;
    CudaContext& cu;
    CudaCalcForcesAndEnergyKernel& kernel;
    bool includeForce, includeEnergy;
    int groups;
    void* pinnedMemory;
root's avatar
root committed
90
    CUevent event;
91
    int2& interactionCount;
92
93
94
95
96
};

class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
public:
    FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
97
            bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces,
98
            bool& valid, int2& interactionCount, CUstream stream, CUevent event, CUevent localEvent, bool loadBalance) :
99
            context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
100
            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount),
101
            stream(stream), event(event), localEvent(localEvent), loadBalance(loadBalance) {
102
103
104
    }
    void execute() {
        // Execute the kernel, then download forces.
105
        
106
        ContextSelector selector(cu);
107
        energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
108
        if (loadBalance) {
109
110
111
112
113
            // Record timing information for load balancing.  Since this takes time, only do it at the start of the simulation.

            CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
            completionTime = getTime();
        }
114
115
        if (includeForce) {
            if (cu.getContextIndex() > 0) {
116
117
                cuEventRecord(localEvent, cu.getCurrentStream());
                cuStreamWaitEvent(stream, localEvent, 0);
118
                int numAtoms = cu.getPaddedNumAtoms();
119
120
121
122
                if (cu.getPlatformData().peerAccessSupported) {
                    int numBytes = numAtoms*3*sizeof(long long);
                    int offset = (cu.getContextIndex()-1)*numBytes;
                    CudaContext& context0 = *cu.getPlatformData().contexts[0];
123
124
                    CHECK_RESULT(cuMemcpyAsync(contextForces.getDevicePointer()+offset, cu.getForce().getDevicePointer(), numBytes, stream), "Error copying forces");
                    cuEventRecord(event, stream);
125
126
127
                }
                else
                    cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
128
129
            }
        }
130
131
        if (cu.getNonbondedUtilities().getUsePeriodic() && (interactionCount.x > cu.getNonbondedUtilities().getInteractingTiles().getSize() ||
                interactionCount.y > cu.getNonbondedUtilities().getSinglePairs().getSize())) {
peastman's avatar
peastman committed
132
133
134
            valid = false;
            cu.getNonbondedUtilities().updateNeighborListSize();
        }
135
136
137
138
139
    }
private:
    ContextImpl& context;
    CudaContext& cu;
    CudaCalcForcesAndEnergyKernel& kernel;
140
    bool includeForce, includeEnergy, loadBalance;
141
142
143
    int groups;
    double& energy;
    long long& completionTime;
Peter Eastman's avatar
Peter Eastman committed
144
    long long* pinnedMemory;
145
    CudaArray& contextForces;
146
    bool& valid;
147
    int2& interactionCount;
148
149
150
    CUstream stream;
    CUevent event;
    CUevent localEvent;
151
152
153
};

CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
154
        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
155
        interactionCounts(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
156
157
158
159
160
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
}

CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel() {
161
    ContextSelector selector(*data.contexts[0]);
162
163
164
165
    if (pinnedPositionBuffer != NULL)
        cuMemFreeHost(pinnedPositionBuffer);
    if (pinnedForceBuffer != NULL)
        cuMemFreeHost(pinnedForceBuffer);
root's avatar
root committed
166
    cuEventDestroy(event);
167
168
169
170
171
172
    for (int i = 0; i < peerCopyEvent.size(); i++)
        cuEventDestroy(peerCopyEvent[i]);
    for (int i = 0; i < peerCopyEventLocal.size(); i++)
        cuEventDestroy(peerCopyEventLocal[i]);
    for (int i = 0; i < peerCopyStream.size(); i++)
        cuStreamDestroy(peerCopyStream[i]);
173
174
    if (interactionCounts != NULL)
        cuMemFreeHost(interactionCounts);
175
176
177
178
}

void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
    CudaContext& cu = *data.contexts[0];
179
    ContextSelector selector(cu);
180
181
    CUmodule module = cu.createModule(CudaKernelSources::parallel);
    sumKernel = cu.getKernel(module, "sumForces");
182
183
    int numContexts = data.contexts.size();
    for (int i = 0; i < numContexts; i++)
184
        getKernel(i).initialize(system);
185
186
187
188
189
    for (int i = 0; i < contextNonbondedFractions.size(); i++) {
        double x0 = i/(double) contextNonbondedFractions.size();
        double x1 = (i+1)/(double) contextNonbondedFractions.size();
        contextNonbondedFractions[i] = x1*x1 - x0*x0;
    }
190
    CHECK_RESULT(cuEventCreate(&event, cu.getEventFlags()), "Error creating event");
191
192
193
194
    peerCopyEvent.resize(numContexts);
    peerCopyEventLocal.resize(numContexts);
    peerCopyStream.resize(numContexts);
    for (int i = 0; i < numContexts; i++) {
195
        CHECK_RESULT(cuEventCreate(&peerCopyEvent[i], cu.getEventFlags()), "Error creating event");
196
197
198
199
200
        CHECK_RESULT(cuStreamCreate(&peerCopyStream[i], CU_STREAM_NON_BLOCKING), "Error creating stream");
    }
    for (int i = 0; i < numContexts; i++) {
        CudaContext& cuLocal = *data.contexts[i];
        ContextSelector selectorLocal(cuLocal);
201
        CHECK_RESULT(cuEventCreate(&peerCopyEventLocal[i], cu.getEventFlags()), "Error creating event");
202
    }
203
    CHECK_RESULT(cuMemHostAlloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
204
205
206
207
}

void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
    CudaContext& cu = *data.contexts[0];
208
    ContextSelector selector(cu);
209
210
    if (!contextForces.isInitialized()) {
        contextForces.initialize<long long>(cu, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms(), "contextForces");
211
212
        CHECK_RESULT(cuMemHostAlloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), CU_MEMHOSTALLOC_PORTABLE), "Error allocating pinned memory");
        CHECK_RESULT(cuMemHostAlloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), CU_MEMHOSTALLOC_PORTABLE), "Error allocating pinned memory");
213
    }
214
    loadBalance = (cu.getComputeForceCount() < 200 || cu.getComputeForceCount()%30 == 0);
215
216
217

    // Copy coordinates over to each device and execute the kernel.
    
218
    if (!cu.getPlatformData().peerAccessSupported) {
root's avatar
root committed
219
220
221
        cu.getPosq().download(pinnedPositionBuffer, false);
        cuEventRecord(event, cu.getCurrentStream());
    }
222
223
    else {
        int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
224
        cuEventRecord(event, cu.getCurrentStream());
225
226
227
228
229
        for (int i = 1; i < (int) data.contexts.size(); i++) {
            cuStreamWaitEvent(peerCopyStream[i], event, 0);
            CHECK_RESULT(cuMemcpyAsync(data.contexts[i]->getPosq().getDevicePointer(), cu.getPosq().getDevicePointer(), numBytes, peerCopyStream[i]), "Error copying positions");
            cuEventRecord(peerCopyEvent[i], peerCopyStream[i]);
        }
230
    }
231
232
233
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        data.contextEnergy[i] = 0.0;
        CudaContext& cu = *data.contexts[i];
234
        ComputeContext::WorkThread& thread = cu.getWorkThread();
235
236
        CUevent waitEvent = (cu.getPlatformData().peerAccessSupported ? peerCopyEvent[i] : event);
        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, waitEvent, interactionCounts[i]));
237
    }
238
    data.syncContexts();
239
240
}

241
double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
242
243
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
244
        ComputeContext::WorkThread& thread = cu.getWorkThread();
245
        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i],
246
                pinnedForceBuffer, contextForces, valid, interactionCounts[i], peerCopyStream[i], peerCopyEvent[i], peerCopyEventLocal[i], loadBalance));
247
248
    }
    data.syncContexts();
249
250
251
252
253
    CudaContext& cu = *data.contexts[0];
    ContextSelector selector(cu);
    if (cu.getPlatformData().peerAccessSupported)
        for (int i = 1; i < data.contexts.size(); i++)
            cuStreamWaitEvent(cu.getCurrentStream(), peerCopyEvent[i], 0);
254
255
256
    double energy = 0.0;
    for (int i = 0; i < (int) data.contextEnergy.size(); i++)
        energy += data.contextEnergy[i];
257
    if (includeForce && valid) {
258
259
        // Sum the forces from all devices.
        
260
        if (!cu.getPlatformData().peerAccessSupported)
261
            contextForces.upload(pinnedForceBuffer, false);
262
263
        int bufferSize = 3*cu.getPaddedNumAtoms();
        int numBuffers = data.contexts.size()-1;
264
        void* args[] = {&cu.getForce().getDevicePointer(), &contextForces.getDevicePointer(), &bufferSize, &numBuffers};
265
266
        cu.executeKernel(sumKernel, args, bufferSize);
        
267
        // Balance work between the contexts by transferring a little nonbonded work from the context that
268
269
        // finished last to the one that finished first.
        
270
        if (loadBalance) {
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
            int firstIndex = 0, lastIndex = 0;
            for (int i = 0; i < (int) completionTimes.size(); i++) {
                if (completionTimes[i] < completionTimes[firstIndex])
                    firstIndex = i;
                if (completionTimes[i] > completionTimes[lastIndex])
                    lastIndex = i;
            }
            double fractionToTransfer = min(0.01, contextNonbondedFractions[lastIndex]);
            contextNonbondedFractions[firstIndex] += fractionToTransfer;
            contextNonbondedFractions[lastIndex] -= fractionToTransfer;
            double startFraction = 0.0;
            for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
                double endFraction = startFraction+contextNonbondedFractions[i];
                if (i == contextNonbondedFractions.size()-1)
                    endFraction = 1.0; // Avoid roundoff error
                data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
                startFraction = endFraction;
            }
	}
290
291
292
293
294
295
    }
    return energy;
}

class CudaParallelCalcHarmonicBondForceKernel::Task : public CudaContext::WorkTask {
public:
296
    Task(ContextImpl& context, CommonCalcHarmonicBondForceKernel& kernel, bool includeForce,
297
298
299
300
301
302
303
304
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
305
    CommonCalcHarmonicBondForceKernel& kernel;
306
307
308
309
    bool includeForce, includeEnergy;
    double& energy;
};

310
CudaParallelCalcHarmonicBondForceKernel::CudaParallelCalcHarmonicBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
311
312
        CalcHarmonicBondForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
313
        kernels.push_back(Kernel(new CommonCalcHarmonicBondForceKernel(name, platform, *data.contexts[i], system)));
314
315
316
317
318
319
320
321
322
323
}

void CudaParallelCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcHarmonicBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
324
        ComputeContext::WorkThread& thread = cu.getWorkThread();
325
326
327
328
329
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

330
void CudaParallelCalcHarmonicBondForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force, int firstBond, int lastBond) {
331
    for (int i = 0; i < (int) kernels.size(); i++)
332
        getKernel(i).copyParametersToContext(context, force, firstBond, lastBond);
333
334
335
336
}

class CudaParallelCalcCustomBondForceKernel::Task : public CudaContext::WorkTask {
public:
337
    Task(ContextImpl& context, CommonCalcCustomBondForceKernel& kernel, bool includeForce,
338
339
340
341
342
343
344
345
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
346
    CommonCalcCustomBondForceKernel& kernel;
347
348
349
350
    bool includeForce, includeEnergy;
    double& energy;
};

351
CudaParallelCalcCustomBondForceKernel::CudaParallelCalcCustomBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
352
353
        CalcCustomBondForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
354
        kernels.push_back(Kernel(new CommonCalcCustomBondForceKernel(name, platform, *data.contexts[i], system)));
355
356
357
358
359
360
361
362
363
364
}

void CudaParallelCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcCustomBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
365
        ComputeContext::WorkThread& thread = cu.getWorkThread();
366
367
368
369
370
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

371
void CudaParallelCalcCustomBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomBondForce& force, int firstBond, int lastBond) {
372
    for (int i = 0; i < (int) kernels.size(); i++)
373
        getKernel(i).copyParametersToContext(context, force, firstBond, lastBond);
374
375
376
377
}

class CudaParallelCalcHarmonicAngleForceKernel::Task : public CudaContext::WorkTask {
public:
378
    Task(ContextImpl& context, CommonCalcHarmonicAngleForceKernel& kernel, bool includeForce,
379
380
381
382
383
384
385
386
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
387
    CommonCalcHarmonicAngleForceKernel& kernel;
388
389
390
391
    bool includeForce, includeEnergy;
    double& energy;
};

392
CudaParallelCalcHarmonicAngleForceKernel::CudaParallelCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
393
394
        CalcHarmonicAngleForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
395
        kernels.push_back(Kernel(new CommonCalcHarmonicAngleForceKernel(name, platform, *data.contexts[i], system)));
396
397
398
399
400
401
402
403
404
405
}

void CudaParallelCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
406
        ComputeContext::WorkThread& thread = cu.getWorkThread();
407
408
409
410
411
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

412
void CudaParallelCalcHarmonicAngleForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force, int firstAngle, int lastAngle) {
413
    for (int i = 0; i < (int) kernels.size(); i++)
414
        getKernel(i).copyParametersToContext(context, force, firstAngle, lastAngle);
415
416
417
418
}

class CudaParallelCalcCustomAngleForceKernel::Task : public CudaContext::WorkTask {
public:
419
    Task(ContextImpl& context, CommonCalcCustomAngleForceKernel& kernel, bool includeForce,
420
421
422
423
424
425
426
427
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
428
    CommonCalcCustomAngleForceKernel& kernel;
429
430
431
432
    bool includeForce, includeEnergy;
    double& energy;
};

433
CudaParallelCalcCustomAngleForceKernel::CudaParallelCalcCustomAngleForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
434
435
        CalcCustomAngleForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
436
        kernels.push_back(Kernel(new CommonCalcCustomAngleForceKernel(name, platform, *data.contexts[i], system)));
437
438
439
440
441
442
443
444
445
446
}

void CudaParallelCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcCustomAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
447
        ComputeContext::WorkThread& thread = cu.getWorkThread();
448
449
450
451
452
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

453
void CudaParallelCalcCustomAngleForceKernel::copyParametersToContext(ContextImpl& context, const CustomAngleForce& force, int firstAngle, int lastAngle) {
454
    for (int i = 0; i < (int) kernels.size(); i++)
455
        getKernel(i).copyParametersToContext(context, force, firstAngle, lastAngle);
456
457
458
459
}

class CudaParallelCalcPeriodicTorsionForceKernel::Task : public CudaContext::WorkTask {
public:
460
    Task(ContextImpl& context, CommonCalcPeriodicTorsionForceKernel& kernel, bool includeForce,
461
462
463
464
465
466
467
468
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
469
    CommonCalcPeriodicTorsionForceKernel& kernel;
470
471
472
473
    bool includeForce, includeEnergy;
    double& energy;
};

474
CudaParallelCalcPeriodicTorsionForceKernel::CudaParallelCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
475
476
        CalcPeriodicTorsionForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
477
        kernels.push_back(Kernel(new CommonCalcPeriodicTorsionForceKernel(name, platform, *data.contexts[i], system)));
478
479
480
481
482
483
484
485
486
487
}

void CudaParallelCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
488
        ComputeContext::WorkThread& thread = cu.getWorkThread();
489
490
491
492
493
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

494
void CudaParallelCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force, int firstTorsion, int lastTorsion) {
495
    for (int i = 0; i < (int) kernels.size(); i++)
496
        getKernel(i).copyParametersToContext(context, force, firstTorsion, lastTorsion);
497
498
499
500
}

class CudaParallelCalcRBTorsionForceKernel::Task : public CudaContext::WorkTask {
public:
501
    Task(ContextImpl& context, CommonCalcRBTorsionForceKernel& kernel, bool includeForce,
502
503
504
505
506
507
508
509
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
510
    CommonCalcRBTorsionForceKernel& kernel;
511
512
513
514
    bool includeForce, includeEnergy;
    double& energy;
};

515
CudaParallelCalcRBTorsionForceKernel::CudaParallelCalcRBTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
516
517
        CalcRBTorsionForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
518
        kernels.push_back(Kernel(new CommonCalcRBTorsionForceKernel(name, platform, *data.contexts[i], system)));
519
520
521
522
523
524
525
526
527
528
}

void CudaParallelCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
529
        ComputeContext::WorkThread& thread = cu.getWorkThread();
530
531
532
533
534
535
536
537
538
539
540
541
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

void CudaParallelCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context, const RBTorsionForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).copyParametersToContext(context, force);
}

class CudaParallelCalcCMAPTorsionForceKernel::Task : public CudaContext::WorkTask {
public:
542
    Task(ContextImpl& context, CommonCalcCMAPTorsionForceKernel& kernel, bool includeForce,
543
544
545
546
547
548
549
550
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
551
    CommonCalcCMAPTorsionForceKernel& kernel;
552
553
554
555
    bool includeForce, includeEnergy;
    double& energy;
};

556
CudaParallelCalcCMAPTorsionForceKernel::CudaParallelCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
557
558
        CalcCMAPTorsionForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
559
        kernels.push_back(Kernel(new CommonCalcCMAPTorsionForceKernel(name, platform, *data.contexts[i], system)));
560
561
562
563
564
565
566
567
568
569
}

void CudaParallelCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
570
        ComputeContext::WorkThread& thread = cu.getWorkThread();
571
572
573
574
575
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

576
577
578
579
580
void CudaParallelCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).copyParametersToContext(context, force);
}

581
582
class CudaParallelCalcCustomTorsionForceKernel::Task : public CudaContext::WorkTask {
public:
583
    Task(ContextImpl& context, CommonCalcCustomTorsionForceKernel& kernel, bool includeForce,
584
585
586
587
588
589
590
591
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
592
    CommonCalcCustomTorsionForceKernel& kernel;
593
594
595
596
    bool includeForce, includeEnergy;
    double& energy;
};

597
CudaParallelCalcCustomTorsionForceKernel::CudaParallelCalcCustomTorsionForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
598
599
        CalcCustomTorsionForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
600
        kernels.push_back(Kernel(new CommonCalcCustomTorsionForceKernel(name, platform, *data.contexts[i], system)));
601
602
603
604
605
606
607
608
609
610
}

void CudaParallelCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcCustomTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
611
        ComputeContext::WorkThread& thread = cu.getWorkThread();
612
613
614
615
616
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

617
void CudaParallelCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force, int firstTorsion, int lastTorsion) {
618
    for (int i = 0; i < (int) kernels.size(); i++)
619
        getKernel(i).copyParametersToContext(context, force, firstTorsion, lastTorsion);
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
}

class CudaParallelCalcNonbondedForceKernel::Task : public CudaContext::WorkTask {
public:
    Task(ContextImpl& context, CudaCalcNonbondedForceKernel& kernel, bool includeForce,
            bool includeEnergy, bool includeDirect, bool includeReciprocal, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), includeDirect(includeDirect), includeReciprocal(includeReciprocal), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy, includeDirect, includeReciprocal);
    }
private:
    ContextImpl& context;
    CudaCalcNonbondedForceKernel& kernel;
    bool includeForce, includeEnergy, includeDirect, includeReciprocal;
    double& energy;
};

638
CudaParallelCalcNonbondedForceKernel::CudaParallelCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
639
640
641
642
643
644
645
646
647
648
649
650
651
        CalcNonbondedForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new CudaCalcNonbondedForceKernel(name, platform, *data.contexts[i], system)));
}

void CudaParallelCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
652
        ComputeContext::WorkThread& thread = cu.getWorkThread();
653
654
655
656
657
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, includeDirect, includeReciprocal, data.contextEnergy[i]));
    }
    return 0.0;
}

658
void CudaParallelCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force, int firstParticle, int lastParticle, int firstException, int lastException) {
659
    for (int i = 0; i < (int) kernels.size(); i++)
660
        getKernel(i).copyParametersToContext(context, force, firstParticle, lastParticle, firstException, lastException);
661
662
}

663
664
665
666
void CudaParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
}

667
668
669
670
void CudaParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    dynamic_cast<const CudaCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
}

671
672
class CudaParallelCalcCustomNonbondedForceKernel::Task : public CudaContext::WorkTask {
public:
673
    Task(ContextImpl& context, CommonCalcCustomNonbondedForceKernel& kernel, bool includeForce,
674
675
676
677
678
679
680
681
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
682
    CommonCalcCustomNonbondedForceKernel& kernel;
683
684
685
686
    bool includeForce, includeEnergy;
    double& energy;
};

687
CudaParallelCalcCustomNonbondedForceKernel::CudaParallelCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
688
689
        CalcCustomNonbondedForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
690
        kernels.push_back(Kernel(new CommonCalcCustomNonbondedForceKernel(name, platform, *data.contexts[i], system)));
691
692
693
694
695
696
697
698
699
700
}

void CudaParallelCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
701
        ComputeContext::WorkThread& thread = cu.getWorkThread();
702
703
704
705
706
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

707
void CudaParallelCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force, int firstParticle, int lastParticle) {
708
    for (int i = 0; i < (int) kernels.size(); i++)
709
        getKernel(i).copyParametersToContext(context, force, firstParticle, lastParticle);
710
711
712
713
}

class CudaParallelCalcCustomExternalForceKernel::Task : public CudaContext::WorkTask {
public:
714
    Task(ContextImpl& context, CommonCalcCustomExternalForceKernel& kernel, bool includeForce,
715
716
717
718
719
720
721
722
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
723
    CommonCalcCustomExternalForceKernel& kernel;
724
725
726
727
    bool includeForce, includeEnergy;
    double& energy;
};

728
CudaParallelCalcCustomExternalForceKernel::CudaParallelCalcCustomExternalForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
729
730
        CalcCustomExternalForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
731
        kernels.push_back(Kernel(new CommonCalcCustomExternalForceKernel(name, platform, *data.contexts[i], system)));
732
733
734
735
736
737
738
739
740
741
}

void CudaParallelCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcCustomExternalForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
742
        ComputeContext::WorkThread& thread = cu.getWorkThread();
743
744
745
746
747
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

748
void CudaParallelCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& context, const CustomExternalForce& force, int firstParticle, int lastParticle) {
749
    for (int i = 0; i < (int) kernels.size(); i++)
750
        getKernel(i).copyParametersToContext(context, force, firstParticle, lastParticle);
751
752
753
754
}

class CudaParallelCalcCustomHbondForceKernel::Task : public CudaContext::WorkTask {
public:
755
    Task(ContextImpl& context, CommonCalcCustomHbondForceKernel& kernel, bool includeForce,
756
757
758
759
760
761
762
763
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
764
    CommonCalcCustomHbondForceKernel& kernel;
765
766
767
768
    bool includeForce, includeEnergy;
    double& energy;
};

769
CudaParallelCalcCustomHbondForceKernel::CudaParallelCalcCustomHbondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
770
771
        CalcCustomHbondForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
772
        kernels.push_back(Kernel(new CommonCalcCustomHbondForceKernel(name, platform, *data.contexts[i], system)));
773
774
775
776
777
778
779
780
781
782
}

void CudaParallelCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
783
        ComputeContext::WorkThread& thread = cu.getWorkThread();
784
785
786
787
788
789
790
791
792
793
794
795
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

void CudaParallelCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl& context, const CustomHbondForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).copyParametersToContext(context, force);
}

class CudaParallelCalcCustomCompoundBondForceKernel::Task : public CudaContext::WorkTask {
public:
796
    Task(ContextImpl& context, CommonCalcCustomCompoundBondForceKernel& kernel, bool includeForce,
797
798
799
800
801
802
803
804
            bool includeEnergy, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy);
    }
private:
    ContextImpl& context;
805
    CommonCalcCustomCompoundBondForceKernel& kernel;
806
807
808
809
    bool includeForce, includeEnergy;
    double& energy;
};

810
CudaParallelCalcCustomCompoundBondForceKernel::CudaParallelCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, CudaPlatform::PlatformData& data, const System& system) :
811
812
        CalcCustomCompoundBondForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
813
        kernels.push_back(Kernel(new CommonCalcCustomCompoundBondForceKernel(name, platform, *data.contexts[i], system)));
814
815
816
817
818
819
820
821
822
823
}

void CudaParallelCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double CudaParallelCalcCustomCompoundBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        CudaContext& cu = *data.contexts[i];
824
        ComputeContext::WorkThread& thread = cu.getWorkThread();
825
826
827
828
829
830
831
832
833
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
    }
    return 0.0;
}

void CudaParallelCalcCustomCompoundBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).copyParametersToContext(context, force);
}