OpenCLNonbondedUtilities.cpp 41 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2009-2023 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

27
#include "openmm/OpenMMException.h"
28
29
#include "OpenCLNonbondedUtilities.h"
#include "OpenCLArray.h"
30
#include "OpenCLContext.h"
31
#include "OpenCLKernelSources.h"
32
#include "OpenCLExpressionUtilities.h"
33
34
#include "OpenCLSort.h"
#include <algorithm>
35
#include <map>
36
37
#include <set>
#include <utility>
38
39
40
41

using namespace OpenMM;
using namespace std;

42
43
class OpenCLNonbondedUtilities::BlockSortTrait : public OpenCLSort::SortTrait {
public:
44
45
46
47
48
49
50
51
52
    BlockSortTrait() {}
    int getDataSize() const {return sizeof(int);}
    int getKeySize() const {return sizeof(int);}
    const char* getDataType() const {return "unsigned int";}
    const char* getKeyType() const {return "unsigned int";}
    const char* getMinKey() const {return "0";}
    const char* getMaxKey() const {return "0xFFFFFFFFu";}
    const char* getMaxValue() const {return "0xFFFFFFFFu";}
    const char* getSortKey() const {return "value";}
53
54
};

55
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), useNeighborList(false), anyExclusions(false), usePadding(true),
56
        blockSorter(NULL), pinnedCountBuffer(NULL), pinnedCountMemory(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), tilesAfterReorder(0) {
57
    // Decide how many thread blocks and force buffers to use.
58

59
    deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
60
61
62
63
64
    if (deviceIsCpu) {
        numForceThreadBlocks = context.getNumThreadBlocks();
        forceThreadBlockSize = 1;
    }
    else if (context.getSIMDWidth() == 32) {
65
66
67
68
69
70
71
72
        int blocksPerComputeUnit = 4;
        std::string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
        if (vendor.size() >= 5 && vendor.substr(0, 5) == "Apple") {
            // 1536 threads per GPU core.
            blocksPerComputeUnit = 6;
        }
        numForceThreadBlocks = blocksPerComputeUnit*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
        forceThreadBlockSize = 256;
73
    }
74
    else {
75
        numForceThreadBlocks = context.getNumThreadBlocks();
76
        forceThreadBlockSize = (context.getSIMDWidth() >= 32 ? OpenCLContext::ThreadBlockSize : 32);
77
    }
78
79
    pinnedCountBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned int));
    pinnedCountMemory = (unsigned int*) context.getQueue().enqueueMapBuffer(*pinnedCountBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(int));
80
81
82
83
84
85
    
    // When building the neighbor list, we can optionally use large blocks (1024 atoms) to
    // accelerate the process.  This makes building the neighbor list faster, but it prevents
    // us from sorting atom blocks by size, which leads to a slightly less efficient neighbor
    // list.  We guess based on system size which will be faster.

86
    useLargeBlocks = (!deviceIsCpu && context.getNumAtoms() > 100000);
87
88
89
90

    std::string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
    isAMD = !deviceIsCpu && ((vendor.size() >= 3 && vendor.substr(0, 3) == "AMD") || (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc."));

91
    setKernelSource(deviceIsCpu ? OpenCLKernelSources::nonbonded_cpu : OpenCLKernelSources::nonbonded);
92
93
94
}

OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
95
96
    if (blockSorter != NULL)
        delete blockSorter;
97
98
    if (pinnedCountBuffer != NULL)
        delete pinnedCountBuffer;
99
100
}

101
void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool useNeighborList) {
102
    if (groupCutoff.size() > 0) {
103
104
105
106
        if (usesCutoff != useCutoff)
            throw OpenMMException("All Forces must agree on whether to use a cutoff");
        if (usesPeriodic != usePeriodic)
            throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions");
107
108
        if (usesCutoff && groupCutoff.find(forceGroup) != groupCutoff.end() && groupCutoff[forceGroup] != cutoffDistance)
            throw OpenMMException("All Forces in a single force group must use the same cutoff distance");
109
    }
110
111
112
113
    if (usesExclusions)
        requestExclusions(exclusionList);
    useCutoff = usesCutoff;
    usePeriodic = usesPeriodic;
114
    this->useNeighborList |= ((useNeighborList || deviceIsCpu) && useCutoff);
115
116
117
118
119
120
121
122
123
124
    groupCutoff[forceGroup] = cutoffDistance;
    groupFlags |= 1<<forceGroup;
    if (kernel.size() > 0) {
        if (groupKernelSource.find(forceGroup) == groupKernelSource.end())
            groupKernelSource[forceGroup] = "";
        map<string, string> replacements;
        replacements["CUTOFF"] = "CUTOFF_"+context.intToString(forceGroup);
        replacements["CUTOFF_SQUARED"] = "CUTOFF_"+context.intToString(forceGroup)+"_SQUARED";
        groupKernelSource[forceGroup] += context.replaceStrings(kernel, replacements)+"\n";
    }
125
126
}

127
128
void OpenCLNonbondedUtilities::addParameter(ComputeParameterInfo parameter) {
    parameters.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
129
            parameter.getSize(), context.unwrap(parameter.getArray()).getDeviceBuffer(), parameter.isConstant()));
130
131
}

132
133
134
135
void OpenCLNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
    parameters.push_back(parameter);
}

136
137
void OpenCLNonbondedUtilities::addArgument(ComputeParameterInfo parameter) {
    arguments.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
138
            parameter.getSize(), context.unwrap(parameter.getArray()).getDeviceBuffer(), parameter.isConstant()));
139
140
}

141
142
143
144
void OpenCLNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
    arguments.push_back(parameter);
}

145
146
147
148
149
150
151
152
153
154
155
156
157
string OpenCLNonbondedUtilities::addEnergyParameterDerivative(const string& param) {
    // See if the parameter has already been added.
    
    int index;
    for (index = 0; index < energyParameterDerivatives.size(); index++)
        if (param == energyParameterDerivatives[index])
            break;
    if (index == energyParameterDerivatives.size())
        energyParameterDerivatives.push_back(param);
    context.addEnergyParameterDerivative(param);
    return string("energyParamDeriv")+context.intToString(index);
}

158
159
void OpenCLNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclusionList) {
    if (anyExclusions) {
160
        bool sameExclusions = (exclusionList.size() == atomExclusions.size());
161
        for (int i = 0; i < (int) exclusionList.size() && sameExclusions; i++) {
162
163
            if (exclusionList[i].size() != atomExclusions[i].size())
                sameExclusions = false;
164
165
            set<int> expectedExclusions;
            expectedExclusions.insert(atomExclusions[i].begin(), atomExclusions[i].end());
166
            for (int j = 0; j < (int) exclusionList[i].size(); j++)
167
                if (expectedExclusions.find(exclusionList[i][j]) == expectedExclusions.end())
168
169
170
171
172
                    sameExclusions = false;
        }
        if (!sameExclusions)
            throw OpenMMException("All Forces must have identical exceptions");
    }
173
    else {
174
        atomExclusions = exclusionList;
175
176
        anyExclusions = true;
    }
177
178
}

179
static bool compareInt2(mm_int2 a, mm_int2 b) {
peastman's avatar
peastman committed
180
181
182
183
184
    // This version is used on devices with SIMD width of 32 or less.  It sorts tiles to improve cache efficiency.

    return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
}

185
static bool compareInt2LargeSIMD(mm_int2 a, mm_int2 b) {
peastman's avatar
peastman committed
186
187
188
189
190
191
192
193
194
195
    // This version is used on devices with SIMD width greater than 32.  It puts diagonal tiles before off-diagonal
    // ones to reduce thread divergence.
    
    if (a.x == a.y) {
        if (b.x == b.y)
            return (a.x < b.x);
        return true;
    }
    if (b.x == b.y)
        return false;
196
197
198
    return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
}

199
void OpenCLNonbondedUtilities::initialize(const System& system) {
200
201
    if (atomExclusions.size() == 0) {
        // No exclusions were specifically requested, so just mark every atom as not interacting with itself.
202

203
        atomExclusions.resize(context.getNumAtoms());
204
        for (int i = 0; i < (int) atomExclusions.size(); i++)
205
206
207
            atomExclusions[i].push_back(i);
    }

208
209
210
    // Create the list of tiles.

    int numAtomBlocks = context.getNumAtomBlocks();
211
    int numContexts = context.getPlatformData().contexts.size();
212
    setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);
213

214
    // Build a list of tiles that contain exclusions.
215

216
217
218
219
220
221
222
223
224
    set<pair<int, int> > tilesWithExclusions;
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
        int x = atom1/OpenCLContext::TileSize;
        for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
            int atom2 = atomExclusions[atom1][j];
            int y = atom2/OpenCLContext::TileSize;
            tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
        }
    }
225
    vector<mm_int2> exclusionTilesVec;
226
    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
227
        exclusionTilesVec.push_back(mm_int2(iter->first, iter->second));
228
    sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), context.getSIMDWidth() <= 32 || !useNeighborList ? compareInt2 : compareInt2LargeSIMD);
229
    exclusionTiles.initialize<mm_int2>(context, exclusionTilesVec.size(), "exclusionTiles");
peastman's avatar
peastman committed
230
    exclusionTiles.upload(exclusionTilesVec);
231
232
    map<pair<int, int>, int> exclusionTileMap;
    for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
233
        mm_int2 tile = exclusionTilesVec[i];
234
235
236
237
238
239
240
        exclusionTileMap[make_pair(tile.x, tile.y)] = i;
    }
    vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
    for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
        exclusionBlocksForBlock[iter->first].push_back(iter->second);
        if (iter->first != iter->second)
            exclusionBlocksForBlock[iter->second].push_back(iter->first);
241
242
243
    }
    vector<cl_uint> exclusionRowIndicesVec(numAtomBlocks+1, 0);
    vector<cl_uint> exclusionIndicesVec;
244
245
246
    for (int i = 0; i < numAtomBlocks; i++) {
        exclusionIndicesVec.insert(exclusionIndicesVec.end(), exclusionBlocksForBlock[i].begin(), exclusionBlocksForBlock[i].end());
        exclusionRowIndicesVec[i+1] = exclusionIndicesVec.size();
247
    }
248
249
250
    maxExclusions = 0;
    for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
        maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
peastman's avatar
peastman committed
251
252
253
254
    exclusionIndices.initialize<cl_uint>(context, exclusionIndicesVec.size(), "exclusionIndices");
    exclusionRowIndices.initialize<cl_uint>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
    exclusionIndices.upload(exclusionIndicesVec);
    exclusionRowIndices.upload(exclusionRowIndicesVec);
255
256
257

    // Record the exclusion data.

peastman's avatar
peastman committed
258
    exclusions.initialize<cl_uint>(context, tilesWithExclusions.size()*OpenCLContext::TileSize, "exclusions");
259
    cl_uint allFlags = (cl_uint) -1;
peastman's avatar
peastman committed
260
261
    vector<cl_uint> exclusionVec(exclusions.getSize(), allFlags);
    for (int i = 0; i < exclusions.getSize(); ++i)
262
263
264
265
266
267
268
269
270
        exclusionVec[i] = 0xFFFFFFFF;
    for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
        int x = atom1/OpenCLContext::TileSize;
        int offset1 = atom1-x*OpenCLContext::TileSize;
        for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
            int atom2 = atomExclusions[atom1][j];
            int y = atom2/OpenCLContext::TileSize;
            int offset2 = atom2-y*OpenCLContext::TileSize;
            if (x > y) {
271
272
                int index = exclusionTileMap[make_pair(x, y)]*OpenCLContext::TileSize;
                exclusionVec[index+offset1] &= allFlags-(1<<offset2);
273
274
            }
            else {
275
276
                int index = exclusionTileMap[make_pair(y, x)]*OpenCLContext::TileSize;
                exclusionVec[index+offset2] &= allFlags-(1<<offset1);
277
278
279
280
            }
        }
    }
    atomExclusions.clear(); // We won't use this again, so free the memory it used
peastman's avatar
peastman committed
281
    exclusions.upload(exclusionVec);
282
283
284
285

    // Create data structures for the neighbor list.

    if (useCutoff) {
286
287
288
289
290
291
292
293
294
        // Select a size for the arrays that hold the neighbor list.  We have to make a fairly
        // arbitrary guess, but if this turns out to be too small we'll increase it later.

        int maxTiles = 20*numAtomBlocks;
        if (maxTiles > numTiles)
            maxTiles = numTiles;
        if (maxTiles < 1)
            maxTiles = 1;
        int numAtoms = context.getNumAtoms();
peastman's avatar
peastman committed
295
296
297
        interactingTiles.initialize<cl_int>(context, maxTiles, "interactingTiles");
        interactingAtoms.initialize<cl_int>(context, OpenCLContext::TileSize*maxTiles, "interactingAtoms");
        interactionCount.initialize<cl_uint>(context, 1, "interactionCount");
298
        int elementSize = (context.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
peastman's avatar
peastman committed
299
300
        blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
        blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
301
        sortedBlocks.initialize<cl_uint>(context, numAtomBlocks, "sortedBlocks");
peastman's avatar
peastman committed
302
303
        sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
        sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
304
305
        numBlockSizes = min((context.getNumAtomBlocks()+63)/64, context.getNumThreadBlocks());
        blockSizeRange.initialize(context, numBlockSizes, 2*elementSize, "blockSizeRange");
306
307
        largeBlockCenter.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockCenter");
        largeBlockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "largeBlockBoundingBox");
peastman's avatar
peastman committed
308
309
        oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
        rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
310
        blockSorter = new OpenCLSort(context, new BlockSortTrait(), numAtomBlocks, false);
311
        vector<cl_uint> count(1, 0);
peastman's avatar
peastman committed
312
        interactionCount.upload(count);
peastman's avatar
peastman committed
313
        rebuildNeighborList.upload(count);
314
    }
315
316
}

317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
static void setPeriodicBoxArgs(OpenCLContext& cl, cl::Kernel& kernel, int index) {
    if (cl.getUseDoublePrecision()) {
        kernel.setArg<mm_double4>(index++, cl.getPeriodicBoxSizeDouble());
        kernel.setArg<mm_double4>(index++, cl.getInvPeriodicBoxSizeDouble());
        kernel.setArg<mm_double4>(index++, cl.getPeriodicBoxVecXDouble());
        kernel.setArg<mm_double4>(index++, cl.getPeriodicBoxVecYDouble());
        kernel.setArg<mm_double4>(index, cl.getPeriodicBoxVecZDouble());
    }
    else {
        kernel.setArg<mm_float4>(index++, cl.getPeriodicBoxSize());
        kernel.setArg<mm_float4>(index++, cl.getInvPeriodicBoxSize());
        kernel.setArg<mm_float4>(index++, cl.getPeriodicBoxVecX());
        kernel.setArg<mm_float4>(index++, cl.getPeriodicBoxVecY());
        kernel.setArg<mm_float4>(index, cl.getPeriodicBoxVecZ());
    }
332
333
}

334
335
336
337
338
339
340
double OpenCLNonbondedUtilities::getMaxCutoffDistance() {
    double cutoff = 0.0;
    for (map<int, double>::const_iterator iter = groupCutoff.begin(); iter != groupCutoff.end(); ++iter)
        cutoff = max(cutoff, iter->second);
    return cutoff;
}

341
342
343
344
345
double OpenCLNonbondedUtilities::padCutoff(double cutoff) {
    double padding = (usePadding ? 0.1*cutoff : 0.0);
    return cutoff+padding;
}

346
347
348
349
350
351
void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
    if ((forceGroups&groupFlags) == 0)
        return;
    if (groupKernels.find(forceGroups) == groupKernels.end())
        createKernelsForGroups(forceGroups);
    KernelSet& kernels = groupKernels[forceGroups];
352
    if (useCutoff && usePeriodic) {
353
        mm_float4 box = context.getPeriodicBoxSize();
354
        double minAllowedSize = 1.999999*kernels.cutoffDistance;
355
356
357
        if (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize)
            throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
    }
358
359
360
361
    if (!useNeighborList)
        return;
    if (numTiles == 0)
        return;
362
363
364

    // Compute the neighbor list.

365
366
    if (lastCutoff != kernels.cutoffDistance)
        forceRebuildNeighborList = true;
367
    setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
368
369
    context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtomBlocks());
    context.executeKernel(kernels.computeSortKeysKernel, context.getNumAtomBlocks());
370
371
    if (useLargeBlocks)
        setPeriodicBoxArgs(context, kernels.sortBoxDataKernel, 12);
372
    blockSorter->sort(sortedBlocks);
373
374
375
376
377
    kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList);
    context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms());
    setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0);
    context.executeKernel(kernels.findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
    forceRebuildNeighborList = false;
378
    lastCutoff = kernels.cutoffDistance;
379
    context.getQueue().enqueueReadBuffer(interactionCount.getDeviceBuffer(), CL_FALSE, 0, sizeof(int), pinnedCountMemory, NULL, &downloadCountEvent);
380
381
    if (isAMD)
        context.getQueue().flush();
382
383
384
385
386
387

    #if __APPLE__ && defined(__aarch64__)
    // Segment the command stream to avoid stalls later.
    if (groupKernels[forceGroups].hasForces)
        context.getQueue().flush();
    #endif
388
389
}

390
void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
391
392
393
394
    if ((forceGroups&groupFlags) == 0)
        return;
    KernelSet& kernels = groupKernels[forceGroups];
    if (kernels.hasForces) {
395
396
        if (isAMD)
            context.getQueue().flush();
397
398
399
        cl::Kernel& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel);
        if (*reinterpret_cast<cl_kernel*>(&kernel) == NULL)
            kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
400
        if (useCutoff)
401
402
            setPeriodicBoxArgs(context, kernel, 9);
        context.executeKernel(kernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
403
    }
404
    if (useNeighborList && numTiles > 0) {
405
406
407
408
409
        #if __APPLE__ && defined(__aarch64__)
        // Ensure cached up work executes while you're waiting.
        if (kernels.hasForces)
            context.getQueue().flush();
        #endif
410
411
412
        downloadCountEvent.wait();
        updateNeighborListSize();
    }
413
414
}

415
bool OpenCLNonbondedUtilities::updateNeighborListSize() {
416
    if (!useCutoff)
417
        return false;
418
    if (context.getStepsSinceReorder() == 0 || tilesAfterReorder == 0)
419
420
421
        tilesAfterReorder = pinnedCountMemory[0];
    else if (context.getStepsSinceReorder() > 25 && pinnedCountMemory[0] > 1.1*tilesAfterReorder)
        context.forceReorder();
422
    if (pinnedCountMemory[0] <= interactingTiles.getSize())
423
        return false;
424
425
426
427

    // The most recent timestep had too many interactions to fit in the arrays.  Make the arrays bigger to prevent
    // this from happening in the future.

428
429
430
    unsigned int maxTiles = (unsigned int) (1.2*pinnedCountMemory[0]);
    unsigned int numBlocks = context.getNumAtomBlocks();
    int totalTiles = numBlocks*(numBlocks+1)/2;
431
432
    if (maxTiles > totalTiles)
        maxTiles = totalTiles;
peastman's avatar
peastman committed
433
    interactingTiles.resize(maxTiles);
434
    interactingAtoms.resize(OpenCLContext::TileSize*(size_t) maxTiles);
435
    for (map<int, KernelSet>::iterator iter = groupKernels.begin(); iter != groupKernels.end(); ++iter) {
436
437
        KernelSet& kernels = iter->second;
        if (*reinterpret_cast<cl_kernel*>(&kernels.forceKernel) != NULL) {
peastman's avatar
peastman committed
438
            kernels.forceKernel.setArg<cl::Buffer>(7, interactingTiles.getDeviceBuffer());
439
            kernels.forceKernel.setArg<cl_uint>(14, maxTiles);
peastman's avatar
peastman committed
440
            kernels.forceKernel.setArg<cl::Buffer>(17, interactingAtoms.getDeviceBuffer());
441
442
        }
        if (*reinterpret_cast<cl_kernel*>(&kernels.energyKernel) != NULL) {
peastman's avatar
peastman committed
443
            kernels.energyKernel.setArg<cl::Buffer>(7, interactingTiles.getDeviceBuffer());
444
            kernels.energyKernel.setArg<cl_uint>(14, maxTiles);
peastman's avatar
peastman committed
445
            kernels.energyKernel.setArg<cl::Buffer>(17, interactingAtoms.getDeviceBuffer());
446
447
        }
        if (*reinterpret_cast<cl_kernel*>(&kernels.forceEnergyKernel) != NULL) {
peastman's avatar
peastman committed
448
            kernels.forceEnergyKernel.setArg<cl::Buffer>(7, interactingTiles.getDeviceBuffer());
449
            kernels.forceEnergyKernel.setArg<cl_uint>(14, maxTiles);
peastman's avatar
peastman committed
450
            kernels.forceEnergyKernel.setArg<cl::Buffer>(17, interactingAtoms.getDeviceBuffer());
451
        }
peastman's avatar
peastman committed
452
453
        kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles.getDeviceBuffer());
        kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms.getDeviceBuffer());
454
        kernels.findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles);
455
456
    }
    forceRebuildNeighborList = true;
457
    context.setForcesValid(false);
458
    return true;
459
460
}

461
462
463
464
465
466
467
468
void OpenCLNonbondedUtilities::setUsePadding(bool padding) {
    usePadding = padding;
}

void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double endFraction) {
    int numAtomBlocks = context.getNumAtomBlocks();
    startBlockIndex = (int) (startFraction*numAtomBlocks);
    numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
469
    long long totalTiles = context.getNumAtomBlocks()*((long long)context.getNumAtomBlocks()+1)/2;
470
    startTileIndex = (int) (startFraction*totalTiles);;
471
    numTiles = (long long) (endFraction*totalTiles)-startTileIndex;
472
    if (useCutoff) {
473
        // We are using a cutoff, and the kernels have already been created.
474

475
        for (map<int, KernelSet>::iterator iter = groupKernels.begin(); iter != groupKernels.end(); ++iter) {
476
477
478
            KernelSet& kernels = iter->second;
            if (*reinterpret_cast<cl_kernel*>(&kernels.forceKernel) != NULL) {
                kernels.forceKernel.setArg<cl_uint>(5, startTileIndex);
479
                kernels.forceKernel.setArg<cl_ulong>(6, numTiles);
480
481
482
            }
            if (*reinterpret_cast<cl_kernel*>(&kernels.energyKernel) != NULL) {
                kernels.energyKernel.setArg<cl_uint>(5, startTileIndex);
483
                kernels.energyKernel.setArg<cl_ulong>(6, numTiles);
484
485
486
            }
            if (*reinterpret_cast<cl_kernel*>(&kernels.forceEnergyKernel) != NULL) {
                kernels.forceEnergyKernel.setArg<cl_uint>(5, startTileIndex);
487
                kernels.forceEnergyKernel.setArg<cl_ulong>(6, numTiles);
488
489
490
            }
            kernels.findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex);
            kernels.findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks);
491
492
        }
        forceRebuildNeighborList = true;
493
494
495
    }
}

496
497
498
499
500
501
502
503
504
505
506
507
void OpenCLNonbondedUtilities::createKernelsForGroups(int groups) {
    KernelSet kernels;
    double cutoff = 0.0;
    string source;
    for (int i = 0; i < 32; i++) {
        if ((groups&(1<<i)) != 0) {
            cutoff = max(cutoff, groupCutoff[i]);
            source += groupKernelSource[i];
        }
    }
    kernels.hasForces = (source.size() > 0);
    kernels.cutoffDistance = cutoff;
508
    kernels.source = source;
509
    if (useCutoff) {
510
        double paddedCutoff = padCutoff(cutoff);
511
512
513
        map<string, string> defines;
        defines["TILE_SIZE"] = context.intToString(OpenCLContext::TileSize);
        defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
514
        defines["PADDING"] = context.doubleToString(paddedCutoff-cutoff);
515
516
        defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
        defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
peastman's avatar
peastman committed
517
        defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());
518
519
520
521
        defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
        defines["SIMD_WIDTH"] = context.intToString(context.getSIMDWidth());
        if (usePeriodic)
            defines["USE_PERIODIC"] = "1";
522
523
        if (context.getBoxIsTriclinic())
            defines["TRICLINIC"] = "1";
524
525
        if (useLargeBlocks)
            defines["USE_LARGE_BLOCKS"] = "1";
526
527
        defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
        defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
528
529
530
531
532
        int binShift = 1;
        while (1<<binShift <= context.getNumAtomBlocks())
            binShift++;
        defines["BIN_SHIFT"] = context.intToString(binShift);
        defines["BLOCK_INDEX_MASK"] = context.intToString((1<<binShift)-1);
533
534
535
536
537
538
539
540
        string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
        int groupSize = (deviceIsCpu || context.getSIMDWidth() < 32 ? 32 : 256);
        while (true) {
            defines["GROUP_SIZE"] = context.intToString(groupSize);
            cl::Program interactingBlocksProgram = context.createProgram(file, defines);
            kernels.findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
            kernels.findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms());
            kernels.findBlockBoundsKernel.setArg<cl::Buffer>(6, context.getPosq().getDeviceBuffer());
peastman's avatar
peastman committed
541
542
543
            kernels.findBlockBoundsKernel.setArg<cl::Buffer>(7, blockCenter.getDeviceBuffer());
            kernels.findBlockBoundsKernel.setArg<cl::Buffer>(8, blockBoundingBox.getDeviceBuffer());
            kernels.findBlockBoundsKernel.setArg<cl::Buffer>(9, rebuildNeighborList.getDeviceBuffer());
544
545
546
547
548
549
            kernels.findBlockBoundsKernel.setArg<cl::Buffer>(10, blockSizeRange.getDeviceBuffer());
            kernels.computeSortKeysKernel = cl::Kernel(interactingBlocksProgram, "computeSortKeys");
            kernels.computeSortKeysKernel.setArg<cl::Buffer>(0, blockBoundingBox.getDeviceBuffer());
            kernels.computeSortKeysKernel.setArg<cl::Buffer>(1, sortedBlocks.getDeviceBuffer());
            kernels.computeSortKeysKernel.setArg<cl::Buffer>(2, blockSizeRange.getDeviceBuffer());
            kernels.computeSortKeysKernel.setArg<cl_int>(3, numBlockSizes);
550
            kernels.sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData");
peastman's avatar
peastman committed
551
552
553
554
555
            kernels.sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks.getDeviceBuffer());
            kernels.sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter.getDeviceBuffer());
            kernels.sortBoxDataKernel.setArg<cl::Buffer>(2, blockBoundingBox.getDeviceBuffer());
            kernels.sortBoxDataKernel.setArg<cl::Buffer>(3, sortedBlockCenter.getDeviceBuffer());
            kernels.sortBoxDataKernel.setArg<cl::Buffer>(4, sortedBlockBoundingBox.getDeviceBuffer());
556
            kernels.sortBoxDataKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
peastman's avatar
peastman committed
557
558
559
            kernels.sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions.getDeviceBuffer());
            kernels.sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount.getDeviceBuffer());
            kernels.sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList.getDeviceBuffer());
560
            kernels.sortBoxDataKernel.setArg<cl_int>(9, true);
561
562
563
564
            if (useLargeBlocks) {
                kernels.sortBoxDataKernel.setArg<cl::Buffer>(10, largeBlockCenter.getDeviceBuffer());
                kernels.sortBoxDataKernel.setArg<cl::Buffer>(11, largeBlockBoundingBox.getDeviceBuffer());
            }
565
            kernels.findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
peastman's avatar
peastman committed
566
567
568
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount.getDeviceBuffer());
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles.getDeviceBuffer());
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactingAtoms.getDeviceBuffer());
569
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(8, context.getPosq().getDeviceBuffer());
peastman's avatar
peastman committed
570
            kernels.findInteractingBlocksKernel.setArg<cl_uint>(9, interactingTiles.getSize());
571
572
            kernels.findInteractingBlocksKernel.setArg<cl_uint>(10, startBlockIndex);
            kernels.findInteractingBlocksKernel.setArg<cl_uint>(11, numBlocks);
peastman's avatar
peastman committed
573
574
575
576
577
578
579
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(12, sortedBlocks.getDeviceBuffer());
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(13, sortedBlockCenter.getDeviceBuffer());
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(14, sortedBlockBoundingBox.getDeviceBuffer());
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(15, exclusionIndices.getDeviceBuffer());
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(16, exclusionRowIndices.getDeviceBuffer());
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(17, oldPositions.getDeviceBuffer());
            kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(18, rebuildNeighborList.getDeviceBuffer());
580
581
582
583
            if (useLargeBlocks) {
                kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(19, largeBlockCenter.getDeviceBuffer());
                kernels.findInteractingBlocksKernel.setArg<cl::Buffer>(20, largeBlockBoundingBox.getDeviceBuffer());
            }
584
585
            if (kernels.findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) {
                // The device can't handle this block size, so reduce it.
586

587
588
589
590
591
592
593
594
595
596
597
598
                groupSize -= 32;
                if (groupSize < 32)
                    throw OpenMMException("Failed to create findInteractingBlocks kernel");
                continue;
            }
            break;
        }
        interactingBlocksThreadBlockSize = (deviceIsCpu ? 1 : groupSize);
    }
    groupKernels[groups] = kernels;
}

599
cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric, int groups, bool includeForces, bool includeEnergy) {
600
601
    map<string, string> replacements;
    replacements["COMPUTE_INTERACTION"] = source;
602
    const string suffixes[] = {"x", "y", "z", "w"};
603
    stringstream localData;
604
    int localDataSize = 0;
605
606
607
    for (const ParameterInfo& param : params) {
        if (param.getNumComponents() == 1)
            localData<<param.getType()<<" "<<param.getName()<<";\n";
608
        else {
609
610
            for (int j = 0; j < param.getNumComponents(); ++j)
                localData<<param.getComponentType()<<" "<<param.getName()<<"_"<<suffixes[j]<<";\n";
611
        }
612
        localDataSize += param.getSize();
613
614
    }
    replacements["ATOM_PARAMETER_DATA"] = localData.str();
615
    stringstream args;
616
617
618
619
620
621
622
623
    for (const ParameterInfo& param : params) {
        args << ", __global ";
        if (param.isConstant())
            args << "const ";
        if (param.getNumComponents() == 3)
            args << param.getComponentType();
        else
            args << param.getType();
624
        args << "* restrict global_";
625
        args << param.getName();
626
    }
627
628
    for (const ParameterInfo& arg : arguments) {
        if (arg.getMemory().getInfo<CL_MEM_TYPE>() == CL_MEM_OBJECT_IMAGE2D) {
629
            args << ", __read_only image2d_t ";
630
            args << arg.getName();
631
632
        }
        else {
633
634
635
636
637
            if ((arg.getMemory().getInfo<CL_MEM_FLAGS>() & CL_MEM_READ_ONLY) == 0) {
                args << ", __global ";
                if (arg.isConstant())
                    args << "const ";
            }
638
639
            else
                args << ", __constant ";
640
            args << arg.getType();
641
            args << "* restrict ";
642
            args << arg.getName();
643
        }
644
    }
645
    if (energyParameterDerivatives.size() > 0)
646
        args << ", __global mixed* restrict energyParamDerivs";
647
648
    replacements["PARAMETER_ARGUMENTS"] = args.str();
    stringstream loadLocal1;
649
650
651
    for (const ParameterInfo& param : params) {
        if (param.getNumComponents() == 1) {
            loadLocal1<<"localData[localAtomIndex]."<<param.getName()<<" = "<<param.getName()<<"1;\n";
652
653
        }
        else {
654
655
            for (int j = 0; j < param.getNumComponents(); ++j)
                loadLocal1<<"localData[localAtomIndex]."<<param.getName()<<"_"<<suffixes[j]<<" = "<<param.getName()<<"1."<<suffixes[j]<<";\n";
656
        }
657
658
    }
    replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
659
    replacements["DECLARE_LOCAL_PARAMETERS"] = "";
660
    stringstream loadLocal2;
661
662
663
    for (const ParameterInfo& param : params) {
        if (param.getNumComponents() == 1) {
            loadLocal2<<"localData[localAtomIndex]."<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
664
665
        }
        else {
666
667
668
669
670
671
            if (param.getNumComponents() == 3)
                loadLocal2<<param.getType()<<" temp_"<<param.getName()<<" = make_"<<param.getType()<<"(global_"<<param.getName()<<"[3*j], global_"<<param.getName()<<"[3*j+1], global_"<<param.getName()<<"[3*j+2]);\n";
            else
                loadLocal2<<param.getType()<<" temp_"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
            for (int j = 0; j < param.getNumComponents(); ++j)
                loadLocal2<<"localData[localAtomIndex]."<<param.getName()<<"_"<<suffixes[j]<<" = temp_"<<param.getName()<<"."<<suffixes[j]<<";\n";
672
        }
673
674
675
    }
    replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
    stringstream load1;
676
677
678
679
680
681
    for (const ParameterInfo& param : params) {
        load1<<param.getType()<<" "<<param.getName()<<"1 = ";
        if (param.getNumComponents() == 3)
            load1<<"make_"<<param.getType()<<"(global_"<<param.getName()<<"[3*atom1], global_"<<param.getName()<<"[3*atom1+1], global_"<<param.getName()<<"[3*atom1+2]);\n";
        else
            load1<<"global_"<<param.getName()<<"[atom1];\n";
682
683
684
    }
    replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
    stringstream load2j;
685
686
687
    for (const ParameterInfo& param : params) {
        if (param.getNumComponents() == 1) {
            load2j<<param.getType()<<" "<<param.getName()<<"2 = localData[atom2]."<<param.getName()<<";\n";
688
689
        }
        else {
690
691
            load2j<<param.getType()<<" "<<param.getName()<<"2 = ("<<param.getType()<<") (";
            for (int j = 0; j < param.getNumComponents(); ++j) {
692
693
                if (j > 0)
                    load2j<<", ";
694
                load2j<<"localData[atom2]."<<param.getName()<<"_"<<suffixes[j];
695
696
697
            }
            load2j<<");\n";
        }
698
    }
699
    replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
700
    stringstream clearLocal;
701
702
703
    for (const ParameterInfo& param : params) {
        if (param.getNumComponents() == 1)
            clearLocal<<"localData[localAtomIndex]."<<param.getName()<<" = 0;\n";
704
        else
705
706
            for (int j = 0; j < param.getNumComponents(); ++j)
                clearLocal<<"localData[localAtomIndex]."<<param.getName()<<"_"<<suffixes[j]<<" = 0;\n";
707
708
    }
    replacements["CLEAR_LOCAL_PARAMETERS"] = clearLocal.str();
709
710
711
712
713
714
715
716
717
718
    stringstream initDerivs;
    for (int i = 0; i < energyParameterDerivatives.size(); i++)
        initDerivs<<"mixed energyParamDeriv"<<i<<" = 0;\n";
    replacements["INIT_DERIVATIVES"] = initDerivs.str();
    stringstream saveDerivs;
    const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
    int numDerivs = allParamDerivNames.size();
    for (int i = 0; i < energyParameterDerivatives.size(); i++)
        for (int index = 0; index < numDerivs; index++)
            if (allParamDerivNames[index] == energyParameterDerivatives[i])
719
                saveDerivs<<"energyParamDerivs[GLOBAL_ID*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
720
    replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
721
722
723
724
725
726
727
    map<string, string> defines;
    if (useCutoff)
        defines["USE_CUTOFF"] = "1";
    if (usePeriodic)
        defines["USE_PERIODIC"] = "1";
    if (useExclusions)
        defines["USE_EXCLUSIONS"] = "1";
728
729
    if (isSymmetric)
        defines["USE_SYMMETRIC"] = "1";
730
731
    if (useNeighborList)
        defines["USE_NEIGHBOR_LIST"] = "1";
732
733
    if (useCutoff && context.getSIMDWidth() < 32)
        defines["PRUNE_BY_CUTOFF"] = "1";
734
735
736
737
    if (includeForces)
        defines["INCLUDE_FORCES"] = "1";
    if (includeEnergy)
        defines["INCLUDE_ENERGY"] = "1";
738
    defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
739
    defines["FORCE_WORK_GROUP_SIZE"] = context.intToString(forceThreadBlockSize);
740
741
742
743
744
745
746
747
748
749
    double maxCutoff = 0.0;
    for (int i = 0; i < 32; i++) {
        if ((groups&(1<<i)) != 0) {
            double cutoff = groupCutoff[i];
            maxCutoff = max(maxCutoff, cutoff);
            defines["CUTOFF_"+context.intToString(i)+"_SQUARED"] = context.doubleToString(cutoff*cutoff);
            defines["CUTOFF_"+context.intToString(i)] = context.doubleToString(cutoff);
        }
    }
    defines["MAX_CUTOFF"] = context.doubleToString(maxCutoff);
750
751
752
    defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
753
    defines["TILE_SIZE"] = context.intToString(OpenCLContext::TileSize);
peastman's avatar
peastman committed
754
    int numExclusionTiles = exclusionTiles.getSize();
755
756
757
758
759
760
    defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
    int numContexts = context.getPlatformData().contexts.size();
    int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;
    int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts;
    defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex);
    defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
761
762
    if ((localDataSize/4)%2 == 0)
        defines["PARAMETER_SIZE_IS_EVEN"] = "1";
763
    cl::Program program = context.createProgram(context.replaceStrings(kernelSource, replacements), defines);
764
765
766
    cl::Kernel kernel(program, "computeNonbonded");

    // Set arguments to the Kernel.
767

768
    int index = 0;
769
    kernel.setArg<cl::Memory>(index++, context.getLongForceBuffer().getDeviceBuffer());
770
771
    kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
    kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
peastman's avatar
peastman committed
772
773
    kernel.setArg<cl::Buffer>(index++, exclusions.getDeviceBuffer());
    kernel.setArg<cl::Buffer>(index++, exclusionTiles.getDeviceBuffer());
774
    kernel.setArg<cl_uint>(index++, startTileIndex);
775
    kernel.setArg<cl_ulong>(index++, numTiles);
776
    if (useCutoff) {
peastman's avatar
peastman committed
777
778
        kernel.setArg<cl::Buffer>(index++, interactingTiles.getDeviceBuffer());
        kernel.setArg<cl::Buffer>(index++, interactionCount.getDeviceBuffer());
779
        index += 5; // The periodic box size arguments are set when the kernel is executed.
peastman's avatar
peastman committed
780
781
782
783
        kernel.setArg<cl_uint>(index++, interactingTiles.getSize());
        kernel.setArg<cl::Buffer>(index++, blockCenter.getDeviceBuffer());
        kernel.setArg<cl::Buffer>(index++, blockBoundingBox.getDeviceBuffer());
        kernel.setArg<cl::Buffer>(index++, interactingAtoms.getDeviceBuffer());
784
    }
785
786
787
788
    for (const ParameterInfo& param : params)
        kernel.setArg<cl::Memory>(index++, param.getMemory());
    for (const ParameterInfo& arg : arguments)
        kernel.setArg<cl::Memory>(index++, arg.getMemory());
789
790
    if (energyParameterDerivatives.size() > 0)
        kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
791
    return kernel;
792
}
793
794
795
796

void OpenCLNonbondedUtilities::setKernelSource(const string& source) {
    kernelSource = source;
}