OpenCLRpmdKernels.cpp 28.9 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2011-2020 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

#include "OpenCLRpmdKernels.h"
#include "OpenCLRpmdKernelSources.h"
#include "openmm/internal/ContextImpl.h"
#include "OpenCLIntegrationUtilities.h"
#include "OpenCLExpressionUtilities.h"
#include "OpenCLFFT3D.h"
Peter Eastman's avatar
Peter Eastman committed
38
#include "OpenCLNonbondedUtilities.h"
39
#include "SimTKOpenMMRealType.h"
40
41
42
43
44
45
46
47
48
49
50
51

using namespace OpenMM;
using namespace std;

void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
    cl.getPlatformData().initializeContexts(system);
    numCopies = integrator.getNumCopies();
    numParticles = system.getNumParticles();
    workgroupSize = numCopies;
    if (numCopies != OpenCLFFT3D::findLegalDimension(numCopies))
        throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
    int paddedParticles = cl.getPaddedNumAtoms();
52
    int forceElementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
peastman's avatar
peastman committed
53
    forces.initialize(cl, numCopies*paddedParticles, forceElementSize, "rpmdForces");
54
55
    bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
    int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4));
peastman's avatar
peastman committed
56
57
    positions.initialize(cl, numCopies*paddedParticles, elementSize, "rpmdPositions");
    velocities.initialize(cl, numCopies*paddedParticles, elementSize, "rpmdVelocities");
58
59
60
61
    cl.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
    
    // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
    
62
    if (useDoublePrecision) {
peastman's avatar
peastman committed
63
64
        vector<mm_double4> temp(positions.getSize());
        for (int i = 0; i < positions.getSize(); i++)
65
            temp[i] = mm_double4(0, 0, 0, 0);
peastman's avatar
peastman committed
66
67
        positions.upload(temp);
        for (int i = 0; i < velocities.getSize(); i++)
68
            temp[i] = mm_double4(0, 0, 0, 1);
peastman's avatar
peastman committed
69
        velocities.upload(temp);
70
71
    }
    else {
peastman's avatar
peastman committed
72
73
        vector<mm_float4> temp(positions.getSize());
        for (int i = 0; i < positions.getSize(); i++)
74
            temp[i] = mm_float4(0, 0, 0, 0);
peastman's avatar
peastman committed
75
76
        positions.upload(temp);
        for (int i = 0; i < velocities.getSize(); i++)
77
            temp[i] = mm_float4(0, 0, 0, 1);
peastman's avatar
peastman committed
78
        velocities.upload(temp);
79
    }
80
81
82
83
84
85
    
    // Build a list of contractions.
    
    groupsNotContracted = -1;
    const map<int, int>& contractions = integrator.getContractions();
    int maxContractedCopies = 0;
peastman's avatar
peastman committed
86
87
88
    for (auto& c : contractions) {
        int group = c.first;
        int copies = c.second;
89
90
91
92
        if (group < 0 || group > 31)
            throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31");
        if (copies < 0 || copies > numCopies)
            throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated");
93
94
        if (copies != OpenCLFFT3D::findLegalDimension(copies))
            throw OpenMMException("RPMDIntegrator: Number of copies for contraction must be a multiple of powers of 2, 3, and 5.");
95
96
97
98
99
100
101
102
        if (copies != numCopies) {
            if (groupsByCopies.find(copies) == groupsByCopies.end()) {
                groupsByCopies[copies] = 1<<group;
                if (copies > maxContractedCopies)
                    maxContractedCopies = copies;
            }
            else
                groupsByCopies[copies] |= 1<<group;
peastman's avatar
peastman committed
103
            groupsNotContracted -= 1<<group;
104
105
        }
    }
106
    groupsNotContracted &= integrator.getIntegrationForceGroups();
107
    if (maxContractedCopies > 0) {
peastman's avatar
peastman committed
108
109
        contractedForces.initialize(cl, maxContractedCopies*paddedParticles, forceElementSize, "rpmdContractedForces");
        contractedPositions.initialize(cl, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
110
    }
111
112
113
114

    // Create kernels.
    
    map<string, string> defines;
115
116
117
    defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
    defines["NUM_COPIES"] = cl.intToString(numCopies);
118
    defines["THREAD_BLOCK_SIZE"] = cl.intToString(workgroupSize);
119
120
121
    defines["HBAR"] = cl.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12));
    defines["SCALE"] = cl.doubleToString(1.0/sqrt((double) numCopies));
    defines["M_PI"] = cl.doubleToString(M_PI);
122
123
124
125
126
127
128
129
130
    map<string, string> replacements;
    replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
    replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false);
    replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true);
    replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false);
    cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLRpmdKernelSources::rpmd, replacements), defines, "");
    pileKernel = cl::Kernel(program, "applyPileThermostat");
    stepKernel = cl::Kernel(program, "integrateStep");
    velocitiesKernel = cl::Kernel(program, "advanceVelocities");
131
132
    copyToContextKernel = cl::Kernel(program, "copyDataToContext");
    copyFromContextKernel = cl::Kernel(program, "copyDataFromContext");
Peter Eastman's avatar
Peter Eastman committed
133
    translateKernel = cl::Kernel(program, "applyCellTranslations");
134
135
136
    
    // Create kernels for doing contractions.
    
peastman's avatar
peastman committed
137
138
    for (auto& g : groupsByCopies) {
        int copies = g.first;
139
140
141
142
143
144
145
146
147
148
149
150
        replacements.clear();
        replacements["NUM_CONTRACTED_COPIES"] = cl.intToString(copies);
        replacements["POS_SCALE"] = cl.doubleToString(1.0/numCopies);
        replacements["FORCE_SCALE"] = cl.doubleToString(1.0/copies);
        replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
        replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
        replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
        replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
        program = cl.createProgram(cl.replaceStrings(OpenCLRpmdKernelSources::rpmdContraction, replacements), defines, "");
        positionContractionKernels[copies] = cl::Kernel(program, "contractPositions");
        forceContractionKernels[copies] = cl::Kernel(program, "contractForces");
    }
151
152
}

153
154
void OpenCLIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) {
    hasInitializedKernel = true;
peastman's avatar
peastman committed
155
156
157
158
159
160
161
    pileKernel.setArg<cl::Buffer>(0, velocities.getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(0, positions.getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(1, velocities.getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(2, forces.getDeviceBuffer());
    velocitiesKernel.setArg<cl::Buffer>(0, velocities.getDeviceBuffer());
    velocitiesKernel.setArg<cl::Buffer>(1, forces.getDeviceBuffer());
    translateKernel.setArg<cl::Buffer>(0, positions.getDeviceBuffer());
162
163
    translateKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
    translateKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
peastman's avatar
peastman committed
164
    copyToContextKernel.setArg<cl::Buffer>(0, velocities.getDeviceBuffer());
165
166
167
168
169
    copyToContextKernel.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(3, cl.getPosq().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(4, cl.getAtomIndexArray().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
peastman's avatar
peastman committed
170
    copyFromContextKernel.setArg<cl::Buffer>(3, velocities.getDeviceBuffer());
171
172
    copyFromContextKernel.setArg<cl::Buffer>(4, cl.getPosq().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(6, cl.getAtomIndexArray().getDeviceBuffer());
peastman's avatar
peastman committed
173
174
    for (auto& g : groupsByCopies) {
        int copies = g.first;
peastman's avatar
peastman committed
175
176
177
178
        positionContractionKernels[copies].setArg<cl::Buffer>(0, positions.getDeviceBuffer());
        positionContractionKernels[copies].setArg<cl::Buffer>(1, contractedPositions.getDeviceBuffer());
        forceContractionKernels[copies].setArg<cl::Buffer>(0, forces.getDeviceBuffer());
        forceContractionKernels[copies].setArg<cl::Buffer>(1, contractedForces.getDeviceBuffer());
179
    }
180
181
}

182
183
void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
184
185
    if (!hasInitializedKernel)
        initializeKernels(context);
186
187
188
    
    // Loop over copies and compute the force on each one.
    
Peter Eastman's avatar
Peter Eastman committed
189
190
    if (!forcesAreValid)
        computeForces(context);
191
192
193
    
    // Apply the PILE-L thermostat.
    
194
    bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
195
    const double dt = integrator.getStepSize();
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
    pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
    pileKernel.setArg<cl::Buffer>(1, integration.getRandom().getDeviceBuffer()); // Do this *after* prepareRandomNumbers(), which might rebuild the array.
    if (useDoublePrecision) {
        pileKernel.setArg<cl_double>(3, dt);
        pileKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
        pileKernel.setArg<cl_double>(5, integrator.getFriction());
        stepKernel.setArg<cl_double>(3, dt);
        stepKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
        velocitiesKernel.setArg<cl_double>(2, dt);
    }
    else {
        pileKernel.setArg<cl_float>(3, (cl_float) dt);
        pileKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
        pileKernel.setArg<cl_float>(5, (cl_float) integrator.getFriction());
        stepKernel.setArg<cl_float>(3, (cl_float) dt);
        stepKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
        velocitiesKernel.setArg<cl_float>(2, (cl_float) dt);
    }
214
215
    if (integrator.getApplyThermostat())
        cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
216
217
218
219
220
221
222

    // Update positions and velocities.
    
    cl.executeKernel(stepKernel, numParticles*numCopies, workgroupSize);

    // Calculate forces based on the updated positions.
    
Peter Eastman's avatar
Peter Eastman committed
223
    computeForces(context);
224
225
226
227
228
229
    
    // Update velocities.
    cl.executeKernel(velocitiesKernel, numParticles*numCopies, workgroupSize);

    // Apply the PILE-L thermostat again.

230
231
232
233
    if (integrator.getApplyThermostat()) {
        pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
        cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
    }
234
235
236
237
238

    // Update the time and step count.

    cl.setTime(cl.getTime()+dt);
    cl.setStepCount(cl.getStepCount()+1);
239
240
241
242
243
244
245
246
    cl.reorderAtoms();
    if (cl.getAtomsWereReordered() && cl.getNonbondedUtilities().getUsePeriodic()) {
        // Atoms may have been translated into a different periodic box, so apply
        // the same translation to all the beads.

        translateKernel.setArg<cl_int>(3, numCopies-1);
        cl.executeKernel(translateKernel, cl.getNumAtoms());
    }
247
248
}

Peter Eastman's avatar
Peter Eastman committed
249
void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
250
251
    // Compute forces from all groups that didn't have a specified contraction.

peastman's avatar
peastman committed
252
253
254
    copyToContextKernel.setArg<cl::Buffer>(2, positions.getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(1, forces.getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(5, positions.getDeviceBuffer());
Peter Eastman's avatar
Peter Eastman committed
255
    for (int i = 0; i < numCopies; i++) {
256
257
        copyToContextKernel.setArg<cl_int>(5, i);
        cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
Peter Eastman's avatar
Peter Eastman committed
258
        context.computeVirtualSites();
259
260
        Vec3 initialBox[3];
        context.getPeriodicBoxVectors(initialBox[0], initialBox[1], initialBox[2]);
261
        context.updateContextState();
262
263
        Vec3 finalBox[3];
        context.getPeriodicBoxVectors(finalBox[0], finalBox[1], finalBox[2]);
264
265
        if (initialBox[0] != finalBox[0] || initialBox[1] != finalBox[1] || initialBox[2] != finalBox[2])
            throw OpenMMException("Standard barostats cannot be used with RPMDIntegrator.  Use RPMDMonteCarloBarostat instead.");
266
        context.calcForcesAndEnergy(true, false, groupsNotContracted);
267
268
        copyFromContextKernel.setArg<cl_int>(7, i);
        cl.executeKernel(copyFromContextKernel, cl.getNumAtoms());
Peter Eastman's avatar
Peter Eastman committed
269
    }
270
271
272
273
    
    // Now loop over contractions and compute forces from them.
    
    if (groupsByCopies.size() > 0) {
peastman's avatar
peastman committed
274
275
276
        copyToContextKernel.setArg<cl::Buffer>(2, contractedPositions.getDeviceBuffer());
        copyFromContextKernel.setArg<cl::Buffer>(1, contractedForces.getDeviceBuffer());
        copyFromContextKernel.setArg<cl::Buffer>(5, contractedPositions.getDeviceBuffer());
peastman's avatar
peastman committed
277
278
279
        for (auto& g : groupsByCopies) {
            int copies = g.first;
            int groupFlags = g.second;
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300

            // Find the contracted positions.

            cl.executeKernel(positionContractionKernels[copies], numParticles*numCopies, workgroupSize);

            // Compute forces.

            for (int i = 0; i < copies; i++) {
                copyToContextKernel.setArg<cl_int>(5, i);
                cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
                context.computeVirtualSites();
                context.calcForcesAndEnergy(true, false, groupFlags);
                copyFromContextKernel.setArg<cl_int>(7, i);
                cl.executeKernel(copyFromContextKernel, cl.getNumAtoms());
            }

            // Apply the forces to the original copies.

            cl.executeKernel(forceContractionKernels[copies], numParticles*numCopies, workgroupSize);
        }
    }
301
302
303
    if (groupsByCopies.size() > 0) {
        // Ensure the Context contains the positions from the last copy, since we'll assume that later.
        
peastman's avatar
peastman committed
304
        copyToContextKernel.setArg<cl::Buffer>(2, positions.getDeviceBuffer());
305
306
307
        copyToContextKernel.setArg<cl_int>(5, numCopies-1);
        cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
    }
Peter Eastman's avatar
Peter Eastman committed
308
309
}

310
311
312
313
double OpenCLIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
    return cl.getIntegrationUtilities().computeKineticEnergy(0);
}

314
void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) {
peastman's avatar
peastman committed
315
    if (!positions.isInitialized())
316
317
318
        throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
    if (pos.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
319
320
321
322
323
324
325
326
327
328
329
330
331

    // Adjust the positions based on the current cell offsets.
    
    const vector<int>& order = cl.getAtomIndex();
    mm_double4 periodicBoxSize = cl.getPeriodicBoxSizeDouble();
    vector<Vec3> offsetPos(numParticles);
    for (int i = 0; i < numParticles; ++i) {
        mm_int4 offset = cl.getPosCellOffsets()[i];
        offsetPos[order[i]] = pos[order[i]] + Vec3(offset.x*periodicBoxSize.x, offset.y*periodicBoxSize.y, offset.z*periodicBoxSize.z);
    }

    // Record the positions.

332
333
334
335
    if (cl.getUseDoublePrecision()) {
        vector<mm_double4> posq(cl.getPaddedNumAtoms());
        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; i++)
336
            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posq[i].w);
peastman's avatar
peastman committed
337
        cl.getQueue().enqueueWriteBuffer(positions.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
338
339
340
341
342
343
    }
    else if (cl.getUseMixedPrecision()) {
        vector<mm_float4> posqf(cl.getPaddedNumAtoms());
        cl.getPosq().download(posqf);
        vector<mm_double4> posq(cl.getPaddedNumAtoms());
        for (int i = 0; i < numParticles; i++)
344
            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posqf[i].w);
peastman's avatar
peastman committed
345
        cl.getQueue().enqueueWriteBuffer(positions.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
346
347
348
349
350
    }
    else {
        vector<mm_float4> posq(cl.getPaddedNumAtoms());
        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; i++)
351
            posq[i] = mm_float4((cl_float) offsetPos[i][0], (cl_float) offsetPos[i][1], (cl_float) offsetPos[i][2], posq[i].w);
peastman's avatar
peastman committed
352
        cl.getQueue().enqueueWriteBuffer(positions.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &posq[0]);
353
    }
354
355
356
}

void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
peastman's avatar
peastman committed
357
    if (!velocities.isInitialized())
358
359
360
        throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
    if (vel.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
361
362
363
364
365
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
        vector<mm_double4> velm(cl.getPaddedNumAtoms());
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; i++)
            velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
peastman's avatar
peastman committed
366
        cl.getQueue().enqueueWriteBuffer(velocities.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &velm[0]);
367
368
369
370
371
372
    }
    else {
        vector<mm_float4> velm(cl.getPaddedNumAtoms());
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; i++)
            velm[i] = mm_float4((cl_float) vel[i][0], (cl_float) vel[i][1], (cl_float) vel[i][2], velm[i].w);
peastman's avatar
peastman committed
373
        cl.getQueue().enqueueWriteBuffer(velocities.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &velm[0]);
374
    }
375
376
}

377
void OpenCLIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
378
379
    if (!hasInitializedKernel)
        initializeKernels(context);
peastman's avatar
peastman committed
380
    copyToContextKernel.setArg<cl::Buffer>(2, positions.getDeviceBuffer());
381
382
    copyToContextKernel.setArg<cl_int>(5, copy);
    cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
383
384
385
386
387
388
389
390
}

string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
    stringstream source;
    int stage = 0;
    int L = size;
    int m = 1;
    string sign = (forward ? "1.0f" : "-1.0f");
Peter Eastman's avatar
Peter Eastman committed
391
392
    string multReal = (forward ? "multiplyComplexRealPart" : "multiplyComplexRealPartConj");
    string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");
393
394

    source<<"{\n";
395
396
397
398
    source<<"__local mixed4* real0 = "<<variable<<"real;\n";
    source<<"__local mixed4* imag0 = "<<variable<<"imag;\n";
    source<<"__local mixed4* real1 = &temp[blockStart];\n";
    source<<"__local mixed4* imag1 = &temp[blockStart+get_local_size(0)];\n";
399
400
401

    // Factor size, generating an appropriate block of code for each factor.

peastman's avatar
peastman committed
402
    while (L > 1) {
403
404
        int input = stage%2;
        int output = 1-input;
peastman's avatar
peastman committed
405
406
407
408
409
410
411
412
413
414
415
        int radix;
        if (L%5 == 0)
            radix = 5;
        else if (L%4 == 0)
            radix = 4;
        else if (L%3 == 0)
            radix = 3;
        else if (L%2 == 0)
            radix = 2;
        else
            throw OpenMMException("Illegal size for FFT: "+cl.intToString(size));
416
        source<<"{\n";
peastman's avatar
peastman committed
417
418
419
420
421
422
        L = L/radix;
        source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
        source<<"if (indexInBlock < "<<(L*m)<<") {\n";
        source<<"int i = indexInBlock;\n";
        source<<"int j = i/"<<m<<";\n";
        if (radix == 5) {
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n";
            source<<"mixed4 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n";
            source<<"mixed4 d0r = c1r+c4r;\n";
            source<<"mixed4 d0i = c1i+c4i;\n";
            source<<"mixed4 d1r = c2r+c3r;\n";
            source<<"mixed4 d1i = c2i+c3i;\n";
            source<<"mixed4 d2r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
            source<<"mixed4 d2i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
            source<<"mixed4 d3r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
            source<<"mixed4 d3i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
            source<<"mixed4 d4r = d0r+d1r;\n";
            source<<"mixed4 d4i = d0i+d1i;\n";
            source<<"mixed4 d5r = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
            source<<"mixed4 d5i = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
            source<<"mixed4 d6r = c0r-0.25f*d4r;\n";
            source<<"mixed4 d6i = c0i-0.25f*d4i;\n";
            source<<"mixed4 d7r = d6r+d5r;\n";
            source<<"mixed4 d7i = d6i+d5i;\n";
            source<<"mixed4 d8r = d6r-d5r;\n";
            source<<"mixed4 d8i = d6i-d5i;\n";
451
            string coeff = cl.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
452
453
454
455
            source<<"mixed4 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
            source<<"mixed4 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
            source<<"mixed4 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
            source<<"mixed4 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
456
457
            source<<"real"<<output<<"[i+4*j*"<<m<<"] = c0r+d4r;\n";
            source<<"imag"<<output<<"[i+4*j*"<<m<<"] = c0i+d4i;\n";
Peter Eastman's avatar
Peter Eastman committed
458
459
460
461
462
463
464
465
            source<<"real"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
            source<<"imag"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
            source<<"real"<<output<<"[i+(4*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(5*L)<<"], d8r+d10r, d8i+d10i);\n";
            source<<"imag"<<output<<"[i+(4*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(5*L)<<"], d8r+d10r, d8i+d10i);\n";
            source<<"real"<<output<<"[i+(4*j+3)*"<<m<<"] = "<<multReal<<"(w[j*"<<(3*size)<<"/"<<(5*L)<<"], d8r-d10r, d8i-d10i);\n";
            source<<"imag"<<output<<"[i+(4*j+3)*"<<m<<"] = "<<multImag<<"(w[j*"<<(3*size)<<"/"<<(5*L)<<"], d8r-d10r, d8i-d10i);\n";
            source<<"real"<<output<<"[i+(4*j+4)*"<<m<<"] = "<<multReal<<"(w[j*"<<(4*size)<<"/"<<(5*L)<<"], d7r-d9r, d7i-d9i);\n";
            source<<"imag"<<output<<"[i+(4*j+4)*"<<m<<"] = "<<multImag<<"(w[j*"<<(4*size)<<"/"<<(5*L)<<"], d7r-d9r, d7i-d9i);\n";
466
        }
peastman's avatar
peastman committed
467
        else if (radix == 4) {
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 d0r = c0r+c2r;\n";
            source<<"mixed4 d0i = c0i+c2i;\n";
            source<<"mixed4 d1r = c0r-c2r;\n";
            source<<"mixed4 d1i = c0i-c2i;\n";
            source<<"mixed4 d2r = c1r+c3r;\n";
            source<<"mixed4 d2i = c1i+c3i;\n";
            source<<"mixed4 d3r = "<<sign<<"*(c1i-c3i);\n";
            source<<"mixed4 d3i = "<<sign<<"*(c3r-c1r);\n";
484
485
            source<<"real"<<output<<"[i+3*j*"<<m<<"] = d0r+d2r;\n";
            source<<"imag"<<output<<"[i+3*j*"<<m<<"] = d0i+d2i;\n";
Peter Eastman's avatar
Peter Eastman committed
486
487
488
489
490
491
            source<<"real"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
            source<<"imag"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
            source<<"real"<<output<<"[i+(3*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(4*L)<<"], d0r-d2r, d0i-d2i);\n";
            source<<"imag"<<output<<"[i+(3*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(4*L)<<"], d0r-d2r, d0i-d2i);\n";
            source<<"real"<<output<<"[i+(3*j+3)*"<<m<<"] = "<<multReal<<"(w[j*"<<(3*size)<<"/"<<(4*L)<<"], d1r-d3r, d1i-d3i);\n";
            source<<"imag"<<output<<"[i+(3*j+3)*"<<m<<"] = "<<multImag<<"(w[j*"<<(3*size)<<"/"<<(4*L)<<"], d1r-d3r, d1i-d3i);\n";
492
        }
peastman's avatar
peastman committed
493
        else if (radix == 3) {
494
495
496
497
498
499
500
501
502
503
504
505
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 d0r = c1r+c2r;\n";
            source<<"mixed4 d0i = c1i+c2i;\n";
            source<<"mixed4 d1r = c0r-0.5f*d0r;\n";
            source<<"mixed4 d1i = c0i-0.5f*d0i;\n";
            source<<"mixed4 d2r = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
            source<<"mixed4 d2i = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
506
507
            source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n";
            source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n";
Peter Eastman's avatar
Peter Eastman committed
508
509
510
511
            source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
            source<<"imag"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
            source<<"real"<<output<<"[i+(2*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(3*L)<<"], d1r-d2r, d1i-d2i);\n";
            source<<"imag"<<output<<"[i+(2*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(3*L)<<"], d1r-d2r, d1i-d2i);\n";
512
        }
peastman's avatar
peastman committed
513
        else if (radix == 2) {
514
515
516
517
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
518
519
            source<<"real"<<output<<"[i+j*"<<m<<"] = c0r+c1r;\n";
            source<<"imag"<<output<<"[i+j*"<<m<<"] = c0i+c1i;\n";
Peter Eastman's avatar
Peter Eastman committed
520
521
            source<<"real"<<output<<"[i+(j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
            source<<"imag"<<output<<"[i+(j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
522
        }
peastman's avatar
peastman committed
523
524
        source<<"}\n";
        m = m*radix;
525
526
527
528
529
530
531
532
        source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
        source<<"}\n";
        ++stage;
    }

    // Create the kernel.

    if (stage%2 == 1) {
Peter Eastman's avatar
Peter Eastman committed
533
534
        source<<"real0[indexInBlock] = real1[indexInBlock];\n";
        source<<"imag0[indexInBlock] = imag1[indexInBlock];\n";
Peter Eastman's avatar
Bug fix  
Peter Eastman committed
535
        source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
536
537
538
539
    }
    source<<"}\n";
    return source.str();
}