OpenCLRpmdKernels.cpp 29.7 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

#include "OpenCLRpmdKernels.h"
#include "OpenCLRpmdKernelSources.h"
#include "openmm/internal/ContextImpl.h"
#include "OpenCLIntegrationUtilities.h"
#include "OpenCLExpressionUtilities.h"
#include "OpenCLFFT3D.h"
Peter Eastman's avatar
Peter Eastman committed
38
#include "OpenCLNonbondedUtilities.h"
39
#include "SimTKOpenMMRealType.h"
40
41
42
43
44
45
46
47
48
49
50

using namespace OpenMM;
using namespace std;

OpenCLIntegrateRPMDStepKernel::~OpenCLIntegrateRPMDStepKernel() {
    if (forces != NULL)
        delete forces;
    if (positions != NULL)
        delete positions;
    if (velocities != NULL)
        delete velocities;
51
52
53
54
    if (contractedForces != NULL)
        delete contractedForces;
    if (contractedPositions != NULL)
        delete contractedPositions;
55
}
56

57
58
59
60
61
62
63
64
void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
    cl.getPlatformData().initializeContexts(system);
    numCopies = integrator.getNumCopies();
    numParticles = system.getNumParticles();
    workgroupSize = numCopies;
    if (numCopies != OpenCLFFT3D::findLegalDimension(numCopies))
        throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
    int paddedParticles = cl.getPaddedNumAtoms();
65
66
67
68
69
70
    int forceElementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
    forces = new OpenCLArray(cl, numCopies*paddedParticles, forceElementSize, "rpmdForces");
    bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
    int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4));
    positions = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdPositions");
    velocities = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdVelocities");
71
72
73
74
    cl.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
    
    // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
    
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
    if (useDoublePrecision) {
        vector<mm_double4> temp(positions->getSize());
        for (int i = 0; i < positions->getSize(); i++)
            temp[i] = mm_double4(0, 0, 0, 0);
        positions->upload(temp);
        for (int i = 0; i < velocities->getSize(); i++)
            temp[i] = mm_double4(0, 0, 0, 1);
        velocities->upload(temp);
    }
    else {
        vector<mm_float4> temp(positions->getSize());
        for (int i = 0; i < positions->getSize(); i++)
            temp[i] = mm_float4(0, 0, 0, 0);
        positions->upload(temp);
        for (int i = 0; i < velocities->getSize(); i++)
            temp[i] = mm_float4(0, 0, 0, 1);
        velocities->upload(temp);
    }
93
94
95
96
97
98
99
100
101
102
103
104
105
    
    // Build a list of contractions.
    
    groupsNotContracted = -1;
    const map<int, int>& contractions = integrator.getContractions();
    int maxContractedCopies = 0;
    for (map<int, int>::const_iterator iter = contractions.begin(); iter != contractions.end(); ++iter) {
        int group = iter->first;
        int copies = iter->second;
        if (group < 0 || group > 31)
            throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31");
        if (copies < 0 || copies > numCopies)
            throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated");
106
107
        if (copies != OpenCLFFT3D::findLegalDimension(copies))
            throw OpenMMException("RPMDIntegrator: Number of copies for contraction must be a multiple of powers of 2, 3, and 5.");
108
109
110
111
112
113
114
115
        if (copies != numCopies) {
            if (groupsByCopies.find(copies) == groupsByCopies.end()) {
                groupsByCopies[copies] = 1<<group;
                if (copies > maxContractedCopies)
                    maxContractedCopies = copies;
            }
            else
                groupsByCopies[copies] |= 1<<group;
peastman's avatar
peastman committed
116
            groupsNotContracted -= 1<<group;
117
118
119
120
121
122
        }
    }
    if (maxContractedCopies > 0) {
        contractedForces = new OpenCLArray(cl, maxContractedCopies*paddedParticles, forceElementSize, "rpmdContractedForces");
        contractedPositions = new OpenCLArray(cl, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
    }
123
124
125
126

    // Create kernels.
    
    map<string, string> defines;
127
128
129
    defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
    defines["NUM_COPIES"] = cl.intToString(numCopies);
130
    defines["THREAD_BLOCK_SIZE"] = cl.intToString(workgroupSize);
131
132
133
    defines["HBAR"] = cl.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12));
    defines["SCALE"] = cl.doubleToString(1.0/sqrt((double) numCopies));
    defines["M_PI"] = cl.doubleToString(M_PI);
134
135
136
137
138
139
140
141
142
    map<string, string> replacements;
    replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
    replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false);
    replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true);
    replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false);
    cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLRpmdKernelSources::rpmd, replacements), defines, "");
    pileKernel = cl::Kernel(program, "applyPileThermostat");
    stepKernel = cl::Kernel(program, "integrateStep");
    velocitiesKernel = cl::Kernel(program, "advanceVelocities");
143
144
    copyToContextKernel = cl::Kernel(program, "copyDataToContext");
    copyFromContextKernel = cl::Kernel(program, "copyDataFromContext");
Peter Eastman's avatar
Peter Eastman committed
145
    translateKernel = cl::Kernel(program, "applyCellTranslations");
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
    
    // Create kernels for doing contractions.
    
    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
        int copies = iter->first;
        replacements.clear();
        replacements["NUM_CONTRACTED_COPIES"] = cl.intToString(copies);
        replacements["POS_SCALE"] = cl.doubleToString(1.0/numCopies);
        replacements["FORCE_SCALE"] = cl.doubleToString(1.0/copies);
        replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
        replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
        replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
        replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
        program = cl.createProgram(cl.replaceStrings(OpenCLRpmdKernelSources::rpmdContraction, replacements), defines, "");
        positionContractionKernels[copies] = cl::Kernel(program, "contractPositions");
        forceContractionKernels[copies] = cl::Kernel(program, "contractForces");
    }
163
164
}

165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
void OpenCLIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) {
    hasInitializedKernel = true;
    pileKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(1, velocities->getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(2, forces->getDeviceBuffer());
    velocitiesKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
    velocitiesKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
    translateKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
    translateKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
    translateKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(3, cl.getPosq().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(4, cl.getAtomIndexArray().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(3, velocities->getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(4, cl.getPosq().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(6, cl.getAtomIndexArray().getDeviceBuffer());
185
186
187
188
189
190
191
    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
        int copies = iter->first;
        positionContractionKernels[copies].setArg<cl::Buffer>(0, positions->getDeviceBuffer());
        positionContractionKernels[copies].setArg<cl::Buffer>(1, contractedPositions->getDeviceBuffer());
        forceContractionKernels[copies].setArg<cl::Buffer>(0, forces->getDeviceBuffer());
        forceContractionKernels[copies].setArg<cl::Buffer>(1, contractedForces->getDeviceBuffer());
    }
192
193
}

194
195
void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
196
197
    if (!hasInitializedKernel)
        initializeKernels(context);
198
199
200
    
    // Loop over copies and compute the force on each one.
    
Peter Eastman's avatar
Peter Eastman committed
201
202
    if (!forcesAreValid)
        computeForces(context);
203
204
205
    
    // Apply the PILE-L thermostat.
    
206
    bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
207
    const double dt = integrator.getStepSize();
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
    pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
    pileKernel.setArg<cl::Buffer>(1, integration.getRandom().getDeviceBuffer()); // Do this *after* prepareRandomNumbers(), which might rebuild the array.
    if (useDoublePrecision) {
        pileKernel.setArg<cl_double>(3, dt);
        pileKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
        pileKernel.setArg<cl_double>(5, integrator.getFriction());
        stepKernel.setArg<cl_double>(3, dt);
        stepKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
        velocitiesKernel.setArg<cl_double>(2, dt);
    }
    else {
        pileKernel.setArg<cl_float>(3, (cl_float) dt);
        pileKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
        pileKernel.setArg<cl_float>(5, (cl_float) integrator.getFriction());
        stepKernel.setArg<cl_float>(3, (cl_float) dt);
        stepKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
        velocitiesKernel.setArg<cl_float>(2, (cl_float) dt);
    }
226
227
    if (integrator.getApplyThermostat())
        cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
228
229
230
231
232
233
234

    // Update positions and velocities.
    
    cl.executeKernel(stepKernel, numParticles*numCopies, workgroupSize);

    // Calculate forces based on the updated positions.
    
Peter Eastman's avatar
Peter Eastman committed
235
    computeForces(context);
236
237
238
239
240
241
    
    // Update velocities.
    cl.executeKernel(velocitiesKernel, numParticles*numCopies, workgroupSize);

    // Apply the PILE-L thermostat again.

242
243
244
245
    if (integrator.getApplyThermostat()) {
        pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
        cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
    }
246
247
248
249
250

    // Update the time and step count.

    cl.setTime(cl.getTime()+dt);
    cl.setStepCount(cl.getStepCount()+1);
251
252
253
254
255
256
257
258
    cl.reorderAtoms();
    if (cl.getAtomsWereReordered() && cl.getNonbondedUtilities().getUsePeriodic()) {
        // Atoms may have been translated into a different periodic box, so apply
        // the same translation to all the beads.

        translateKernel.setArg<cl_int>(3, numCopies-1);
        cl.executeKernel(translateKernel, cl.getNumAtoms());
    }
259
260
}

Peter Eastman's avatar
Peter Eastman committed
261
void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
262
263
264
265
266
    // Compute forces from all groups that didn't have a specified contraction.

    copyToContextKernel.setArg<cl::Buffer>(2, positions->getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(5, positions->getDeviceBuffer());
Peter Eastman's avatar
Peter Eastman committed
267
    for (int i = 0; i < numCopies; i++) {
268
269
        copyToContextKernel.setArg<cl_int>(5, i);
        cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
Peter Eastman's avatar
Peter Eastman committed
270
        context.computeVirtualSites();
271
272
        Vec3 initialBox[3];
        context.getPeriodicBoxVectors(initialBox[0], initialBox[1], initialBox[2]);
273
        context.updateContextState();
274
275
276
277
278
279
280
281
282
        Vec3 finalBox[3];
        context.getPeriodicBoxVectors(finalBox[0], finalBox[1], finalBox[2]);
        if (initialBox[0] != finalBox[0] || initialBox[1] != finalBox[1] || initialBox[2] != finalBox[2]) {
            // A barostat was applied during updateContextState().  Adjust the particle positions in all the
            // other copies to match this one.
            
            translateKernel.setArg<cl_int>(3, i);
            cl.executeKernel(translateKernel, cl.getNumAtoms());
        }
283
        context.calcForcesAndEnergy(true, false, groupsNotContracted);
284
285
        copyFromContextKernel.setArg<cl_int>(7, i);
        cl.executeKernel(copyFromContextKernel, cl.getNumAtoms());
Peter Eastman's avatar
Peter Eastman committed
286
    }
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
    
    // Now loop over contractions and compute forces from them.
    
    if (groupsByCopies.size() > 0) {
        copyToContextKernel.setArg<cl::Buffer>(2, contractedPositions->getDeviceBuffer());
        copyFromContextKernel.setArg<cl::Buffer>(1, contractedForces->getDeviceBuffer());
        copyFromContextKernel.setArg<cl::Buffer>(5, contractedPositions->getDeviceBuffer());
        for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
            int copies = iter->first;
            int groupFlags = iter->second;

            // Find the contracted positions.

            cl.executeKernel(positionContractionKernels[copies], numParticles*numCopies, workgroupSize);

            // Compute forces.

            for (int i = 0; i < copies; i++) {
                copyToContextKernel.setArg<cl_int>(5, i);
                cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
                context.computeVirtualSites();
                context.calcForcesAndEnergy(true, false, groupFlags);
                copyFromContextKernel.setArg<cl_int>(7, i);
                cl.executeKernel(copyFromContextKernel, cl.getNumAtoms());
            }

            // Apply the forces to the original copies.

            cl.executeKernel(forceContractionKernels[copies], numParticles*numCopies, workgroupSize);
        }
    }
318
319
320
321
322
323
324
    if (groupsByCopies.size() > 0) {
        // Ensure the Context contains the positions from the last copy, since we'll assume that later.
        
        copyToContextKernel.setArg<cl::Buffer>(2, positions->getDeviceBuffer());
        copyToContextKernel.setArg<cl_int>(5, numCopies-1);
        cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
    }
Peter Eastman's avatar
Peter Eastman committed
325
326
}

327
328
329
330
double OpenCLIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
    return cl.getIntegrationUtilities().computeKineticEnergy(0);
}

331
332
333
334
335
void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) {
    if (positions == NULL)
        throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
    if (pos.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
336
337
338
339
340
341
342
343
344
345
346
347
348

    // Adjust the positions based on the current cell offsets.
    
    const vector<int>& order = cl.getAtomIndex();
    mm_double4 periodicBoxSize = cl.getPeriodicBoxSizeDouble();
    vector<Vec3> offsetPos(numParticles);
    for (int i = 0; i < numParticles; ++i) {
        mm_int4 offset = cl.getPosCellOffsets()[i];
        offsetPos[order[i]] = pos[order[i]] + Vec3(offset.x*periodicBoxSize.x, offset.y*periodicBoxSize.y, offset.z*periodicBoxSize.z);
    }

    // Record the positions.

349
350
351
352
    if (cl.getUseDoublePrecision()) {
        vector<mm_double4> posq(cl.getPaddedNumAtoms());
        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; i++)
353
            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posq[i].w);
354
355
356
357
358
359
360
        cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
    }
    else if (cl.getUseMixedPrecision()) {
        vector<mm_float4> posqf(cl.getPaddedNumAtoms());
        cl.getPosq().download(posqf);
        vector<mm_double4> posq(cl.getPaddedNumAtoms());
        for (int i = 0; i < numParticles; i++)
361
            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posqf[i].w);
362
363
364
365
366
367
        cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
    }
    else {
        vector<mm_float4> posq(cl.getPaddedNumAtoms());
        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; i++)
368
            posq[i] = mm_float4((cl_float) offsetPos[i][0], (cl_float) offsetPos[i][1], (cl_float) offsetPos[i][2], posq[i].w);
369
370
        cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &posq[0]);
    }
371
372
373
374
375
376
377
}

void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
    if (velocities == NULL)
        throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
    if (vel.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
378
379
380
381
382
383
384
385
386
387
388
389
390
391
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
        vector<mm_double4> velm(cl.getPaddedNumAtoms());
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; i++)
            velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
        cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &velm[0]);
    }
    else {
        vector<mm_float4> velm(cl.getPaddedNumAtoms());
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; i++)
            velm[i] = mm_float4((cl_float) vel[i][0], (cl_float) vel[i][1], (cl_float) vel[i][2], velm[i].w);
        cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &velm[0]);
    }
392
393
}

394
void OpenCLIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
395
396
    if (!hasInitializedKernel)
        initializeKernels(context);
397
    copyToContextKernel.setArg<cl::Buffer>(2, positions->getDeviceBuffer());
398
399
    copyToContextKernel.setArg<cl_int>(5, copy);
    cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
400
401
402
403
404
405
406
407
}

string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
    stringstream source;
    int stage = 0;
    int L = size;
    int m = 1;
    string sign = (forward ? "1.0f" : "-1.0f");
Peter Eastman's avatar
Peter Eastman committed
408
409
    string multReal = (forward ? "multiplyComplexRealPart" : "multiplyComplexRealPartConj");
    string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");
410
411

    source<<"{\n";
412
413
414
415
    source<<"__local mixed4* real0 = "<<variable<<"real;\n";
    source<<"__local mixed4* imag0 = "<<variable<<"imag;\n";
    source<<"__local mixed4* real1 = &temp[blockStart];\n";
    source<<"__local mixed4* imag1 = &temp[blockStart+get_local_size(0)];\n";
416
417
418

    // Factor size, generating an appropriate block of code for each factor.

peastman's avatar
peastman committed
419
    while (L > 1) {
420
421
        int input = stage%2;
        int output = 1-input;
peastman's avatar
peastman committed
422
423
424
425
426
427
428
429
430
431
432
        int radix;
        if (L%5 == 0)
            radix = 5;
        else if (L%4 == 0)
            radix = 4;
        else if (L%3 == 0)
            radix = 3;
        else if (L%2 == 0)
            radix = 2;
        else
            throw OpenMMException("Illegal size for FFT: "+cl.intToString(size));
433
        source<<"{\n";
peastman's avatar
peastman committed
434
435
436
437
438
439
        L = L/radix;
        source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
        source<<"if (indexInBlock < "<<(L*m)<<") {\n";
        source<<"int i = indexInBlock;\n";
        source<<"int j = i/"<<m<<";\n";
        if (radix == 5) {
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n";
            source<<"mixed4 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n";
            source<<"mixed4 d0r = c1r+c4r;\n";
            source<<"mixed4 d0i = c1i+c4i;\n";
            source<<"mixed4 d1r = c2r+c3r;\n";
            source<<"mixed4 d1i = c2i+c3i;\n";
            source<<"mixed4 d2r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
            source<<"mixed4 d2i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
            source<<"mixed4 d3r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
            source<<"mixed4 d3i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
            source<<"mixed4 d4r = d0r+d1r;\n";
            source<<"mixed4 d4i = d0i+d1i;\n";
            source<<"mixed4 d5r = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
            source<<"mixed4 d5i = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
            source<<"mixed4 d6r = c0r-0.25f*d4r;\n";
            source<<"mixed4 d6i = c0i-0.25f*d4i;\n";
            source<<"mixed4 d7r = d6r+d5r;\n";
            source<<"mixed4 d7i = d6i+d5i;\n";
            source<<"mixed4 d8r = d6r-d5r;\n";
            source<<"mixed4 d8i = d6i-d5i;\n";
468
            string coeff = cl.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
469
470
471
472
            source<<"mixed4 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
            source<<"mixed4 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
            source<<"mixed4 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
            source<<"mixed4 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
473
474
            source<<"real"<<output<<"[i+4*j*"<<m<<"] = c0r+d4r;\n";
            source<<"imag"<<output<<"[i+4*j*"<<m<<"] = c0i+d4i;\n";
Peter Eastman's avatar
Peter Eastman committed
475
476
477
478
479
480
481
482
            source<<"real"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
            source<<"imag"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
            source<<"real"<<output<<"[i+(4*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(5*L)<<"], d8r+d10r, d8i+d10i);\n";
            source<<"imag"<<output<<"[i+(4*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(5*L)<<"], d8r+d10r, d8i+d10i);\n";
            source<<"real"<<output<<"[i+(4*j+3)*"<<m<<"] = "<<multReal<<"(w[j*"<<(3*size)<<"/"<<(5*L)<<"], d8r-d10r, d8i-d10i);\n";
            source<<"imag"<<output<<"[i+(4*j+3)*"<<m<<"] = "<<multImag<<"(w[j*"<<(3*size)<<"/"<<(5*L)<<"], d8r-d10r, d8i-d10i);\n";
            source<<"real"<<output<<"[i+(4*j+4)*"<<m<<"] = "<<multReal<<"(w[j*"<<(4*size)<<"/"<<(5*L)<<"], d7r-d9r, d7i-d9i);\n";
            source<<"imag"<<output<<"[i+(4*j+4)*"<<m<<"] = "<<multImag<<"(w[j*"<<(4*size)<<"/"<<(5*L)<<"], d7r-d9r, d7i-d9i);\n";
483
        }
peastman's avatar
peastman committed
484
        else if (radix == 4) {
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 d0r = c0r+c2r;\n";
            source<<"mixed4 d0i = c0i+c2i;\n";
            source<<"mixed4 d1r = c0r-c2r;\n";
            source<<"mixed4 d1i = c0i-c2i;\n";
            source<<"mixed4 d2r = c1r+c3r;\n";
            source<<"mixed4 d2i = c1i+c3i;\n";
            source<<"mixed4 d3r = "<<sign<<"*(c1i-c3i);\n";
            source<<"mixed4 d3i = "<<sign<<"*(c3r-c1r);\n";
501
502
            source<<"real"<<output<<"[i+3*j*"<<m<<"] = d0r+d2r;\n";
            source<<"imag"<<output<<"[i+3*j*"<<m<<"] = d0i+d2i;\n";
Peter Eastman's avatar
Peter Eastman committed
503
504
505
506
507
508
            source<<"real"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
            source<<"imag"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
            source<<"real"<<output<<"[i+(3*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(4*L)<<"], d0r-d2r, d0i-d2i);\n";
            source<<"imag"<<output<<"[i+(3*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(4*L)<<"], d0r-d2r, d0i-d2i);\n";
            source<<"real"<<output<<"[i+(3*j+3)*"<<m<<"] = "<<multReal<<"(w[j*"<<(3*size)<<"/"<<(4*L)<<"], d1r-d3r, d1i-d3i);\n";
            source<<"imag"<<output<<"[i+(3*j+3)*"<<m<<"] = "<<multImag<<"(w[j*"<<(3*size)<<"/"<<(4*L)<<"], d1r-d3r, d1i-d3i);\n";
509
        }
peastman's avatar
peastman committed
510
        else if (radix == 3) {
511
512
513
514
515
516
517
518
519
520
521
522
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 d0r = c1r+c2r;\n";
            source<<"mixed4 d0i = c1i+c2i;\n";
            source<<"mixed4 d1r = c0r-0.5f*d0r;\n";
            source<<"mixed4 d1i = c0i-0.5f*d0i;\n";
            source<<"mixed4 d2r = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
            source<<"mixed4 d2i = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
523
524
            source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n";
            source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n";
Peter Eastman's avatar
Peter Eastman committed
525
526
527
528
            source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
            source<<"imag"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
            source<<"real"<<output<<"[i+(2*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(3*L)<<"], d1r-d2r, d1i-d2i);\n";
            source<<"imag"<<output<<"[i+(2*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(3*L)<<"], d1r-d2r, d1i-d2i);\n";
529
        }
peastman's avatar
peastman committed
530
        else if (radix == 2) {
531
532
533
534
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
535
536
            source<<"real"<<output<<"[i+j*"<<m<<"] = c0r+c1r;\n";
            source<<"imag"<<output<<"[i+j*"<<m<<"] = c0i+c1i;\n";
Peter Eastman's avatar
Peter Eastman committed
537
538
            source<<"real"<<output<<"[i+(j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
            source<<"imag"<<output<<"[i+(j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
539
        }
peastman's avatar
peastman committed
540
541
        source<<"}\n";
        m = m*radix;
542
543
544
545
546
547
548
549
        source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
        source<<"}\n";
        ++stage;
    }

    // Create the kernel.

    if (stage%2 == 1) {
Peter Eastman's avatar
Peter Eastman committed
550
551
        source<<"real0[indexInBlock] = real1[indexInBlock];\n";
        source<<"imag0[indexInBlock] = imag1[indexInBlock];\n";
Peter Eastman's avatar
Bug fix  
Peter Eastman committed
552
        source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
553
554
555
556
    }
    source<<"}\n";
    return source.str();
}