OpenCLRpmdKernels.cpp 28.8 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
peastman's avatar
peastman committed
9
 * Portions copyright (c) 2011-2018 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

#include "OpenCLRpmdKernels.h"
#include "OpenCLRpmdKernelSources.h"
#include "openmm/internal/ContextImpl.h"
#include "OpenCLIntegrationUtilities.h"
#include "OpenCLExpressionUtilities.h"
#include "OpenCLFFT3D.h"
Peter Eastman's avatar
Peter Eastman committed
38
#include "OpenCLNonbondedUtilities.h"
39
#include "SimTKOpenMMRealType.h"
40
41
42
43
44
45
46
47
48
49
50
51

using namespace OpenMM;
using namespace std;

void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
    cl.getPlatformData().initializeContexts(system);
    numCopies = integrator.getNumCopies();
    numParticles = system.getNumParticles();
    workgroupSize = numCopies;
    if (numCopies != OpenCLFFT3D::findLegalDimension(numCopies))
        throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
    int paddedParticles = cl.getPaddedNumAtoms();
52
    int forceElementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
peastman's avatar
peastman committed
53
    forces.initialize(cl, numCopies*paddedParticles, forceElementSize, "rpmdForces");
54
55
    bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
    int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4));
peastman's avatar
peastman committed
56
57
    positions.initialize(cl, numCopies*paddedParticles, elementSize, "rpmdPositions");
    velocities.initialize(cl, numCopies*paddedParticles, elementSize, "rpmdVelocities");
58
59
60
61
    cl.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
    
    // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
    
62
    if (useDoublePrecision) {
peastman's avatar
peastman committed
63
64
        vector<mm_double4> temp(positions.getSize());
        for (int i = 0; i < positions.getSize(); i++)
65
            temp[i] = mm_double4(0, 0, 0, 0);
peastman's avatar
peastman committed
66
67
        positions.upload(temp);
        for (int i = 0; i < velocities.getSize(); i++)
68
            temp[i] = mm_double4(0, 0, 0, 1);
peastman's avatar
peastman committed
69
        velocities.upload(temp);
70
71
    }
    else {
peastman's avatar
peastman committed
72
73
        vector<mm_float4> temp(positions.getSize());
        for (int i = 0; i < positions.getSize(); i++)
74
            temp[i] = mm_float4(0, 0, 0, 0);
peastman's avatar
peastman committed
75
76
        positions.upload(temp);
        for (int i = 0; i < velocities.getSize(); i++)
77
            temp[i] = mm_float4(0, 0, 0, 1);
peastman's avatar
peastman committed
78
        velocities.upload(temp);
79
    }
80
81
82
83
84
85
    
    // Build a list of contractions.
    
    groupsNotContracted = -1;
    const map<int, int>& contractions = integrator.getContractions();
    int maxContractedCopies = 0;
peastman's avatar
peastman committed
86
87
88
    for (auto& c : contractions) {
        int group = c.first;
        int copies = c.second;
89
90
91
92
        if (group < 0 || group > 31)
            throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31");
        if (copies < 0 || copies > numCopies)
            throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated");
93
94
        if (copies != OpenCLFFT3D::findLegalDimension(copies))
            throw OpenMMException("RPMDIntegrator: Number of copies for contraction must be a multiple of powers of 2, 3, and 5.");
95
96
97
98
99
100
101
102
        if (copies != numCopies) {
            if (groupsByCopies.find(copies) == groupsByCopies.end()) {
                groupsByCopies[copies] = 1<<group;
                if (copies > maxContractedCopies)
                    maxContractedCopies = copies;
            }
            else
                groupsByCopies[copies] |= 1<<group;
peastman's avatar
peastman committed
103
            groupsNotContracted -= 1<<group;
104
105
106
        }
    }
    if (maxContractedCopies > 0) {
peastman's avatar
peastman committed
107
108
        contractedForces.initialize(cl, maxContractedCopies*paddedParticles, forceElementSize, "rpmdContractedForces");
        contractedPositions.initialize(cl, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
109
    }
110
111
112
113

    // Create kernels.
    
    map<string, string> defines;
114
115
116
    defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
    defines["NUM_COPIES"] = cl.intToString(numCopies);
117
    defines["THREAD_BLOCK_SIZE"] = cl.intToString(workgroupSize);
118
119
120
    defines["HBAR"] = cl.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12));
    defines["SCALE"] = cl.doubleToString(1.0/sqrt((double) numCopies));
    defines["M_PI"] = cl.doubleToString(M_PI);
121
122
123
124
125
126
127
128
129
    map<string, string> replacements;
    replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
    replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false);
    replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true);
    replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false);
    cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLRpmdKernelSources::rpmd, replacements), defines, "");
    pileKernel = cl::Kernel(program, "applyPileThermostat");
    stepKernel = cl::Kernel(program, "integrateStep");
    velocitiesKernel = cl::Kernel(program, "advanceVelocities");
130
131
    copyToContextKernel = cl::Kernel(program, "copyDataToContext");
    copyFromContextKernel = cl::Kernel(program, "copyDataFromContext");
Peter Eastman's avatar
Peter Eastman committed
132
    translateKernel = cl::Kernel(program, "applyCellTranslations");
133
134
135
    
    // Create kernels for doing contractions.
    
peastman's avatar
peastman committed
136
137
    for (auto& g : groupsByCopies) {
        int copies = g.first;
138
139
140
141
142
143
144
145
146
147
148
149
        replacements.clear();
        replacements["NUM_CONTRACTED_COPIES"] = cl.intToString(copies);
        replacements["POS_SCALE"] = cl.doubleToString(1.0/numCopies);
        replacements["FORCE_SCALE"] = cl.doubleToString(1.0/copies);
        replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
        replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
        replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
        replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
        program = cl.createProgram(cl.replaceStrings(OpenCLRpmdKernelSources::rpmdContraction, replacements), defines, "");
        positionContractionKernels[copies] = cl::Kernel(program, "contractPositions");
        forceContractionKernels[copies] = cl::Kernel(program, "contractForces");
    }
150
151
}

152
153
void OpenCLIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) {
    hasInitializedKernel = true;
peastman's avatar
peastman committed
154
155
156
157
158
159
160
    pileKernel.setArg<cl::Buffer>(0, velocities.getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(0, positions.getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(1, velocities.getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(2, forces.getDeviceBuffer());
    velocitiesKernel.setArg<cl::Buffer>(0, velocities.getDeviceBuffer());
    velocitiesKernel.setArg<cl::Buffer>(1, forces.getDeviceBuffer());
    translateKernel.setArg<cl::Buffer>(0, positions.getDeviceBuffer());
161
162
    translateKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
    translateKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
peastman's avatar
peastman committed
163
    copyToContextKernel.setArg<cl::Buffer>(0, velocities.getDeviceBuffer());
164
165
166
167
168
    copyToContextKernel.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(3, cl.getPosq().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(4, cl.getAtomIndexArray().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
peastman's avatar
peastman committed
169
    copyFromContextKernel.setArg<cl::Buffer>(3, velocities.getDeviceBuffer());
170
171
    copyFromContextKernel.setArg<cl::Buffer>(4, cl.getPosq().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(6, cl.getAtomIndexArray().getDeviceBuffer());
peastman's avatar
peastman committed
172
173
    for (auto& g : groupsByCopies) {
        int copies = g.first;
peastman's avatar
peastman committed
174
175
176
177
        positionContractionKernels[copies].setArg<cl::Buffer>(0, positions.getDeviceBuffer());
        positionContractionKernels[copies].setArg<cl::Buffer>(1, contractedPositions.getDeviceBuffer());
        forceContractionKernels[copies].setArg<cl::Buffer>(0, forces.getDeviceBuffer());
        forceContractionKernels[copies].setArg<cl::Buffer>(1, contractedForces.getDeviceBuffer());
178
    }
179
180
}

181
182
void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
183
184
    if (!hasInitializedKernel)
        initializeKernels(context);
185
186
187
    
    // Loop over copies and compute the force on each one.
    
Peter Eastman's avatar
Peter Eastman committed
188
189
    if (!forcesAreValid)
        computeForces(context);
190
191
192
    
    // Apply the PILE-L thermostat.
    
193
    bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
194
    const double dt = integrator.getStepSize();
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
    pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
    pileKernel.setArg<cl::Buffer>(1, integration.getRandom().getDeviceBuffer()); // Do this *after* prepareRandomNumbers(), which might rebuild the array.
    if (useDoublePrecision) {
        pileKernel.setArg<cl_double>(3, dt);
        pileKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
        pileKernel.setArg<cl_double>(5, integrator.getFriction());
        stepKernel.setArg<cl_double>(3, dt);
        stepKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
        velocitiesKernel.setArg<cl_double>(2, dt);
    }
    else {
        pileKernel.setArg<cl_float>(3, (cl_float) dt);
        pileKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
        pileKernel.setArg<cl_float>(5, (cl_float) integrator.getFriction());
        stepKernel.setArg<cl_float>(3, (cl_float) dt);
        stepKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
        velocitiesKernel.setArg<cl_float>(2, (cl_float) dt);
    }
213
214
    if (integrator.getApplyThermostat())
        cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
215
216
217
218
219
220
221

    // Update positions and velocities.
    
    cl.executeKernel(stepKernel, numParticles*numCopies, workgroupSize);

    // Calculate forces based on the updated positions.
    
Peter Eastman's avatar
Peter Eastman committed
222
    computeForces(context);
223
224
225
226
227
228
    
    // Update velocities.
    cl.executeKernel(velocitiesKernel, numParticles*numCopies, workgroupSize);

    // Apply the PILE-L thermostat again.

229
230
231
232
    if (integrator.getApplyThermostat()) {
        pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
        cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);
    }
233
234
235
236
237

    // Update the time and step count.

    cl.setTime(cl.getTime()+dt);
    cl.setStepCount(cl.getStepCount()+1);
238
239
240
241
242
243
244
245
    cl.reorderAtoms();
    if (cl.getAtomsWereReordered() && cl.getNonbondedUtilities().getUsePeriodic()) {
        // Atoms may have been translated into a different periodic box, so apply
        // the same translation to all the beads.

        translateKernel.setArg<cl_int>(3, numCopies-1);
        cl.executeKernel(translateKernel, cl.getNumAtoms());
    }
246
247
}

Peter Eastman's avatar
Peter Eastman committed
248
void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
249
250
    // Compute forces from all groups that didn't have a specified contraction.

peastman's avatar
peastman committed
251
252
253
    copyToContextKernel.setArg<cl::Buffer>(2, positions.getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(1, forces.getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(5, positions.getDeviceBuffer());
Peter Eastman's avatar
Peter Eastman committed
254
    for (int i = 0; i < numCopies; i++) {
255
256
        copyToContextKernel.setArg<cl_int>(5, i);
        cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
Peter Eastman's avatar
Peter Eastman committed
257
        context.computeVirtualSites();
258
259
        Vec3 initialBox[3];
        context.getPeriodicBoxVectors(initialBox[0], initialBox[1], initialBox[2]);
260
        context.updateContextState();
261
262
        Vec3 finalBox[3];
        context.getPeriodicBoxVectors(finalBox[0], finalBox[1], finalBox[2]);
263
264
        if (initialBox[0] != finalBox[0] || initialBox[1] != finalBox[1] || initialBox[2] != finalBox[2])
            throw OpenMMException("Standard barostats cannot be used with RPMDIntegrator.  Use RPMDMonteCarloBarostat instead.");
265
        context.calcForcesAndEnergy(true, false, groupsNotContracted);
266
267
        copyFromContextKernel.setArg<cl_int>(7, i);
        cl.executeKernel(copyFromContextKernel, cl.getNumAtoms());
Peter Eastman's avatar
Peter Eastman committed
268
    }
269
270
271
272
    
    // Now loop over contractions and compute forces from them.
    
    if (groupsByCopies.size() > 0) {
peastman's avatar
peastman committed
273
274
275
        copyToContextKernel.setArg<cl::Buffer>(2, contractedPositions.getDeviceBuffer());
        copyFromContextKernel.setArg<cl::Buffer>(1, contractedForces.getDeviceBuffer());
        copyFromContextKernel.setArg<cl::Buffer>(5, contractedPositions.getDeviceBuffer());
peastman's avatar
peastman committed
276
277
278
        for (auto& g : groupsByCopies) {
            int copies = g.first;
            int groupFlags = g.second;
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299

            // Find the contracted positions.

            cl.executeKernel(positionContractionKernels[copies], numParticles*numCopies, workgroupSize);

            // Compute forces.

            for (int i = 0; i < copies; i++) {
                copyToContextKernel.setArg<cl_int>(5, i);
                cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
                context.computeVirtualSites();
                context.calcForcesAndEnergy(true, false, groupFlags);
                copyFromContextKernel.setArg<cl_int>(7, i);
                cl.executeKernel(copyFromContextKernel, cl.getNumAtoms());
            }

            // Apply the forces to the original copies.

            cl.executeKernel(forceContractionKernels[copies], numParticles*numCopies, workgroupSize);
        }
    }
300
301
302
    if (groupsByCopies.size() > 0) {
        // Ensure the Context contains the positions from the last copy, since we'll assume that later.
        
peastman's avatar
peastman committed
303
        copyToContextKernel.setArg<cl::Buffer>(2, positions.getDeviceBuffer());
304
305
306
        copyToContextKernel.setArg<cl_int>(5, numCopies-1);
        cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
    }
Peter Eastman's avatar
Peter Eastman committed
307
308
}

309
310
311
312
double OpenCLIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
    return cl.getIntegrationUtilities().computeKineticEnergy(0);
}

313
void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) {
peastman's avatar
peastman committed
314
    if (!positions.isInitialized())
315
316
317
        throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
    if (pos.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
318
319
320
321
322
323
324
325
326
327
328
329
330

    // Adjust the positions based on the current cell offsets.
    
    const vector<int>& order = cl.getAtomIndex();
    mm_double4 periodicBoxSize = cl.getPeriodicBoxSizeDouble();
    vector<Vec3> offsetPos(numParticles);
    for (int i = 0; i < numParticles; ++i) {
        mm_int4 offset = cl.getPosCellOffsets()[i];
        offsetPos[order[i]] = pos[order[i]] + Vec3(offset.x*periodicBoxSize.x, offset.y*periodicBoxSize.y, offset.z*periodicBoxSize.z);
    }

    // Record the positions.

331
332
333
334
    if (cl.getUseDoublePrecision()) {
        vector<mm_double4> posq(cl.getPaddedNumAtoms());
        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; i++)
335
            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posq[i].w);
peastman's avatar
peastman committed
336
        cl.getQueue().enqueueWriteBuffer(positions.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
337
338
339
340
341
342
    }
    else if (cl.getUseMixedPrecision()) {
        vector<mm_float4> posqf(cl.getPaddedNumAtoms());
        cl.getPosq().download(posqf);
        vector<mm_double4> posq(cl.getPaddedNumAtoms());
        for (int i = 0; i < numParticles; i++)
343
            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posqf[i].w);
peastman's avatar
peastman committed
344
        cl.getQueue().enqueueWriteBuffer(positions.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
345
346
347
348
349
    }
    else {
        vector<mm_float4> posq(cl.getPaddedNumAtoms());
        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; i++)
350
            posq[i] = mm_float4((cl_float) offsetPos[i][0], (cl_float) offsetPos[i][1], (cl_float) offsetPos[i][2], posq[i].w);
peastman's avatar
peastman committed
351
        cl.getQueue().enqueueWriteBuffer(positions.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &posq[0]);
352
    }
353
354
355
}

void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
peastman's avatar
peastman committed
356
    if (!velocities.isInitialized())
357
358
359
        throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
    if (vel.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
360
361
362
363
364
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
        vector<mm_double4> velm(cl.getPaddedNumAtoms());
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; i++)
            velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
peastman's avatar
peastman committed
365
        cl.getQueue().enqueueWriteBuffer(velocities.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &velm[0]);
366
367
368
369
370
371
    }
    else {
        vector<mm_float4> velm(cl.getPaddedNumAtoms());
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; i++)
            velm[i] = mm_float4((cl_float) vel[i][0], (cl_float) vel[i][1], (cl_float) vel[i][2], velm[i].w);
peastman's avatar
peastman committed
372
        cl.getQueue().enqueueWriteBuffer(velocities.getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &velm[0]);
373
    }
374
375
}

376
void OpenCLIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
377
378
    if (!hasInitializedKernel)
        initializeKernels(context);
peastman's avatar
peastman committed
379
    copyToContextKernel.setArg<cl::Buffer>(2, positions.getDeviceBuffer());
380
381
    copyToContextKernel.setArg<cl_int>(5, copy);
    cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
382
383
384
385
386
387
388
389
}

string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
    stringstream source;
    int stage = 0;
    int L = size;
    int m = 1;
    string sign = (forward ? "1.0f" : "-1.0f");
Peter Eastman's avatar
Peter Eastman committed
390
391
    string multReal = (forward ? "multiplyComplexRealPart" : "multiplyComplexRealPartConj");
    string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");
392
393

    source<<"{\n";
394
395
396
397
    source<<"__local mixed4* real0 = "<<variable<<"real;\n";
    source<<"__local mixed4* imag0 = "<<variable<<"imag;\n";
    source<<"__local mixed4* real1 = &temp[blockStart];\n";
    source<<"__local mixed4* imag1 = &temp[blockStart+get_local_size(0)];\n";
398
399
400

    // Factor size, generating an appropriate block of code for each factor.

peastman's avatar
peastman committed
401
    while (L > 1) {
402
403
        int input = stage%2;
        int output = 1-input;
peastman's avatar
peastman committed
404
405
406
407
408
409
410
411
412
413
414
        int radix;
        if (L%5 == 0)
            radix = 5;
        else if (L%4 == 0)
            radix = 4;
        else if (L%3 == 0)
            radix = 3;
        else if (L%2 == 0)
            radix = 2;
        else
            throw OpenMMException("Illegal size for FFT: "+cl.intToString(size));
415
        source<<"{\n";
peastman's avatar
peastman committed
416
417
418
419
420
421
        L = L/radix;
        source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
        source<<"if (indexInBlock < "<<(L*m)<<") {\n";
        source<<"int i = indexInBlock;\n";
        source<<"int j = i/"<<m<<";\n";
        if (radix == 5) {
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n";
            source<<"mixed4 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n";
            source<<"mixed4 d0r = c1r+c4r;\n";
            source<<"mixed4 d0i = c1i+c4i;\n";
            source<<"mixed4 d1r = c2r+c3r;\n";
            source<<"mixed4 d1i = c2i+c3i;\n";
            source<<"mixed4 d2r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
            source<<"mixed4 d2i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
            source<<"mixed4 d3r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
            source<<"mixed4 d3i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
            source<<"mixed4 d4r = d0r+d1r;\n";
            source<<"mixed4 d4i = d0i+d1i;\n";
            source<<"mixed4 d5r = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
            source<<"mixed4 d5i = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
            source<<"mixed4 d6r = c0r-0.25f*d4r;\n";
            source<<"mixed4 d6i = c0i-0.25f*d4i;\n";
            source<<"mixed4 d7r = d6r+d5r;\n";
            source<<"mixed4 d7i = d6i+d5i;\n";
            source<<"mixed4 d8r = d6r-d5r;\n";
            source<<"mixed4 d8i = d6i-d5i;\n";
450
            string coeff = cl.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
451
452
453
454
            source<<"mixed4 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
            source<<"mixed4 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
            source<<"mixed4 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
            source<<"mixed4 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
455
456
            source<<"real"<<output<<"[i+4*j*"<<m<<"] = c0r+d4r;\n";
            source<<"imag"<<output<<"[i+4*j*"<<m<<"] = c0i+d4i;\n";
Peter Eastman's avatar
Peter Eastman committed
457
458
459
460
461
462
463
464
            source<<"real"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
            source<<"imag"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
            source<<"real"<<output<<"[i+(4*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(5*L)<<"], d8r+d10r, d8i+d10i);\n";
            source<<"imag"<<output<<"[i+(4*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(5*L)<<"], d8r+d10r, d8i+d10i);\n";
            source<<"real"<<output<<"[i+(4*j+3)*"<<m<<"] = "<<multReal<<"(w[j*"<<(3*size)<<"/"<<(5*L)<<"], d8r-d10r, d8i-d10i);\n";
            source<<"imag"<<output<<"[i+(4*j+3)*"<<m<<"] = "<<multImag<<"(w[j*"<<(3*size)<<"/"<<(5*L)<<"], d8r-d10r, d8i-d10i);\n";
            source<<"real"<<output<<"[i+(4*j+4)*"<<m<<"] = "<<multReal<<"(w[j*"<<(4*size)<<"/"<<(5*L)<<"], d7r-d9r, d7i-d9i);\n";
            source<<"imag"<<output<<"[i+(4*j+4)*"<<m<<"] = "<<multImag<<"(w[j*"<<(4*size)<<"/"<<(5*L)<<"], d7r-d9r, d7i-d9i);\n";
465
        }
peastman's avatar
peastman committed
466
        else if (radix == 4) {
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 d0r = c0r+c2r;\n";
            source<<"mixed4 d0i = c0i+c2i;\n";
            source<<"mixed4 d1r = c0r-c2r;\n";
            source<<"mixed4 d1i = c0i-c2i;\n";
            source<<"mixed4 d2r = c1r+c3r;\n";
            source<<"mixed4 d2i = c1i+c3i;\n";
            source<<"mixed4 d3r = "<<sign<<"*(c1i-c3i);\n";
            source<<"mixed4 d3i = "<<sign<<"*(c3r-c1r);\n";
483
484
            source<<"real"<<output<<"[i+3*j*"<<m<<"] = d0r+d2r;\n";
            source<<"imag"<<output<<"[i+3*j*"<<m<<"] = d0i+d2i;\n";
Peter Eastman's avatar
Peter Eastman committed
485
486
487
488
489
490
            source<<"real"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
            source<<"imag"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
            source<<"real"<<output<<"[i+(3*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(4*L)<<"], d0r-d2r, d0i-d2i);\n";
            source<<"imag"<<output<<"[i+(3*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(4*L)<<"], d0r-d2r, d0i-d2i);\n";
            source<<"real"<<output<<"[i+(3*j+3)*"<<m<<"] = "<<multReal<<"(w[j*"<<(3*size)<<"/"<<(4*L)<<"], d1r-d3r, d1i-d3i);\n";
            source<<"imag"<<output<<"[i+(3*j+3)*"<<m<<"] = "<<multImag<<"(w[j*"<<(3*size)<<"/"<<(4*L)<<"], d1r-d3r, d1i-d3i);\n";
491
        }
peastman's avatar
peastman committed
492
        else if (radix == 3) {
493
494
495
496
497
498
499
500
501
502
503
504
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 d0r = c1r+c2r;\n";
            source<<"mixed4 d0i = c1i+c2i;\n";
            source<<"mixed4 d1r = c0r-0.5f*d0r;\n";
            source<<"mixed4 d1i = c0i-0.5f*d0i;\n";
            source<<"mixed4 d2r = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
            source<<"mixed4 d2i = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
505
506
            source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n";
            source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n";
Peter Eastman's avatar
Peter Eastman committed
507
508
509
510
            source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
            source<<"imag"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
            source<<"real"<<output<<"[i+(2*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(3*L)<<"], d1r-d2r, d1i-d2i);\n";
            source<<"imag"<<output<<"[i+(2*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(3*L)<<"], d1r-d2r, d1i-d2i);\n";
511
        }
peastman's avatar
peastman committed
512
        else if (radix == 2) {
513
514
515
516
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
517
518
            source<<"real"<<output<<"[i+j*"<<m<<"] = c0r+c1r;\n";
            source<<"imag"<<output<<"[i+j*"<<m<<"] = c0i+c1i;\n";
Peter Eastman's avatar
Peter Eastman committed
519
520
            source<<"real"<<output<<"[i+(j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
            source<<"imag"<<output<<"[i+(j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
521
        }
peastman's avatar
peastman committed
522
523
        source<<"}\n";
        m = m*radix;
524
525
526
527
528
529
530
531
        source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
        source<<"}\n";
        ++stage;
    }

    // Create the kernel.

    if (stage%2 == 1) {
Peter Eastman's avatar
Peter Eastman committed
532
533
        source<<"real0[indexInBlock] = real1[indexInBlock];\n";
        source<<"imag0[indexInBlock] = imag1[indexInBlock];\n";
Peter Eastman's avatar
Bug fix  
Peter Eastman committed
534
        source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
535
536
537
538
    }
    source<<"}\n";
    return source.str();
}