OpenCLRpmdKernels.cpp 28.7 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2011-2013 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

#include "OpenCLRpmdKernels.h"
#include "OpenCLRpmdKernelSources.h"
#include "openmm/internal/ContextImpl.h"
#include "OpenCLIntegrationUtilities.h"
#include "OpenCLExpressionUtilities.h"
#include "OpenCLFFT3D.h"
Peter Eastman's avatar
Peter Eastman committed
38
#include "OpenCLNonbondedUtilities.h"
39
40
41
42
43
44
45
46
47
48
49
50
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"

using namespace OpenMM;
using namespace std;

OpenCLIntegrateRPMDStepKernel::~OpenCLIntegrateRPMDStepKernel() {
    if (forces != NULL)
        delete forces;
    if (positions != NULL)
        delete positions;
    if (velocities != NULL)
        delete velocities;
51
52
53
54
    if (contractedForces != NULL)
        delete contractedForces;
    if (contractedPositions != NULL)
        delete contractedPositions;
55
}
56

57
58
59
60
61
62
63
64
void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
    cl.getPlatformData().initializeContexts(system);
    numCopies = integrator.getNumCopies();
    numParticles = system.getNumParticles();
    workgroupSize = numCopies;
    if (numCopies != OpenCLFFT3D::findLegalDimension(numCopies))
        throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
    int paddedParticles = cl.getPaddedNumAtoms();
65
66
67
68
69
70
    int forceElementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
    forces = new OpenCLArray(cl, numCopies*paddedParticles, forceElementSize, "rpmdForces");
    bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
    int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4));
    positions = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdPositions");
    velocities = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdVelocities");
71
72
73
74
    cl.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
    
    // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
    
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
    if (useDoublePrecision) {
        vector<mm_double4> temp(positions->getSize());
        for (int i = 0; i < positions->getSize(); i++)
            temp[i] = mm_double4(0, 0, 0, 0);
        positions->upload(temp);
        for (int i = 0; i < velocities->getSize(); i++)
            temp[i] = mm_double4(0, 0, 0, 1);
        velocities->upload(temp);
    }
    else {
        vector<mm_float4> temp(positions->getSize());
        for (int i = 0; i < positions->getSize(); i++)
            temp[i] = mm_float4(0, 0, 0, 0);
        positions->upload(temp);
        for (int i = 0; i < velocities->getSize(); i++)
            temp[i] = mm_float4(0, 0, 0, 1);
        velocities->upload(temp);
    }
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
    
    // Build a list of contractions.
    
    groupsNotContracted = -1;
    const map<int, int>& contractions = integrator.getContractions();
    int maxContractedCopies = 0;
    for (map<int, int>::const_iterator iter = contractions.begin(); iter != contractions.end(); ++iter) {
        int group = iter->first;
        int copies = iter->second;
        if (group < 0 || group > 31)
            throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31");
        if (copies < 0 || copies > numCopies)
            throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated");
        if (copies != numCopies) {
            if (groupsByCopies.find(copies) == groupsByCopies.end()) {
                groupsByCopies[copies] = 1<<group;
                groupsNotContracted -= 1<<group;
                if (copies > maxContractedCopies)
                    maxContractedCopies = copies;
            }
            else
                groupsByCopies[copies] |= 1<<group;
        }
    }
    if (maxContractedCopies > 0) {
        contractedForces = new OpenCLArray(cl, maxContractedCopies*paddedParticles, forceElementSize, "rpmdContractedForces");
        contractedPositions = new OpenCLArray(cl, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
    }
121
122
123
124

    // Create kernels.
    
    map<string, string> defines;
125
126
127
    defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
    defines["NUM_COPIES"] = cl.intToString(numCopies);
128
    defines["THREAD_BLOCK_SIZE"] = cl.intToString(workgroupSize);
129
130
131
    defines["HBAR"] = cl.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12));
    defines["SCALE"] = cl.doubleToString(1.0/sqrt((double) numCopies));
    defines["M_PI"] = cl.doubleToString(M_PI);
132
133
134
135
136
137
138
139
140
    map<string, string> replacements;
    replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
    replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false);
    replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true);
    replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false);
    cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLRpmdKernelSources::rpmd, replacements), defines, "");
    pileKernel = cl::Kernel(program, "applyPileThermostat");
    stepKernel = cl::Kernel(program, "integrateStep");
    velocitiesKernel = cl::Kernel(program, "advanceVelocities");
141
142
    copyToContextKernel = cl::Kernel(program, "copyDataToContext");
    copyFromContextKernel = cl::Kernel(program, "copyDataFromContext");
Peter Eastman's avatar
Peter Eastman committed
143
    translateKernel = cl::Kernel(program, "applyCellTranslations");
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
    
    // Create kernels for doing contractions.
    
    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
        int copies = iter->first;
        replacements.clear();
        replacements["NUM_CONTRACTED_COPIES"] = cl.intToString(copies);
        replacements["POS_SCALE"] = cl.doubleToString(1.0/numCopies);
        replacements["FORCE_SCALE"] = cl.doubleToString(1.0/copies);
        replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
        replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
        replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
        replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
        program = cl.createProgram(cl.replaceStrings(OpenCLRpmdKernelSources::rpmdContraction, replacements), defines, "");
        positionContractionKernels[copies] = cl::Kernel(program, "contractPositions");
        forceContractionKernels[copies] = cl::Kernel(program, "contractForces");
    }
161
162
}

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
void OpenCLIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) {
    hasInitializedKernel = true;
    pileKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(1, velocities->getDeviceBuffer());
    stepKernel.setArg<cl::Buffer>(2, forces->getDeviceBuffer());
    velocitiesKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
    velocitiesKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
    translateKernel.setArg<cl::Buffer>(0, positions->getDeviceBuffer());
    translateKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
    translateKernel.setArg<cl::Buffer>(2, cl.getAtomIndexArray().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(0, velocities->getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(1, cl.getVelm().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(3, cl.getPosq().getDeviceBuffer());
    copyToContextKernel.setArg<cl::Buffer>(4, cl.getAtomIndexArray().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(0, cl.getForce().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(2, cl.getVelm().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(3, velocities->getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(4, cl.getPosq().getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(6, cl.getAtomIndexArray().getDeviceBuffer());
183
184
185
186
187
188
189
    for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
        int copies = iter->first;
        positionContractionKernels[copies].setArg<cl::Buffer>(0, positions->getDeviceBuffer());
        positionContractionKernels[copies].setArg<cl::Buffer>(1, contractedPositions->getDeviceBuffer());
        forceContractionKernels[copies].setArg<cl::Buffer>(0, forces->getDeviceBuffer());
        forceContractionKernels[copies].setArg<cl::Buffer>(1, contractedForces->getDeviceBuffer());
    }
190
191
}

192
193
void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
    OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities();
194
195
    if (!hasInitializedKernel)
        initializeKernels(context);
196
197
198
    
    // Loop over copies and compute the force on each one.
    
Peter Eastman's avatar
Peter Eastman committed
199
200
    if (!forcesAreValid)
        computeForces(context);
201
202
203
    
    // Apply the PILE-L thermostat.
    
204
    bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision());
205
    const double dt = integrator.getStepSize();
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
    pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
    pileKernel.setArg<cl::Buffer>(1, integration.getRandom().getDeviceBuffer()); // Do this *after* prepareRandomNumbers(), which might rebuild the array.
    if (useDoublePrecision) {
        pileKernel.setArg<cl_double>(3, dt);
        pileKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
        pileKernel.setArg<cl_double>(5, integrator.getFriction());
        stepKernel.setArg<cl_double>(3, dt);
        stepKernel.setArg<cl_double>(4, integrator.getTemperature()*BOLTZ);
        velocitiesKernel.setArg<cl_double>(2, dt);
    }
    else {
        pileKernel.setArg<cl_float>(3, (cl_float) dt);
        pileKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
        pileKernel.setArg<cl_float>(5, (cl_float) integrator.getFriction());
        stepKernel.setArg<cl_float>(3, (cl_float) dt);
        stepKernel.setArg<cl_float>(4, (cl_float) (integrator.getTemperature()*BOLTZ));
        velocitiesKernel.setArg<cl_float>(2, (cl_float) dt);
    }
224
225
226
227
228
229
230
231
    cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);

    // Update positions and velocities.
    
    cl.executeKernel(stepKernel, numParticles*numCopies, workgroupSize);

    // Calculate forces based on the updated positions.
    
Peter Eastman's avatar
Peter Eastman committed
232
    computeForces(context);
233
234
235
236
237
238
    
    // Update velocities.
    cl.executeKernel(velocitiesKernel, numParticles*numCopies, workgroupSize);

    // Apply the PILE-L thermostat again.

239
    pileKernel.setArg<cl_uint>(2, integration.prepareRandomNumbers(numParticles*numCopies));
240
241
242
243
244
245
    cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize);

    // Update the time and step count.

    cl.setTime(cl.getTime()+dt);
    cl.setStepCount(cl.getStepCount()+1);
246
247
248
249
250
251
252
253
    cl.reorderAtoms();
    if (cl.getAtomsWereReordered() && cl.getNonbondedUtilities().getUsePeriodic()) {
        // Atoms may have been translated into a different periodic box, so apply
        // the same translation to all the beads.

        translateKernel.setArg<cl_int>(3, numCopies-1);
        cl.executeKernel(translateKernel, cl.getNumAtoms());
    }
254
255
}

Peter Eastman's avatar
Peter Eastman committed
256
void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
257
258
259
260
261
    // Compute forces from all groups that didn't have a specified contraction.

    copyToContextKernel.setArg<cl::Buffer>(2, positions->getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(1, forces->getDeviceBuffer());
    copyFromContextKernel.setArg<cl::Buffer>(5, positions->getDeviceBuffer());
Peter Eastman's avatar
Peter Eastman committed
262
    for (int i = 0; i < numCopies; i++) {
263
264
        copyToContextKernel.setArg<cl_int>(5, i);
        cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
Peter Eastman's avatar
Peter Eastman committed
265
        context.computeVirtualSites();
266
        context.updateContextState();
267
        context.calcForcesAndEnergy(true, false, groupsNotContracted);
268
269
        copyFromContextKernel.setArg<cl_int>(7, i);
        cl.executeKernel(copyFromContextKernel, cl.getNumAtoms());
Peter Eastman's avatar
Peter Eastman committed
270
    }
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
    
    // Now loop over contractions and compute forces from them.
    
    if (groupsByCopies.size() > 0) {
        copyToContextKernel.setArg<cl::Buffer>(2, contractedPositions->getDeviceBuffer());
        copyFromContextKernel.setArg<cl::Buffer>(1, contractedForces->getDeviceBuffer());
        copyFromContextKernel.setArg<cl::Buffer>(5, contractedPositions->getDeviceBuffer());
        for (map<int, int>::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) {
            int copies = iter->first;
            int groupFlags = iter->second;

            // Find the contracted positions.

            cl.executeKernel(positionContractionKernels[copies], numParticles*numCopies, workgroupSize);

            // Compute forces.

            for (int i = 0; i < copies; i++) {
                copyToContextKernel.setArg<cl_int>(5, i);
                cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
                context.computeVirtualSites();
                context.calcForcesAndEnergy(true, false, groupFlags);
                copyFromContextKernel.setArg<cl_int>(7, i);
                cl.executeKernel(copyFromContextKernel, cl.getNumAtoms());
            }

            // Apply the forces to the original copies.

            cl.executeKernel(forceContractionKernels[copies], numParticles*numCopies, workgroupSize);
        }
    }
Peter Eastman's avatar
Peter Eastman committed
302
303
}

304
305
306
307
double OpenCLIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
    return cl.getIntegrationUtilities().computeKineticEnergy(0);
}

308
309
310
311
312
void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) {
    if (positions == NULL)
        throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
    if (pos.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
    if (cl.getUseDoublePrecision()) {
        vector<mm_double4> posq(cl.getPaddedNumAtoms());
        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; i++)
            posq[i] = mm_double4(pos[i][0], pos[i][1], pos[i][2], posq[i].w);
        cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
    }
    else if (cl.getUseMixedPrecision()) {
        vector<mm_float4> posqf(cl.getPaddedNumAtoms());
        cl.getPosq().download(posqf);
        vector<mm_double4> posq(cl.getPaddedNumAtoms());
        for (int i = 0; i < numParticles; i++)
            posq[i] = mm_double4(pos[i][0], pos[i][1], pos[i][2], posqf[i].w);
        cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]);
    }
    else {
        vector<mm_float4> posq(cl.getPaddedNumAtoms());
        cl.getPosq().download(posq);
        for (int i = 0; i < numParticles; i++)
            posq[i] = mm_float4((cl_float) pos[i][0], (cl_float) pos[i][1], (cl_float) pos[i][2], posq[i].w);
        cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &posq[0]);
    }
335
336
337
338
339
340
341
}

void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
    if (velocities == NULL)
        throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
    if (vel.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
342
343
344
345
346
347
348
349
350
351
352
353
354
355
    if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) {
        vector<mm_double4> velm(cl.getPaddedNumAtoms());
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; i++)
            velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
        cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &velm[0]);
    }
    else {
        vector<mm_float4> velm(cl.getPaddedNumAtoms());
        cl.getVelm().download(velm);
        for (int i = 0; i < numParticles; i++)
            velm[i] = mm_float4((cl_float) vel[i][0], (cl_float) vel[i][1], (cl_float) vel[i][2], velm[i].w);
        cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &velm[0]);
    }
356
357
}

358
void OpenCLIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
359
360
    if (!hasInitializedKernel)
        initializeKernels(context);
361
    copyToContextKernel.setArg<cl::Buffer>(2, positions->getDeviceBuffer());
362
363
    copyToContextKernel.setArg<cl_int>(5, copy);
    cl.executeKernel(copyToContextKernel, cl.getNumAtoms());
364
365
366
367
368
369
370
371
372
}

string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
    stringstream source;
    int unfactored = size;
    int stage = 0;
    int L = size;
    int m = 1;
    string sign = (forward ? "1.0f" : "-1.0f");
Peter Eastman's avatar
Peter Eastman committed
373
374
    string multReal = (forward ? "multiplyComplexRealPart" : "multiplyComplexRealPartConj");
    string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");
375
376

    source<<"{\n";
377
378
379
380
    source<<"__local mixed4* real0 = "<<variable<<"real;\n";
    source<<"__local mixed4* imag0 = "<<variable<<"imag;\n";
    source<<"__local mixed4* real1 = &temp[blockStart];\n";
    source<<"__local mixed4* imag1 = &temp[blockStart+get_local_size(0)];\n";
381
382
383
384
385
386
387
388
389
390
391
392
393

    // Factor size, generating an appropriate block of code for each factor.

    while (unfactored > 1) {
        int input = stage%2;
        int output = 1-input;
        source<<"{\n";
        if (unfactored%5 == 0) {
            L = L/5;
            source<<"// Pass "<<(stage+1)<<" (radix 5)\n";
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c4r = real"<<input<<"[i+"<<(4*L*m)<<"];\n";
            source<<"mixed4 c4i = imag"<<input<<"[i+"<<(4*L*m)<<"];\n";
            source<<"mixed4 d0r = c1r+c4r;\n";
            source<<"mixed4 d0i = c1i+c4i;\n";
            source<<"mixed4 d1r = c2r+c3r;\n";
            source<<"mixed4 d1i = c2i+c3i;\n";
            source<<"mixed4 d2r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1r-c4r);\n";
            source<<"mixed4 d2i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c1i-c4i);\n";
            source<<"mixed4 d3r = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2r-c3r);\n";
            source<<"mixed4 d3i = "<<cl.doubleToString(sin(0.4*M_PI))<<"*(c2i-c3i);\n";
            source<<"mixed4 d4r = d0r+d1r;\n";
            source<<"mixed4 d4i = d0i+d1i;\n";
            source<<"mixed4 d5r = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0r-d1r);\n";
            source<<"mixed4 d5i = "<<cl.doubleToString(0.25*sqrt(5.0))<<"*(d0i-d1i);\n";
            source<<"mixed4 d6r = c0r-0.25f*d4r;\n";
            source<<"mixed4 d6i = c0i-0.25f*d4i;\n";
            source<<"mixed4 d7r = d6r+d5r;\n";
            source<<"mixed4 d7i = d6i+d5i;\n";
            source<<"mixed4 d8r = d6r-d5r;\n";
            source<<"mixed4 d8i = d6i-d5i;\n";
422
            string coeff = cl.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
423
424
425
426
            source<<"mixed4 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
            source<<"mixed4 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
            source<<"mixed4 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
            source<<"mixed4 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
427
428
            source<<"real"<<output<<"[i+4*j*"<<m<<"] = c0r+d4r;\n";
            source<<"imag"<<output<<"[i+4*j*"<<m<<"] = c0i+d4i;\n";
Peter Eastman's avatar
Peter Eastman committed
429
430
431
432
433
434
435
436
            source<<"real"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
            source<<"imag"<<output<<"[i+(4*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(5*L)<<"], d7r+d9r, d7i+d9i);\n";
            source<<"real"<<output<<"[i+(4*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(5*L)<<"], d8r+d10r, d8i+d10i);\n";
            source<<"imag"<<output<<"[i+(4*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(5*L)<<"], d8r+d10r, d8i+d10i);\n";
            source<<"real"<<output<<"[i+(4*j+3)*"<<m<<"] = "<<multReal<<"(w[j*"<<(3*size)<<"/"<<(5*L)<<"], d8r-d10r, d8i-d10i);\n";
            source<<"imag"<<output<<"[i+(4*j+3)*"<<m<<"] = "<<multImag<<"(w[j*"<<(3*size)<<"/"<<(5*L)<<"], d8r-d10r, d8i-d10i);\n";
            source<<"real"<<output<<"[i+(4*j+4)*"<<m<<"] = "<<multReal<<"(w[j*"<<(4*size)<<"/"<<(5*L)<<"], d7r-d9r, d7i-d9i);\n";
            source<<"imag"<<output<<"[i+(4*j+4)*"<<m<<"] = "<<multImag<<"(w[j*"<<(4*size)<<"/"<<(5*L)<<"], d7r-d9r, d7i-d9i);\n";
437
438
439
440
441
442
443
444
445
446
            source<<"}\n";
            m = m*5;
            unfactored /= 5;
        }
        else if (unfactored%4 == 0) {
            L = L/4;
            source<<"// Pass "<<(stage+1)<<" (radix 4)\n";
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c3r = real"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 c3i = imag"<<input<<"[i+"<<(3*L*m)<<"];\n";
            source<<"mixed4 d0r = c0r+c2r;\n";
            source<<"mixed4 d0i = c0i+c2i;\n";
            source<<"mixed4 d1r = c0r-c2r;\n";
            source<<"mixed4 d1i = c0i-c2i;\n";
            source<<"mixed4 d2r = c1r+c3r;\n";
            source<<"mixed4 d2i = c1i+c3i;\n";
            source<<"mixed4 d3r = "<<sign<<"*(c1i-c3i);\n";
            source<<"mixed4 d3i = "<<sign<<"*(c3r-c1r);\n";
463
464
            source<<"real"<<output<<"[i+3*j*"<<m<<"] = d0r+d2r;\n";
            source<<"imag"<<output<<"[i+3*j*"<<m<<"] = d0i+d2i;\n";
Peter Eastman's avatar
Peter Eastman committed
465
466
467
468
469
470
            source<<"real"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
            source<<"imag"<<output<<"[i+(3*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(4*L)<<"], d1r+d3r, d1i+d3i);\n";
            source<<"real"<<output<<"[i+(3*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(4*L)<<"], d0r-d2r, d0i-d2i);\n";
            source<<"imag"<<output<<"[i+(3*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(4*L)<<"], d0r-d2r, d0i-d2i);\n";
            source<<"real"<<output<<"[i+(3*j+3)*"<<m<<"] = "<<multReal<<"(w[j*"<<(3*size)<<"/"<<(4*L)<<"], d1r-d3r, d1i-d3i);\n";
            source<<"imag"<<output<<"[i+(3*j+3)*"<<m<<"] = "<<multImag<<"(w[j*"<<(3*size)<<"/"<<(4*L)<<"], d1r-d3r, d1i-d3i);\n";
471
472
473
474
475
476
477
478
479
480
            source<<"}\n";
            m = m*4;
            unfactored /= 4;
        }
        else if (unfactored%3 == 0) {
            L = L/3;
            source<<"// Pass "<<(stage+1)<<" (radix 3)\n";
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
481
482
483
484
485
486
487
488
489
490
491
492
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c2r = real"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 c2i = imag"<<input<<"[i+"<<(2*L*m)<<"];\n";
            source<<"mixed4 d0r = c1r+c2r;\n";
            source<<"mixed4 d0i = c1i+c2i;\n";
            source<<"mixed4 d1r = c0r-0.5f*d0r;\n";
            source<<"mixed4 d1i = c0i-0.5f*d0i;\n";
            source<<"mixed4 d2r = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c1i-c2i);\n";
            source<<"mixed4 d2i = "<<sign<<"*"<<cl.doubleToString(sin(M_PI/3.0))<<"*(c2r-c1r);\n";
493
494
            source<<"real"<<output<<"[i+2*j*"<<m<<"] = c0r+d0r;\n";
            source<<"imag"<<output<<"[i+2*j*"<<m<<"] = c0i+d0i;\n";
Peter Eastman's avatar
Peter Eastman committed
495
496
497
498
            source<<"real"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
            source<<"imag"<<output<<"[i+(2*j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(3*L)<<"], d1r+d2r, d1i+d2i);\n";
            source<<"real"<<output<<"[i+(2*j+2)*"<<m<<"] = "<<multReal<<"(w[j*"<<(2*size)<<"/"<<(3*L)<<"], d1r-d2r, d1i-d2i);\n";
            source<<"imag"<<output<<"[i+(2*j+2)*"<<m<<"] = "<<multImag<<"(w[j*"<<(2*size)<<"/"<<(3*L)<<"], d1r-d2r, d1i-d2i);\n";
499
500
501
502
503
504
505
506
507
508
            source<<"}\n";
            m = m*3;
            unfactored /= 3;
        }
        else if (unfactored%2 == 0) {
            L = L/2;
            source<<"// Pass "<<(stage+1)<<" (radix 2)\n";
            source<<"if (indexInBlock < "<<(L*m)<<") {\n";
            source<<"int i = indexInBlock;\n";
            source<<"int j = i/"<<m<<";\n";
509
510
511
512
            source<<"mixed4 c0r = real"<<input<<"[i];\n";
            source<<"mixed4 c0i = imag"<<input<<"[i];\n";
            source<<"mixed4 c1r = real"<<input<<"[i+"<<(L*m)<<"];\n";
            source<<"mixed4 c1i = imag"<<input<<"[i+"<<(L*m)<<"];\n";
513
514
            source<<"real"<<output<<"[i+j*"<<m<<"] = c0r+c1r;\n";
            source<<"imag"<<output<<"[i+j*"<<m<<"] = c0i+c1i;\n";
Peter Eastman's avatar
Peter Eastman committed
515
516
            source<<"real"<<output<<"[i+(j+1)*"<<m<<"] = "<<multReal<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
            source<<"imag"<<output<<"[i+(j+1)*"<<m<<"] = "<<multImag<<"(w[j*"<<size<<"/"<<(2*L)<<"], c0r-c1r, c0i-c1i);\n";
517
518
519
520
521
            source<<"}\n";
            m = m*2;
            unfactored /= 2;
        }
        else
522
            throw OpenMMException("Illegal size for FFT: "+cl.intToString(size));
523
524
525
526
527
528
529
530
        source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
        source<<"}\n";
        ++stage;
    }

    // Create the kernel.

    if (stage%2 == 1) {
Peter Eastman's avatar
Peter Eastman committed
531
532
        source<<"real0[indexInBlock] = real1[indexInBlock];\n";
        source<<"imag0[indexInBlock] = imag1[indexInBlock];\n";
Peter Eastman's avatar
Bug fix  
Peter Eastman committed
533
        source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
534
535
536
537
    }
    source<<"}\n";
    return source.str();
}