CommonRpmdKernels.cpp 28.1 KB
Newer Older
1
2
3
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit.                   *
 * See https://openmm.org/development.                                        *
 *                                                                            *
 * Portions copyright (c) 2011-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

30
31
#include "CommonRpmdKernels.h"
#include "CommonRpmdKernelSources.h"
32
#include "openmm/internal/ContextImpl.h"
33
#include "openmm/common/ContextSelector.h"
34
35
36
#include "openmm/common/IntegrationUtilities.h"
#include "openmm/common/ExpressionUtilities.h"
#include "openmm/common/NonbondedUtilities.h"
37
#include "SimTKOpenMMRealType.h"
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

using namespace OpenMM;
using namespace std;


/**
 * Find the smallest FFT size that is at least as large as the requested minimum
 * and whose only prime factors are 2, 3, and 5.  These are the only factors for
 * which createFFT() can generate a pass (radix 2, 3, 4, and 5), so a size that
 * contains any other prime factor (for example 7) must not be reported as usable.
 *
 * @param minimum    the smallest acceptable size
 * @return the smallest value >= minimum of the form 2^a * 3^b * 5^c, or 1 if
 *         minimum is less than 1
 */
static int findFFTDimension(int minimum) {
    if (minimum < 1)
        return 1;
    const int supportedFactors[] = {2, 3, 5};
    while (true) {
        // Divide out every supported prime factor.  The candidate is usable only
        // if nothing other than 1 is left over.

        int unfactored = minimum;
        for (int factor : supportedFactors) {
            while (unfactored%factor == 0)
                unfactored /= factor;
        }
        if (unfactored == 1)
            return minimum;
        minimum++;
    }
}

63
64
void CommonIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
    cc.initializeContexts();
65
    ContextSelector selector(cc);
66
67
68
69
70
    numCopies = integrator.getNumCopies();
    numParticles = system.getNumParticles();
    workgroupSize = numCopies;
    if (numCopies != findFFTDimension(numCopies))
        throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
71
72
73
74
75
76
77
    int paddedParticles = cc.getPaddedNumAtoms();
    bool useDoublePrecision = (cc.getUseDoublePrecision() || cc.getUseMixedPrecision());
    int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4));
    forces.initialize<long long>(cc, numCopies*paddedParticles*3, "rpmdForces");
    positions.initialize(cc, numCopies*paddedParticles, elementSize, "rpmdPositions");
    velocities.initialize(cc, numCopies*paddedParticles, elementSize, "rpmdVelocities");
    cc.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
78
79
80
    
    // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
    
81
    if (useDoublePrecision) {
82
        vector<mm_double4> temp(positions.getSize());
83
        for (int i = 0; i < positions.getSize(); i++)
84
            temp[i] = mm_double4(0, 0, 0, 0);
85
86
        positions.upload(temp);
        for (int i = 0; i < velocities.getSize(); i++)
87
            temp[i] = mm_double4(0, 0, 0, 1);
88
        velocities.upload(temp);
89
90
    }
    else {
91
        vector<mm_float4> temp(positions.getSize());
92
        for (int i = 0; i < positions.getSize(); i++)
93
            temp[i] = mm_float4(0, 0, 0, 0);
94
95
        positions.upload(temp);
        for (int i = 0; i < velocities.getSize(); i++)
96
            temp[i] = mm_float4(0, 0, 0, 1);
97
        velocities.upload(temp);
98
    }
99
100
101
102
103
104
    
    // Build a list of contractions.
    
    groupsNotContracted = -1;
    const map<int, int>& contractions = integrator.getContractions();
    int maxContractedCopies = 0;
peastman's avatar
peastman committed
105
106
107
    for (auto& c : contractions) {
        int group = c.first;
        int copies = c.second;
108
109
110
111
        if (group < 0 || group > 31)
            throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31");
        if (copies < 0 || copies > numCopies)
            throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated");
112
113
        if (copies != findFFTDimension(copies))
            throw OpenMMException("RPMDIntegrator: Number of copies for contraction must be a multiple of powers of 2, 3, and 5.");
114
115
116
117
118
119
120
121
        if (copies != numCopies) {
            if (groupsByCopies.find(copies) == groupsByCopies.end()) {
                groupsByCopies[copies] = 1<<group;
                if (copies > maxContractedCopies)
                    maxContractedCopies = copies;
            }
            else
                groupsByCopies[copies] |= 1<<group;
peastman's avatar
peastman committed
122
            groupsNotContracted -= 1<<group;
123
124
        }
    }
125
    groupsNotContracted &= integrator.getIntegrationForceGroups();
126
    if (maxContractedCopies > 0) {
127
128
        contractedForces.initialize<long long>(cc, maxContractedCopies*paddedParticles*3, "rpmdContractedForces");
        contractedPositions.initialize(cc, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
129
    }
130
131
132
133

    // Create kernels.
    
    map<string, string> defines;
134
135
136
137
    defines["NUM_ATOMS"] = cc.intToString(cc.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cc.intToString(cc.getPaddedNumAtoms());
    defines["NUM_COPIES"] = cc.intToString(numCopies);
    defines["THREAD_BLOCK_SIZE"] = cc.intToString(workgroupSize);
138
139
140
    defines["HBAR"] = cc.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12), true);
    defines["SCALE"] = cc.doubleToString(1.0/sqrt((double) numCopies), true);
    defines["M_PI"] = cc.doubleToString(M_PI, true);
141
142
143
144
145
    map<string, string> replacements;
    replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
    replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false);
    replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true);
    replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false);
146
147
148
149
150
151
152
    ComputeProgram program = cc.compileProgram(cc.replaceStrings(CommonRpmdKernelSources::rpmd, replacements), defines);
    pileKernel = program->createKernel("applyPileThermostat");
    stepKernel = program->createKernel("integrateStep");
    velocitiesKernel = program->createKernel("advanceVelocities");
    copyToContextKernel = program->createKernel("copyDataToContext");
    copyFromContextKernel = program->createKernel("copyDataFromContext");
    translateKernel = program->createKernel("applyCellTranslations");
153
154
155
    
    // Create kernels for doing contractions.
    
peastman's avatar
peastman committed
156
157
    for (auto& g : groupsByCopies) {
        int copies = g.first;
158
        replacements.clear();
159
        replacements["NUM_CONTRACTED_COPIES"] = cc.intToString(copies);
160
161
        replacements["POS_SCALE"] = cc.doubleToString(1.0/numCopies, true);
        replacements["FORCE_SCALE"] = cc.doubleToString(0x100000000/(double) copies, true);
162
163
164
165
        replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
        replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
        replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
        replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
166
167
168
        program = cc.compileProgram(cc.replaceStrings(CommonRpmdKernelSources::rpmdContraction, replacements), defines);
        positionContractionKernels[copies] = program->createKernel("contractPositions");
        forceContractionKernels[copies] = program->createKernel("contractForces");
169
    }
170
171
}

172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
void CommonIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) {
    hasInitializedKernels = true;

    // Arrays owned by the context that are bound to more than one kernel.

    auto& contextPosq = cc.getPosq();
    auto& contextVelm = cc.getVelm();
    auto& atomIndex = cc.getAtomIndexArray();

    // Arguments added with an empty addArg() are placeholders whose values are
    // supplied later through setArg().

    // applyPileThermostat: args 2-5 are set in execute().

    pileKernel->addArg(velocities);
    pileKernel->addArg(cc.getIntegrationUtilities().getRandom());
    for (int i = 0; i < 4; i++)
        pileKernel->addArg();

    // integrateStep: args 3 and 4 are set in execute().

    stepKernel->addArg(positions);
    stepKernel->addArg(velocities);
    stepKernel->addArg(forces);
    stepKernel->addArg();
    stepKernel->addArg();

    // advanceVelocities: arg 2 is set in execute().

    velocitiesKernel->addArg(velocities);
    velocitiesKernel->addArg(forces);
    velocitiesKernel->addArg();

    // applyCellTranslations: arg 3 is set in execute().

    translateKernel->addArg(positions);
    translateKernel->addArg(contextPosq);
    translateKernel->addArg(atomIndex);
    translateKernel->addArg();

    // copyDataToContext: arg 2 (source positions) and arg 5 (copy index) are set before each launch.

    copyToContextKernel->addArg(velocities);
    copyToContextKernel->addArg(contextVelm);
    copyToContextKernel->addArg();
    copyToContextKernel->addArg(contextPosq);
    copyToContextKernel->addArg(atomIndex);
    copyToContextKernel->addArg();

    // copyDataFromContext: arg 1 (destination forces), arg 5 (destination positions),
    // and arg 7 (copy index) are set before each launch.

    copyFromContextKernel->addArg(cc.getLongForceBuffer());
    copyFromContextKernel->addArg();
    copyFromContextKernel->addArg(contextVelm);
    copyFromContextKernel->addArg(velocities);
    copyFromContextKernel->addArg(contextPosq);
    copyFromContextKernel->addArg();
    copyFromContextKernel->addArg(atomIndex);
    copyFromContextKernel->addArg();

    // Contraction kernels, one pair for each contracted copy count.

    for (auto& entry : groupsByCopies) {
        const int copies = entry.first;
        auto& contractPositions = positionContractionKernels[copies];
        contractPositions->addArg(positions);
        contractPositions->addArg(contractedPositions);
        auto& contractForces = forceContractionKernels[copies];
        contractForces->addArg(forces);
        contractForces->addArg(contractedForces);
    }
}

void CommonIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
    ContextSelector selector(cc);
    if (!hasInitializedKernels)
        initializeKernels(context);
    IntegrationUtilities& integration = cc.getIntegrationUtilities();
    const int numThreads = numParticles*numCopies;
    const bool applyThermostat = integrator.getApplyThermostat();

    // Make sure the forces on every copy are current before starting the step.

    if (!forcesAreValid)
        computeForces(context);

    // Set the step parameters on the kernels, using the precision the arrays were created with.

    const bool useDoublePrecision = (cc.getUseDoublePrecision() || cc.getUseMixedPrecision());
    const double dt = integrator.getStepSize();
    const double kT = integrator.getTemperature()*BOLTZ;
    const double friction = integrator.getFriction();
    pileKernel->setArg(2, integration.prepareRandomNumbers(numThreads));
    if (useDoublePrecision) {
        pileKernel->setArg(3, dt);
        pileKernel->setArg(4, kT);
        pileKernel->setArg(5, friction);
        stepKernel->setArg(3, dt);
        stepKernel->setArg(4, kT);
        velocitiesKernel->setArg(2, dt);
    }
    else {
        const float dtSingle = (float) dt;
        const float kTSingle = (float) kT;
        pileKernel->setArg(3, dtSingle);
        pileKernel->setArg(4, kTSingle);
        pileKernel->setArg(5, (float) friction);
        stepKernel->setArg(3, dtSingle);
        stepKernel->setArg(4, kTSingle);
        velocitiesKernel->setArg(2, dtSingle);
    }

    // First application of the PILE-L thermostat.

    if (applyThermostat)
        pileKernel->execute(numThreads, workgroupSize);

    // Advance the positions and velocities.

    stepKernel->execute(numThreads, workgroupSize);

    // Recompute the forces at the new positions, then finish the velocity update.

    computeForces(context);
    velocitiesKernel->execute(numThreads, workgroupSize);

    // Second application of the PILE-L thermostat, with a fresh set of random numbers.

    if (applyThermostat) {
        pileKernel->setArg(2, integration.prepareRandomNumbers(numThreads));
        pileKernel->execute(numThreads, workgroupSize);
    }

    // Advance the time and step count.

    cc.setTime(cc.getTime()+dt);
    cc.setStepCount(cc.getStepCount()+1);
    cc.reorderAtoms();
    const bool needsTranslation = (cc.getAtomsWereReordered() && cc.getNonbondedUtilities().getUsePeriodic());
    if (needsTranslation) {
        // Reordering may have translated atoms into a different periodic box, so
        // apply the same translation to all the beads.

        translateKernel->setArg(3, numCopies-1);
        translateKernel->execute(cc.getNumAtoms());
    }
}

283
void CommonIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
    const int numAtoms = cc.getNumAtoms();

    // First handle the force groups that have no contraction: evaluate them on every copy.

    copyToContextKernel->setArg(2, positions);
    copyFromContextKernel->setArg(1, forces);
    copyFromContextKernel->setArg(5, positions);
    for (int copy = 0; copy < numCopies; copy++) {
        copyToContextKernel->setArg(5, copy);
        copyToContextKernel->execute(numAtoms);
        context.computeVirtualSites();

        // Updating the context state must not change the periodic box.

        Vec3 boxBefore[3], boxAfter[3];
        context.getPeriodicBoxVectors(boxBefore[0], boxBefore[1], boxBefore[2]);
        context.updateContextState();
        context.getPeriodicBoxVectors(boxAfter[0], boxAfter[1], boxAfter[2]);
        for (int axis = 0; axis < 3; axis++)
            if (boxBefore[axis] != boxAfter[axis])
                throw OpenMMException("Standard barostats cannot be used with RPMDIntegrator.  Use RPMDMonteCarloBarostat instead.");
        {
            // The context is deselected for the duration of the force calculation.

            ContextDeselector deselector(cc);
            context.calcForcesAndEnergy(true, false, groupsNotContracted);
        }
        copyFromContextKernel->setArg(7, copy);
        copyFromContextKernel->execute(numAtoms);
    }

    // Everything below deals with contractions, so stop here if there are none.

    if (groupsByCopies.empty())
        return;

    // Process each contraction, using the contracted buffers in place of the full ones.

    copyToContextKernel->setArg(2, contractedPositions);
    copyFromContextKernel->setArg(1, contractedForces);
    copyFromContextKernel->setArg(5, contractedPositions);
    for (auto& entry : groupsByCopies) {
        const int copies = entry.first;
        const int groupFlags = entry.second;

        // Compute the contracted positions.

        positionContractionKernels[copies]->execute(numParticles*numCopies, workgroupSize);

        // Evaluate this contraction's force groups on each contracted copy.

        for (int copy = 0; copy < copies; copy++) {
            copyToContextKernel->setArg(5, copy);
            copyToContextKernel->execute(numAtoms);
            context.computeVirtualSites();
            {
                ContextDeselector deselector(cc);
                context.calcForcesAndEnergy(true, false, groupFlags);
            }
            copyFromContextKernel->setArg(7, copy);
            copyFromContextKernel->execute(numAtoms);
        }

        // Apply the contracted forces to the original copies.

        forceContractionKernels[copies]->execute(numParticles*numCopies, workgroupSize);
    }

    // Later code assumes the Context holds the positions of the last copy, so put them back.

    copyToContextKernel->setArg(2, positions);
    copyToContextKernel->setArg(5, numCopies-1);
    copyToContextKernel->execute(numAtoms);
}

350
351
double CommonIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
    // Delegate to the shared integration utilities, passing a time shift of 0.
    // Neither the context nor the integrator argument is consulted here.

    IntegrationUtilities& integration = cc.getIntegrationUtilities();
    return integration.computeKineticEnergy(0);
}

354
void CommonIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) {
355
    if (!positions.isInitialized())
356
357
358
        throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
    if (pos.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
359
360
361

    // Adjust the positions based on the current cell offsets.
    
362
363
364
    const vector<int>& order = cc.getAtomIndex();
    Vec3 a, b, c;
    cc.getPeriodicBoxVectors(a, b, c);
365
366
    vector<Vec3> offsetPos(numParticles);
    for (int i = 0; i < numParticles; ++i) {
367
368
        mm_int4 offset = cc.getPosCellOffsets()[i];
        offsetPos[order[i]] = pos[order[i]] + Vec3(offset.x*a[0], offset.y*b[1], offset.z*c[2]);
369
370
371
372
    }

    // Record the positions.

373
    ContextSelector selector(cc);
374
375
376
    if (cc.getUseDoublePrecision()) {
        vector<mm_double4> posq(cc.getPaddedNumAtoms());
        cc.getPosq().download(posq);
377
        for (int i = 0; i < numParticles; i++)
378
379
            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posq[i].w);
        positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
380
    }
381
382
383
384
    else if (cc.getUseMixedPrecision()) {
        vector<mm_float4> posqf(cc.getPaddedNumAtoms());
        cc.getPosq().download(posqf);
        vector<mm_double4> posq(cc.getPaddedNumAtoms());
385
        for (int i = 0; i < numParticles; i++)
386
387
            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posqf[i].w);
        positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
388
389
    }
    else {
390
391
        vector<mm_float4> posq(cc.getPaddedNumAtoms());
        cc.getPosq().download(posq);
392
        for (int i = 0; i < numParticles; i++)
393
394
            posq[i] = mm_float4((float) offsetPos[i][0], (float) offsetPos[i][1], (float) offsetPos[i][2], posq[i].w);
        positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
395
396
397
    }
}

398
void CommonIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
399
    if (!velocities.isInitialized())
400
401
402
        throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
    if (vel.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
403
    ContextSelector selector(cc);
404
405
406
    if (cc.getUseDoublePrecision() || cc.getUseMixedPrecision()) {
        vector<mm_double4> velm(cc.getPaddedNumAtoms());
        cc.getVelm().download(velm);
407
        for (int i = 0; i < numParticles; i++)
408
409
            velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
        velocities.uploadSubArray(&velm[0], copy*cc.getPaddedNumAtoms(), numParticles);
410
411
    }
    else {
412
413
        vector<mm_float4> velm(cc.getPaddedNumAtoms());
        cc.getVelm().download(velm);
414
        for (int i = 0; i < numParticles; i++)
415
416
            velm[i] = mm_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w);
        velocities.uploadSubArray(&velm[0], copy*cc.getPaddedNumAtoms(), numParticles);
417
418
419
    }
}

420
void CommonIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
421
    ContextSelector selector(cc);
422
423
424
425
426
    if (!hasInitializedKernels)
        initializeKernels(context);
    copyToContextKernel->setArg(2, positions);
    copyToContextKernel->setArg(5, copy);
    copyToContextKernel->execute(cc.getNumAtoms());
427
428
}

429
string CommonIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
    // Generates the source code for an FFT of the given size.  The data is read from and
    // written back to the arrays named <variable>real and <variable>imag, and the "temp"
    // array is used as a second buffer.  The size is factored into passes of radix 5, 4,
    // 3, and 2; an OpenMMException is thrown if it cannot be factored that way.  The
    // "forward" flag selects the sign and the twiddle multiplication functions.

    stringstream source;
    const string sign = (forward ? "1.0f" : "-1.0f");
    const string multReal = (forward ? "multiplyComplexRealPart" : "multiplyComplexRealPartConj");
    const string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");
    int stage = 0;
    int L = size;
    int m = 1;
    int input = 0;
    int output = 1;
    int radix = 0;

    // Emit the declarations of c0r, c0i, ..., one pair for each input of the current pass.

    auto loadInputs = [&]() {
        for (int k = 0; k < radix; k++) {
            const char* parts[] = {"real", "imag"};
            const char* suffixes[] = {"r", "i"};
            for (int part = 0; part < 2; part++) {
                source<<"mixed3 c"<<k<<suffixes[part]<<" = "<<parts[part]<<input<<"[i";
                if (k > 0)
                    source<<"+"<<(k*L*m);
                source<<"];\n";
            }
        }
    };

    // Emit the index expression for output k of the current pass.

    auto writeOutputIndex = [&](int k) {
        source<<"[i+";
        if (k > 0)
            source<<"(";
        if (radix > 2)
            source<<(radix-1)<<"*";
        source<<"j";
        if (k > 0)
            source<<"+"<<k<<")";
        source<<"*"<<m<<"]";
    };

    // Emit the stores for output k of the current pass.  Output 0 is stored directly;
    // every other output is first multiplied by a twiddle factor from the w array.

    auto storeOutput = [&](int k, const string& re, const string& im) {
        for (int part = 0; part < 2; part++) {
            source<<(part == 0 ? "real" : "imag")<<output;
            writeOutputIndex(k);
            source<<" = ";
            if (k == 0)
                source<<(part == 0 ? re : im)<<";\n";
            else
                source<<(part == 0 ? multReal : multImag)<<"(w[j*"<<(k*size)<<"/"<<(radix*L)<<"], "<<re<<", "<<im<<");\n";
        }
    };

    source<<"{\n"
          <<"LOCAL_ARG mixed3* real0 = "<<variable<<"real;\n"
          <<"LOCAL_ARG mixed3* imag0 = "<<variable<<"imag;\n"
          <<"LOCAL_ARG mixed3* real1 = &temp[blockStart];\n"
          <<"LOCAL_ARG mixed3* imag1 = &temp[blockStart+LOCAL_SIZE];\n";

    // Factor size, generating an appropriate block of code for each factor.  The two
    // buffers swap roles as input and output from one pass to the next.

    while (L > 1) {
        input = stage%2;
        output = 1-input;

        // Pick the radix for this pass, preferring 5, then 4, then 3, then 2.

        const int candidates[] = {5, 4, 3, 2};
        radix = 0;
        for (int candidate : candidates) {
            if (L%candidate == 0) {
                radix = candidate;
                break;
            }
        }
        if (radix == 0)
            throw OpenMMException("Illegal size for FFT: "+cc.intToString(size));
        L = L/radix;
        source<<"{\n"
              <<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n"
              <<"if (indexInBlock < "<<(L*m)<<") {\n"
              <<"int i = indexInBlock;\n"
              <<"int j = i/"<<m<<";\n";
        loadInputs();
        switch (radix) {
            case 5: {
                const string sin72 = cc.doubleToString(sin(0.4*M_PI), true);
                const string quarterRoot5 = cc.doubleToString(0.25*sqrt(5.0), true);
                const string coeff = cc.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI), true);
                source<<"mixed3 d0r = c1r+c4r;\n"
                      <<"mixed3 d0i = c1i+c4i;\n"
                      <<"mixed3 d1r = c2r+c3r;\n"
                      <<"mixed3 d1i = c2i+c3i;\n";
                source<<"mixed3 d2r = "<<sin72<<"*(c1r-c4r);\n"
                      <<"mixed3 d2i = "<<sin72<<"*(c1i-c4i);\n"
                      <<"mixed3 d3r = "<<sin72<<"*(c2r-c3r);\n"
                      <<"mixed3 d3i = "<<sin72<<"*(c2i-c3i);\n";
                source<<"mixed3 d4r = d0r+d1r;\n"
                      <<"mixed3 d4i = d0i+d1i;\n";
                source<<"mixed3 d5r = "<<quarterRoot5<<"*(d0r-d1r);\n"
                      <<"mixed3 d5i = "<<quarterRoot5<<"*(d0i-d1i);\n";
                source<<"mixed3 d6r = c0r-0.25f*d4r;\n"
                      <<"mixed3 d6i = c0i-0.25f*d4i;\n"
                      <<"mixed3 d7r = d6r+d5r;\n"
                      <<"mixed3 d7i = d6i+d5i;\n"
                      <<"mixed3 d8r = d6r-d5r;\n"
                      <<"mixed3 d8i = d6i-d5i;\n";
                source<<"mixed3 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n"
                      <<"mixed3 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n"
                      <<"mixed3 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n"
                      <<"mixed3 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
                storeOutput(0, "c0r+d4r", "c0i+d4i");
                storeOutput(1, "d7r+d9r", "d7i+d9i");
                storeOutput(2, "d8r+d10r", "d8i+d10i");
                storeOutput(3, "d8r-d10r", "d8i-d10i");
                storeOutput(4, "d7r-d9r", "d7i-d9i");
                break;
            }
            case 4: {
                source<<"mixed3 d0r = c0r+c2r;\n"
                      <<"mixed3 d0i = c0i+c2i;\n"
                      <<"mixed3 d1r = c0r-c2r;\n"
                      <<"mixed3 d1i = c0i-c2i;\n"
                      <<"mixed3 d2r = c1r+c3r;\n"
                      <<"mixed3 d2i = c1i+c3i;\n";
                source<<"mixed3 d3r = "<<sign<<"*(c1i-c3i);\n"
                      <<"mixed3 d3i = "<<sign<<"*(c3r-c1r);\n";
                storeOutput(0, "d0r+d2r", "d0i+d2i");
                storeOutput(1, "d1r+d3r", "d1i+d3i");
                storeOutput(2, "d0r-d2r", "d0i-d2i");
                storeOutput(3, "d1r-d3r", "d1i-d3i");
                break;
            }
            case 3: {
                const string sin60 = cc.doubleToString(sin(M_PI/3.0), true);
                source<<"mixed3 d0r = c1r+c2r;\n"
                      <<"mixed3 d0i = c1i+c2i;\n"
                      <<"mixed3 d1r = c0r-0.5f*d0r;\n"
                      <<"mixed3 d1i = c0i-0.5f*d0i;\n";
                source<<"mixed3 d2r = "<<sign<<"*"<<sin60<<"*(c1i-c2i);\n"
                      <<"mixed3 d2i = "<<sign<<"*"<<sin60<<"*(c2r-c1r);\n";
                storeOutput(0, "c0r+d0r", "c0i+d0i");
                storeOutput(1, "d1r+d2r", "d1i+d2i");
                storeOutput(2, "d1r-d2r", "d1i-d2i");
                break;
            }
            default: {
                // Radix 2.

                storeOutput(0, "c0r+c1r", "c0i+c1i");
                storeOutput(1, "c0r-c1r", "c0i-c1i");
                break;
            }
        }
        source<<"}\n";
        m = m*radix;
        source<<"SYNC_THREADS;\n"
              <<"}\n";
        ++stage;
    }

    // After an odd number of passes the result is in the second buffer (real1/imag1),
    // so copy it back to the first one.

    if (stage%2 == 1) {
        source<<"real0[indexInBlock] = real1[indexInBlock];\n"
              <<"imag0[indexInBlock] = imag1[indexInBlock];\n"
              <<"SYNC_THREADS;\n";
    }
    source<<"}\n";
    return source.str();
}