/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2011-2021 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

32
33
#include "CommonRpmdKernels.h"
#include "CommonRpmdKernelSources.h"
34
#include "openmm/internal/ContextImpl.h"
35
#include "openmm/common/ContextSelector.h"
36
37
38
#include "openmm/common/IntegrationUtilities.h"
#include "openmm/common/ExpressionUtilities.h"
#include "openmm/common/NonbondedUtilities.h"
39
#include "SimTKOpenMMRealType.h"
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64

using namespace OpenMM;
using namespace std;


/**
 * Find the smallest FFT size that is at least as large as the requested one and
 * whose only prime factors are 2, 3, and 5.
 *
 * createFFT() in this file only generates radix 2, 3, 4, and 5 passes, so a size
 * containing any other prime factor (for example 7) cannot be transformed.  Such
 * sizes must therefore not be reported as valid here; otherwise the callers'
 * "multiple of powers of 2, 3, and 5" checks would let them through and the
 * failure would only surface later as "Illegal size for FFT".
 *
 * @param minimum   the smallest acceptable size
 * @return the smallest value >= minimum that factors completely into 2, 3, and 5,
 *         or 1 if minimum is less than 1
 */
static int findFFTDimension(int minimum) {
    if (minimum < 1)
        return 1;
    static const int supportedFactors[] = {2, 3, 5};
    for (int candidate = minimum; ; candidate++) {
        // Divide out every supported prime factor.  If nothing is left over,
        // the candidate can be handled by the generated FFT code.

        int unfactored = candidate;
        for (int factor : supportedFactors) {
            while (unfactored%factor == 0)
                unfactored /= factor;
        }
        if (unfactored == 1)
            return candidate;
    }
}

65
66
void CommonIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) {
    cc.initializeContexts();
67
    ContextSelector selector(cc);
68
69
70
71
72
    numCopies = integrator.getNumCopies();
    numParticles = system.getNumParticles();
    workgroupSize = numCopies;
    if (numCopies != findFFTDimension(numCopies))
        throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5.");
73
74
75
76
77
78
79
    int paddedParticles = cc.getPaddedNumAtoms();
    bool useDoublePrecision = (cc.getUseDoublePrecision() || cc.getUseMixedPrecision());
    int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4));
    forces.initialize<long long>(cc, numCopies*paddedParticles*3, "rpmdForces");
    positions.initialize(cc, numCopies*paddedParticles, elementSize, "rpmdPositions");
    velocities.initialize(cc, numCopies*paddedParticles, elementSize, "rpmdVelocities");
    cc.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed());
80
81
82
    
    // Fill in the posq and velm arrays with safe values to avoid a risk of nans.
    
83
    if (useDoublePrecision) {
84
        vector<mm_double4> temp(positions.getSize());
85
        for (int i = 0; i < positions.getSize(); i++)
86
            temp[i] = mm_double4(0, 0, 0, 0);
87
88
        positions.upload(temp);
        for (int i = 0; i < velocities.getSize(); i++)
89
            temp[i] = mm_double4(0, 0, 0, 1);
90
        velocities.upload(temp);
91
92
    }
    else {
93
        vector<mm_float4> temp(positions.getSize());
94
        for (int i = 0; i < positions.getSize(); i++)
95
            temp[i] = mm_float4(0, 0, 0, 0);
96
97
        positions.upload(temp);
        for (int i = 0; i < velocities.getSize(); i++)
98
            temp[i] = mm_float4(0, 0, 0, 1);
99
        velocities.upload(temp);
100
    }
101
102
103
104
105
106
    
    // Build a list of contractions.
    
    groupsNotContracted = -1;
    const map<int, int>& contractions = integrator.getContractions();
    int maxContractedCopies = 0;
peastman's avatar
peastman committed
107
108
109
    for (auto& c : contractions) {
        int group = c.first;
        int copies = c.second;
110
111
112
113
        if (group < 0 || group > 31)
            throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31");
        if (copies < 0 || copies > numCopies)
            throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated");
114
115
        if (copies != findFFTDimension(copies))
            throw OpenMMException("RPMDIntegrator: Number of copies for contraction must be a multiple of powers of 2, 3, and 5.");
116
117
118
119
120
121
122
123
        if (copies != numCopies) {
            if (groupsByCopies.find(copies) == groupsByCopies.end()) {
                groupsByCopies[copies] = 1<<group;
                if (copies > maxContractedCopies)
                    maxContractedCopies = copies;
            }
            else
                groupsByCopies[copies] |= 1<<group;
peastman's avatar
peastman committed
124
            groupsNotContracted -= 1<<group;
125
126
        }
    }
127
    groupsNotContracted &= integrator.getIntegrationForceGroups();
128
    if (maxContractedCopies > 0) {
129
130
        contractedForces.initialize<long long>(cc, maxContractedCopies*paddedParticles*3, "rpmdContractedForces");
        contractedPositions.initialize(cc, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions");
131
    }
132
133
134
135

    // Create kernels.
    
    map<string, string> defines;
136
137
138
139
    defines["NUM_ATOMS"] = cc.intToString(cc.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cc.intToString(cc.getPaddedNumAtoms());
    defines["NUM_COPIES"] = cc.intToString(numCopies);
    defines["THREAD_BLOCK_SIZE"] = cc.intToString(workgroupSize);
140
141
142
    defines["HBAR"] = cc.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12), true);
    defines["SCALE"] = cc.doubleToString(1.0/sqrt((double) numCopies), true);
    defines["M_PI"] = cc.doubleToString(M_PI, true);
143
144
145
146
147
    map<string, string> replacements;
    replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
    replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false);
    replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true);
    replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false);
148
149
150
151
152
153
154
    ComputeProgram program = cc.compileProgram(cc.replaceStrings(CommonRpmdKernelSources::rpmd, replacements), defines);
    pileKernel = program->createKernel("applyPileThermostat");
    stepKernel = program->createKernel("integrateStep");
    velocitiesKernel = program->createKernel("advanceVelocities");
    copyToContextKernel = program->createKernel("copyDataToContext");
    copyFromContextKernel = program->createKernel("copyDataFromContext");
    translateKernel = program->createKernel("applyCellTranslations");
155
156
157
    
    // Create kernels for doing contractions.
    
peastman's avatar
peastman committed
158
159
    for (auto& g : groupsByCopies) {
        int copies = g.first;
160
        replacements.clear();
161
        replacements["NUM_CONTRACTED_COPIES"] = cc.intToString(copies);
162
163
        replacements["POS_SCALE"] = cc.doubleToString(1.0/numCopies, true);
        replacements["FORCE_SCALE"] = cc.doubleToString(0x100000000/(double) copies, true);
164
165
166
167
        replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true);
        replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false);
        replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true);
        replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false);
168
169
170
        program = cc.compileProgram(cc.replaceStrings(CommonRpmdKernelSources::rpmdContraction, replacements), defines);
        positionContractionKernels[copies] = program->createKernel("contractPositions");
        forceContractionKernels[copies] = program->createKernel("contractForces");
171
    }
172
173
}

174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/**
 * Register the argument lists of all kernels.  Arguments whose values are only
 * known when a kernel is launched are added as empty placeholders here and are
 * filled in with setArg() by execute(), computeForces(), and copyToContext().
 *
 * @param context   the context the integrator belongs to (not used here)
 */
void CommonIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) {
    hasInitializedKernels = true;

    // Thermostat kernel.  Placeholders 2-5 are set in execute(): the value returned
    // by prepareRandomNumbers(), the step size, temperature*BOLTZ, and the friction.

    pileKernel->addArg(velocities);
    pileKernel->addArg(cc.getIntegrationUtilities().getRandom());
    for (int placeholder = 2; placeholder <= 5; placeholder++)
        pileKernel->addArg();

    // Position/velocity update.  Placeholders 3-4 are set in execute(): the step
    // size and temperature*BOLTZ.

    stepKernel->addArg(positions);
    stepKernel->addArg(velocities);
    stepKernel->addArg(forces);
    for (int placeholder = 3; placeholder <= 4; placeholder++)
        stepKernel->addArg();

    // Velocity update.  Placeholder 2 is the step size, set in execute().

    velocitiesKernel->addArg(velocities);
    velocitiesKernel->addArg(forces);
    velocitiesKernel->addArg();

    // Periodic cell translation.  Placeholder 3 is a copy index, set in execute().

    translateKernel->addArg(positions);
    translateKernel->addArg(cc.getPosq());
    translateKernel->addArg(cc.getAtomIndexArray());
    translateKernel->addArg();

    // Copying one copy into the context.  Placeholder 2 is the array holding the
    // positions to copy and placeholder 5 is the index of the copy.

    copyToContextKernel->addArg(velocities);
    copyToContextKernel->addArg(cc.getVelm());
    copyToContextKernel->addArg();
    copyToContextKernel->addArg(cc.getPosq());
    copyToContextKernel->addArg(cc.getAtomIndexArray());
    copyToContextKernel->addArg();

    // Copying results back out of the context.  Placeholder 1 is the array that
    // receives the forces, placeholder 5 is the matching positions array, and
    // placeholder 7 is the index of the copy.

    copyFromContextKernel->addArg(cc.getLongForceBuffer());
    copyFromContextKernel->addArg();
    copyFromContextKernel->addArg(cc.getVelm());
    copyFromContextKernel->addArg(velocities);
    copyFromContextKernel->addArg(cc.getPosq());
    copyFromContextKernel->addArg();
    copyFromContextKernel->addArg(cc.getAtomIndexArray());
    copyFromContextKernel->addArg();

    // The contraction kernels take only fixed arguments.

    for (auto& entry : groupsByCopies) {
        const int copies = entry.first;
        auto& contractPositions = positionContractionKernels[copies];
        contractPositions->addArg(positions);
        contractPositions->addArg(contractedPositions);
        auto& contractForces = forceContractionKernels[copies];
        contractForces->addArg(forces);
        contractForces->addArg(contractedForces);
    }
}

/**
 * Advance all copies of the system by one time step.
 *
 * @param context          the context being integrated
 * @param integrator       supplies the step size, temperature, friction, and thermostat flag
 * @param forcesAreValid   if false, forces are recomputed before the step begins
 */
void CommonIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) {
    ContextSelector selector(cc);
    if (!hasInitializedKernels)
        initializeKernels(context);
    IntegrationUtilities& integration = cc.getIntegrationUtilities();

    // Make sure the forces on every copy are up to date.

    if (!forcesAreValid)
        computeForces(context);

    // Fill in the kernel arguments that depend on the integrator settings, using
    // the floating point type that matches the precision mode.

    const bool useDoublePrecision = (cc.getUseDoublePrecision() || cc.getUseMixedPrecision());
    const double dt = integrator.getStepSize();
    const int numThreads = numParticles*numCopies;
    pileKernel->setArg(2, integration.prepareRandomNumbers(numThreads));
    if (!useDoublePrecision) {
        pileKernel->setArg(3, (float) dt);
        pileKernel->setArg(4, (float) (integrator.getTemperature()*BOLTZ));
        pileKernel->setArg(5, (float) integrator.getFriction());
        stepKernel->setArg(3, (float) dt);
        stepKernel->setArg(4, (float) (integrator.getTemperature()*BOLTZ));
        velocitiesKernel->setArg(2, (float) dt);
    }
    else {
        pileKernel->setArg(3, dt);
        pileKernel->setArg(4, integrator.getTemperature()*BOLTZ);
        pileKernel->setArg(5, integrator.getFriction());
        stepKernel->setArg(3, dt);
        stepKernel->setArg(4, integrator.getTemperature()*BOLTZ);
        velocitiesKernel->setArg(2, dt);
    }

    // Apply the PILE-L thermostat.

    if (integrator.getApplyThermostat())
        pileKernel->execute(numThreads, workgroupSize);

    // Update positions and velocities.

    stepKernel->execute(numThreads, workgroupSize);

    // Calculate forces based on the updated positions.

    computeForces(context);

    // Update velocities.

    velocitiesKernel->execute(numThreads, workgroupSize);

    // Apply the PILE-L thermostat again, with a fresh set of random numbers.

    if (integrator.getApplyThermostat()) {
        pileKernel->setArg(2, integration.prepareRandomNumbers(numThreads));
        pileKernel->execute(numThreads, workgroupSize);
    }

    // Update the time and step count.

    cc.setTime(cc.getTime()+dt);
    cc.setStepCount(cc.getStepCount()+1);
    cc.reorderAtoms();
    if (cc.getAtomsWereReordered() && cc.getNonbondedUtilities().getUsePeriodic()) {
        // Atoms may have been translated into a different periodic box, so apply
        // the same translation to all the beads.

        translateKernel->setArg(3, numCopies-1);
        translateKernel->execute(cc.getNumAtoms());
    }
}

285
/**
 * Compute the forces on every copy of the system and store them in the forces array.
 * Force groups without a contraction are evaluated once per copy; contracted groups
 * are evaluated on the reduced set of copies and then spread back to all copies.
 *
 * @param context   the context used to evaluate forces
 * @throws OpenMMException if updating the context state changes the periodic box vectors
 */
void CommonIntegrateRPMDStepKernel::computeForces(ContextImpl& context) {
    // Compute forces from all groups that didn't have a specified contraction.

    copyToContextKernel->setArg(2, positions);
    copyFromContextKernel->setArg(1, forces);
    copyFromContextKernel->setArg(5, positions);
    for (int copy = 0; copy < numCopies; copy++) {
        copyToContextKernel->setArg(5, copy);
        copyToContextKernel->execute(cc.getNumAtoms());
        context.computeVirtualSites();

        // Updating the context state must leave the periodic box unchanged.

        Vec3 boxBefore[3], boxAfter[3];
        context.getPeriodicBoxVectors(boxBefore[0], boxBefore[1], boxBefore[2]);
        context.updateContextState();
        context.getPeriodicBoxVectors(boxAfter[0], boxAfter[1], boxAfter[2]);
        bool boxChanged = false;
        for (int axis = 0; axis < 3; axis++)
            if (boxBefore[axis] != boxAfter[axis])
                boxChanged = true;
        if (boxChanged)
            throw OpenMMException("Standard barostats cannot be used with RPMDIntegrator.  Use RPMDMonteCarloBarostat instead.");
        context.calcForcesAndEnergy(true, false, groupsNotContracted);
        copyFromContextKernel->setArg(7, copy);
        copyFromContextKernel->execute(cc.getNumAtoms());
    }

    // Without contractions there is nothing more to do.

    if (groupsByCopies.empty())
        return;

    // Now loop over contractions and compute forces from them.

    copyToContextKernel->setArg(2, contractedPositions);
    copyFromContextKernel->setArg(1, contractedForces);
    copyFromContextKernel->setArg(5, contractedPositions);
    for (auto& contraction : groupsByCopies) {
        const int copies = contraction.first;
        const int groupFlags = contraction.second;

        // Find the contracted positions.

        positionContractionKernels[copies]->execute(numParticles*numCopies, workgroupSize);

        // Compute forces on each contracted copy.

        for (int copy = 0; copy < copies; copy++) {
            copyToContextKernel->setArg(5, copy);
            copyToContextKernel->execute(cc.getNumAtoms());
            context.computeVirtualSites();
            context.calcForcesAndEnergy(true, false, groupFlags);
            copyFromContextKernel->setArg(7, copy);
            copyFromContextKernel->execute(cc.getNumAtoms());
        }

        // Apply the forces to the original copies.

        forceContractionKernels[copies]->execute(numParticles*numCopies, workgroupSize);
    }

    // Ensure the Context contains the positions from the last copy, since we'll assume that later.

    copyToContextKernel->setArg(2, positions);
    copyToContextKernel->setArg(5, numCopies-1);
    copyToContextKernel->execute(cc.getNumAtoms());
}

346
347
/**
 * Compute the kinetic energy by delegating to the context's IntegrationUtilities,
 * passing 0 as its argument.
 *
 * @param context      the context being integrated (not used here)
 * @param integrator   the integrator this kernel serves (not used here)
 * @return the value reported by IntegrationUtilities::computeKineticEnergy(0)
 */
double CommonIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) {
    IntegrationUtilities& integration = cc.getIntegrationUtilities();
    return integration.computeKineticEnergy(0);
}

350
void CommonIntegrateRPMDStepKernel::setPositions(int copy, const vector<Vec3>& pos) {
351
    if (!positions.isInitialized())
352
353
354
        throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context");
    if (pos.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()");
355
356
357

    // Adjust the positions based on the current cell offsets.
    
358
359
360
    const vector<int>& order = cc.getAtomIndex();
    Vec3 a, b, c;
    cc.getPeriodicBoxVectors(a, b, c);
361
362
    vector<Vec3> offsetPos(numParticles);
    for (int i = 0; i < numParticles; ++i) {
363
364
        mm_int4 offset = cc.getPosCellOffsets()[i];
        offsetPos[order[i]] = pos[order[i]] + Vec3(offset.x*a[0], offset.y*b[1], offset.z*c[2]);
365
366
367
368
    }

    // Record the positions.

369
    ContextSelector selector(cc);
370
371
372
    if (cc.getUseDoublePrecision()) {
        vector<mm_double4> posq(cc.getPaddedNumAtoms());
        cc.getPosq().download(posq);
373
        for (int i = 0; i < numParticles; i++)
374
375
            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posq[i].w);
        positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
376
    }
377
378
379
380
    else if (cc.getUseMixedPrecision()) {
        vector<mm_float4> posqf(cc.getPaddedNumAtoms());
        cc.getPosq().download(posqf);
        vector<mm_double4> posq(cc.getPaddedNumAtoms());
381
        for (int i = 0; i < numParticles; i++)
382
383
            posq[i] = mm_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posqf[i].w);
        positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
384
385
    }
    else {
386
387
        vector<mm_float4> posq(cc.getPaddedNumAtoms());
        cc.getPosq().download(posq);
388
        for (int i = 0; i < numParticles; i++)
389
390
            posq[i] = mm_float4((float) offsetPos[i][0], (float) offsetPos[i][1], (float) offsetPos[i][2], posq[i].w);
        positions.uploadSubArray(&posq[0], copy*cc.getPaddedNumAtoms(), numParticles);
391
392
393
    }
}

394
void CommonIntegrateRPMDStepKernel::setVelocities(int copy, const vector<Vec3>& vel) {
395
    if (!velocities.isInitialized())
396
397
398
        throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context");
    if (vel.size() != numParticles)
        throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()");
399
    ContextSelector selector(cc);
400
401
402
    if (cc.getUseDoublePrecision() || cc.getUseMixedPrecision()) {
        vector<mm_double4> velm(cc.getPaddedNumAtoms());
        cc.getVelm().download(velm);
403
        for (int i = 0; i < numParticles; i++)
404
405
            velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w);
        velocities.uploadSubArray(&velm[0], copy*cc.getPaddedNumAtoms(), numParticles);
406
407
    }
    else {
408
409
        vector<mm_float4> velm(cc.getPaddedNumAtoms());
        cc.getVelm().download(velm);
410
        for (int i = 0; i < numParticles; i++)
411
412
            velm[i] = mm_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w);
        velocities.uploadSubArray(&velm[0], copy*cc.getPaddedNumAtoms(), numParticles);
413
414
415
    }
}

416
void CommonIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) {
417
    ContextSelector selector(cc);
418
419
420
421
422
    if (!hasInitializedKernels)
        initializeKernels(context);
    copyToContextKernel->setArg(2, positions);
    copyToContextKernel->setArg(5, copy);
    copyToContextKernel->execute(cc.getNumAtoms());
423
424
}

425
/**
 * Generate the source code for an FFT of a given size, to be substituted into a kernel.
 * The size is factored into passes of radix 5, 4, 3, or 2 (tried in that order).  Each
 * pass reads from one of two buffer pairs (real0/imag0 or real1/imag1) and writes to the
 * other.  If the number of passes is odd, the result is copied back into real0/imag0.
 *
 * @param size       the number of points in the transform
 * @param variable   prefix of the arrays to transform; the generated code accesses
 *                   <variable>real and <variable>imag
 * @param forward    true to generate the forward transform, false for the backward one
 * @return the generated source code
 * @throws OpenMMException if size has a factor that none of the supported radices divides
 */
string CommonIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) {
    const string sign = (forward ? "1.0f" : "-1.0f");
    const string multReal = (forward ? "multiplyComplexRealPart" : "multiplyComplexRealPartConj");
    const string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj");
    static const int supportedRadices[] = {5, 4, 3, 2};

    stringstream source;
    source<<"{\n";
    source<<"LOCAL_ARG mixed3* real0 = "<<variable<<"real;\n";
    source<<"LOCAL_ARG mixed3* imag0 = "<<variable<<"imag;\n";
    source<<"LOCAL_ARG mixed3* real1 = &temp[blockStart];\n";
    source<<"LOCAL_ARG mixed3* imag1 = &temp[blockStart+LOCAL_SIZE];\n";

    // Factor size, generating an appropriate block of code for each factor.
    // "remaining" is the part of size not yet factored and m is the product of
    // the radices already processed.

    int stage = 0;
    int m = 1;
    for (int remaining = size; remaining > 1; ++stage) {
        // Pick the first supported radix that divides what is left.

        int radix = 0;
        for (int candidate : supportedRadices) {
            if (remaining%candidate == 0) {
                radix = candidate;
                break;
            }
        }
        if (radix == 0)
            throw OpenMMException("Illegal size for FFT: "+cc.intToString(size));

        // Successive passes alternate between the two buffer pairs.

        const int input = stage%2;
        const int output = 1-input;
        remaining /= radix;
        const int stride = remaining*m;
        source<<"{\n";
        source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
        source<<"if (indexInBlock < "<<stride<<") {\n";
        source<<"int i = indexInBlock;\n";
        source<<"int j = i/"<<m<<";\n";

        // Load the inputs of this butterfly into c0 ... c(radix-1).

        for (int k = 0; k < radix; k++) {
            stringstream element;
            element<<"[i";
            if (k > 0)
                element<<"+"<<(k*stride);
            element<<"];\n";
            source<<"mixed3 c"<<k<<"r = real"<<input<<element.str();
            source<<"mixed3 c"<<k<<"i = imag"<<input<<element.str();
        }

        // Writes output k of the butterfly.  Output 0 is stored directly; every other
        // output is first multiplied by the twiddle factor w[j*k*size/(radix*remaining)].

        auto emitOutput = [&](int k, const string& realPart, const string& imagPart) {
            stringstream element;
            element<<"[i+";
            if (k > 0)
                element<<"(";
            if (radix > 2)
                element<<(radix-1)<<"*";
            element<<"j";
            if (k > 0)
                element<<"+"<<k<<")";
            element<<"*"<<m<<"] = ";
            if (k == 0) {
                source<<"real"<<output<<element.str()<<realPart<<";\n";
                source<<"imag"<<output<<element.str()<<imagPart<<";\n";
            }
            else {
                stringstream arguments;
                arguments<<"(w[j*"<<(k*size)<<"/"<<(radix*remaining)<<"], "<<realPart<<", "<<imagPart<<");\n";
                source<<"real"<<output<<element.str()<<multReal<<arguments.str();
                source<<"imag"<<output<<element.str()<<multImag<<arguments.str();
            }
        };

        switch (radix) {
            case 5: {
                const string sin72 = cc.doubleToString(sin(0.4*M_PI), true);
                const string quarterRoot5 = cc.doubleToString(0.25*sqrt(5.0), true);
                const string coeff = cc.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI), true);
                source<<"mixed3 d0r = c1r+c4r;\n"
                        "mixed3 d0i = c1i+c4i;\n"
                        "mixed3 d1r = c2r+c3r;\n"
                        "mixed3 d1i = c2i+c3i;\n";
                source<<"mixed3 d2r = "<<sin72<<"*(c1r-c4r);\n";
                source<<"mixed3 d2i = "<<sin72<<"*(c1i-c4i);\n";
                source<<"mixed3 d3r = "<<sin72<<"*(c2r-c3r);\n";
                source<<"mixed3 d3i = "<<sin72<<"*(c2i-c3i);\n";
                source<<"mixed3 d4r = d0r+d1r;\n"
                        "mixed3 d4i = d0i+d1i;\n";
                source<<"mixed3 d5r = "<<quarterRoot5<<"*(d0r-d1r);\n";
                source<<"mixed3 d5i = "<<quarterRoot5<<"*(d0i-d1i);\n";
                source<<"mixed3 d6r = c0r-0.25f*d4r;\n"
                        "mixed3 d6i = c0i-0.25f*d4i;\n"
                        "mixed3 d7r = d6r+d5r;\n"
                        "mixed3 d7i = d6i+d5i;\n"
                        "mixed3 d8r = d6r-d5r;\n"
                        "mixed3 d8i = d6i-d5i;\n";
                source<<"mixed3 d9r = "<<sign<<"*(d2i+"<<coeff<<"*d3i);\n";
                source<<"mixed3 d9i = "<<sign<<"*(-d2r-"<<coeff<<"*d3r);\n";
                source<<"mixed3 d10r = "<<sign<<"*("<<coeff<<"*d2i-d3i);\n";
                source<<"mixed3 d10i = "<<sign<<"*(d3r-"<<coeff<<"*d2r);\n";
                emitOutput(0, "c0r+d4r", "c0i+d4i");
                emitOutput(1, "d7r+d9r", "d7i+d9i");
                emitOutput(2, "d8r+d10r", "d8i+d10i");
                emitOutput(3, "d8r-d10r", "d8i-d10i");
                emitOutput(4, "d7r-d9r", "d7i-d9i");
                break;
            }
            case 4: {
                source<<"mixed3 d0r = c0r+c2r;\n"
                        "mixed3 d0i = c0i+c2i;\n"
                        "mixed3 d1r = c0r-c2r;\n"
                        "mixed3 d1i = c0i-c2i;\n"
                        "mixed3 d2r = c1r+c3r;\n"
                        "mixed3 d2i = c1i+c3i;\n";
                source<<"mixed3 d3r = "<<sign<<"*(c1i-c3i);\n";
                source<<"mixed3 d3i = "<<sign<<"*(c3r-c1r);\n";
                emitOutput(0, "d0r+d2r", "d0i+d2i");
                emitOutput(1, "d1r+d3r", "d1i+d3i");
                emitOutput(2, "d0r-d2r", "d0i-d2i");
                emitOutput(3, "d1r-d3r", "d1i-d3i");
                break;
            }
            case 3: {
                const string sin60 = cc.doubleToString(sin(M_PI/3.0), true);
                source<<"mixed3 d0r = c1r+c2r;\n"
                        "mixed3 d0i = c1i+c2i;\n"
                        "mixed3 d1r = c0r-0.5f*d0r;\n"
                        "mixed3 d1i = c0i-0.5f*d0i;\n";
                source<<"mixed3 d2r = "<<sign<<"*"<<sin60<<"*(c1i-c2i);\n";
                source<<"mixed3 d2i = "<<sign<<"*"<<sin60<<"*(c2r-c1r);\n";
                emitOutput(0, "c0r+d0r", "c0i+d0i");
                emitOutput(1, "d1r+d2r", "d1i+d2i");
                emitOutput(2, "d1r-d2r", "d1i-d2i");
                break;
            }
            default: {
                // Radix 2 needs no intermediate values.

                emitOutput(0, "c0r+c1r", "c0i+c1i");
                emitOutput(1, "c0r-c1r", "c0i-c1i");
                break;
            }
        }

        // Close the "if", synchronize before the next pass reads this pass's
        // output, and close the block for this pass.

        source<<"}\n";
        source<<"SYNC_THREADS;\n";
        source<<"}\n";
        m *= radix;
    }

    // After an odd number of passes the result is in real1/imag1, so copy it back
    // to the caller's arrays.

    if (stage%2 == 1) {
        source<<"real0[indexInBlock] = real1[indexInBlock];\n";
        source<<"imag0[indexInBlock] = imag1[indexInBlock];\n";
        source<<"SYNC_THREADS;\n";
    }
    source<<"}\n";
    return source.str();
}