AmoebaCudaKernels.cpp 138 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                               OpenMMAmoeba                                 *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2008-2018 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
 * Authors: Peter Eastman, Mark Friedrichs                                    *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

27
28
29
#ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
#endif
30
31
32
#include "AmoebaCudaKernels.h"
#include "CudaAmoebaKernelSources.h"
#include "openmm/internal/ContextImpl.h"
33
#include "openmm/internal/AmoebaGeneralizedKirkwoodForceImpl.h"
34
35
36
#include "openmm/internal/AmoebaMultipoleForceImpl.h"
#include "openmm/internal/AmoebaWcaDispersionForceImpl.h"
#include "openmm/internal/AmoebaTorsionTorsionForceImpl.h"
37
#include "openmm/internal/AmoebaVdwForceImpl.h"
38
39
#include "openmm/internal/NonbondedForceImpl.h"
#include "CudaBondedUtilities.h"
40
#include "CudaFFT3D.h"
41
42
#include "CudaForceInfo.h"
#include "CudaKernelSources.h"
43
#include "CudaNonbondedUtilities.h"
Peter Eastman's avatar
Peter Eastman committed
44
#include "jama_lu.h"
45

46
#include <algorithm>
47
48
49
50
51
52
53
54
#include <cmath>
#ifdef _MSC_VER
#include <windows.h>
#endif

using namespace OpenMM;
using namespace std;

Peter Eastman's avatar
Peter Eastman committed
55
#define CHECK_RESULT(result, prefix) \
56
57
    if (result != CUDA_SUCCESS) { \
        std::stringstream m; \
Peter Eastman's avatar
Peter Eastman committed
58
        m<<prefix<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
59
60
61
        throw OpenMMException(m.str());\
    }

62
/* -------------------------------------------------------------------------- *
63
 *                            AmoebaBondForce                                 *
64
65
 * -------------------------------------------------------------------------- */

66
class CudaCalcAmoebaBondForceKernel::ForceInfo : public CudaForceInfo {
67
public:
68
    ForceInfo(const AmoebaBondForce& force) : force(force) {
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
    }
    int getNumParticleGroups() {
        return force.getNumBonds();
    }
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int particle1, particle2;
        double length, k;
        force.getBondParameters(index, particle1, particle2, length, k);
        particles.resize(2);
        particles[0] = particle1;
        particles[1] = particle2;
    }
    bool areGroupsIdentical(int group1, int group2) {
        int particle1, particle2;
        double length1, length2, k1, k2;
        force.getBondParameters(group1, particle1, particle2, length1, k1);
        force.getBondParameters(group2, particle1, particle2, length2, k2);
        return (length1 == length2 && k1 == k2);
    }
private:
89
    const AmoebaBondForce& force;
90
91
};

92
CudaCalcAmoebaBondForceKernel::CudaCalcAmoebaBondForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
93
                CalcAmoebaBondForceKernel(name, platform), cu(cu), system(system) {
94
95
}

96
void CudaCalcAmoebaBondForceKernel::initialize(const System& system, const AmoebaBondForce& force) {
97
98
99
100
101
102
103
104
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
    numBonds = endIndex-startIndex;
    if (numBonds == 0)
        return;
    vector<vector<int> > atoms(numBonds, vector<int>(2));
105
    params.initialize<float2>(cu, numBonds, "bondParams");
106
107
108
109
110
111
    vector<float2> paramVector(numBonds);
    for (int i = 0; i < numBonds; i++) {
        double length, k;
        force.getBondParameters(startIndex+i, atoms[i][0], atoms[i][1], length, k);
        paramVector[i] = make_float2((float) length, (float) k);
    }
112
    params.upload(paramVector);
113
    map<string, string> replacements;
114
    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
115
    replacements["COMPUTE_FORCE"] = CudaAmoebaKernelSources::amoebaBondForce;
116
    replacements["PARAMS"] = cu.getBondedUtilities().addArgument(params.getDevicePointer(), "float2");
117
118
    replacements["CUBIC_K"] = cu.doubleToString(force.getAmoebaGlobalBondCubic());
    replacements["QUARTIC_K"] = cu.doubleToString(force.getAmoebaGlobalBondQuartic());
119
120
121
122
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::bondForce, replacements), force.getForceGroup());
    cu.addForce(new ForceInfo(force));
}

123
double CudaCalcAmoebaBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
124
125
126
    return 0.0;
}

127
128
129
130
131
132
133
void CudaCalcAmoebaBondForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaBondForce& force) {
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumBonds()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumBonds()/numContexts;
    if (numBonds != endIndex-startIndex)
        throw OpenMMException("updateParametersInContext: The number of bonds has changed");
134
135
    if (numBonds == 0)
        return;
136
137
138
139
140
141
142
143
144
145
    
    // Record the per-bond parameters.
    
    vector<float2> paramVector(numBonds);
    for (int i = 0; i < numBonds; i++) {
        int atom1, atom2;
        double length, k;
        force.getBondParameters(startIndex+i, atom1, atom2, length, k);
        paramVector[i] = make_float2((float) length, (float) k);
    }
146
    params.upload(paramVector);
147
148
149
150
151
152
    
    // Mark that the current reordering may be invalid.
    
    cu.invalidateMolecules();
}

153
/* -------------------------------------------------------------------------- *
154
 *                            AmoebaAngleForce                                *
155
156
 * -------------------------------------------------------------------------- */

157
class CudaCalcAmoebaAngleForceKernel::ForceInfo : public CudaForceInfo {
158
public:
159
    ForceInfo(const AmoebaAngleForce& force) : force(force) {
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
    }
    int getNumParticleGroups() {
        return force.getNumAngles();
    }
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int particle1, particle2, particle3;
        double angle, k;
        force.getAngleParameters(index, particle1, particle2, particle3, angle, k);
        particles.resize(3);
        particles[0] = particle1;
        particles[1] = particle2;
        particles[2] = particle3;
    }
    bool areGroupsIdentical(int group1, int group2) {
        int particle1, particle2, particle3;
        double angle1, angle2, k1, k2;
        force.getAngleParameters(group1, particle1, particle2, particle3, angle1, k1);
        force.getAngleParameters(group2, particle1, particle2, particle3, angle2, k2);
        return (angle1 == angle2 && k1 == k2);
    }
private:
181
    const AmoebaAngleForce& force;
182
183
};

184
CudaCalcAmoebaAngleForceKernel::CudaCalcAmoebaAngleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
185
            CalcAmoebaAngleForceKernel(name, platform), cu(cu), system(system) {
186
187
}

188
void CudaCalcAmoebaAngleForceKernel::initialize(const System& system, const AmoebaAngleForce& force) {
189
190
191
192
193
194
195
196
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
    numAngles = endIndex-startIndex;
    if (numAngles == 0)
        return;
    vector<vector<int> > atoms(numAngles, vector<int>(3));
197
    params.initialize<float2>(cu, numAngles, "angleParams");
198
199
200
201
202
203
    vector<float2> paramVector(numAngles);
    for (int i = 0; i < numAngles; i++) {
        double angle, k;
        force.getAngleParameters(startIndex+i, atoms[i][0], atoms[i][1], atoms[i][2], angle, k);
        paramVector[i] = make_float2((float) angle, (float) k);
    }
204
    params.upload(paramVector);
205
    map<string, string> replacements;
206
    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
207
    replacements["COMPUTE_FORCE"] = CudaAmoebaKernelSources::amoebaAngleForce;
208
    replacements["PARAMS"] = cu.getBondedUtilities().addArgument(params.getDevicePointer(), "float2");
209
210
211
212
    replacements["CUBIC_K"] = cu.doubleToString(force.getAmoebaGlobalAngleCubic());
    replacements["QUARTIC_K"] = cu.doubleToString(force.getAmoebaGlobalAngleQuartic());
    replacements["PENTIC_K"] = cu.doubleToString(force.getAmoebaGlobalAnglePentic());
    replacements["SEXTIC_K"] = cu.doubleToString(force.getAmoebaGlobalAngleSextic());
213
214
215
216
217
    replacements["RAD_TO_DEG"] = cu.doubleToString(180/M_PI);
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaKernelSources::angleForce, replacements), force.getForceGroup());
    cu.addForce(new ForceInfo(force));
}

218
double CudaCalcAmoebaAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
219
220
221
    return 0.0;
}

222
223
224
225
226
227
228
void CudaCalcAmoebaAngleForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaAngleForce& force) {
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
    if (numAngles != endIndex-startIndex)
        throw OpenMMException("updateParametersInContext: The number of angles has changed");
229
230
    if (numAngles == 0)
        return;
231
232
233
234
235
236
237
238
239
240
    
    // Record the per-angle parameters.
    
    vector<float2> paramVector(numAngles);
    for (int i = 0; i < numAngles; i++) {
        int atom1, atom2, atom3;
        double angle, k;
        force.getAngleParameters(startIndex+i, atom1, atom2, atom3, angle, k);
        paramVector[i] = make_float2((float) angle, (float) k);
    }
241
    params.upload(paramVector);
242
243
244
245
246
247
    
    // Mark that the current reordering may be invalid.
    
    cu.invalidateMolecules();
}

248
/* -------------------------------------------------------------------------- *
249
 *                            AmoebaInPlaneAngleForce                         *
250
251
 * -------------------------------------------------------------------------- */

252
class CudaCalcAmoebaInPlaneAngleForceKernel::ForceInfo : public CudaForceInfo {
253
public:
254
    ForceInfo(const AmoebaInPlaneAngleForce& force) : force(force) {
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
    }
    int getNumParticleGroups() {
        return force.getNumAngles();
    }
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int particle1, particle2, particle3, particle4;
        double angle, k;
        force.getAngleParameters(index, particle1, particle2, particle3, particle4, angle, k);
        particles.resize(4);
        particles[0] = particle1;
        particles[1] = particle2;
        particles[2] = particle3;
        particles[3] = particle4;
    }
    bool areGroupsIdentical(int group1, int group2) {
        int particle1, particle2, particle3, particle4;
        double angle1, angle2, k1, k2;
        force.getAngleParameters(group1, particle1, particle2, particle3, particle4, angle1, k1);
        force.getAngleParameters(group2, particle1, particle2, particle3, particle4, angle2, k2);
        return (angle1 == angle2 && k1 == k2);
    }
private:
277
    const AmoebaInPlaneAngleForce& force;
278
279
};

280
CudaCalcAmoebaInPlaneAngleForceKernel::CudaCalcAmoebaInPlaneAngleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
281
          CalcAmoebaInPlaneAngleForceKernel(name, platform), cu(cu), system(system) {
282
283
}

284
void CudaCalcAmoebaInPlaneAngleForceKernel::initialize(const System& system, const AmoebaInPlaneAngleForce& force) {
285
286
287
288
289
290
291
292
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
    numAngles = endIndex-startIndex;
    if (numAngles == 0)
        return;
    vector<vector<int> > atoms(numAngles, vector<int>(4));
293
    params.initialize<float2>(cu, numAngles, "angleParams");
294
295
296
297
298
299
    vector<float2> paramVector(numAngles);
    for (int i = 0; i < numAngles; i++) {
        double angle, k;
        force.getAngleParameters(startIndex+i, atoms[i][0], atoms[i][1], atoms[i][2], atoms[i][3], angle, k);
        paramVector[i] = make_float2((float) angle, (float) k);
    }
300
    params.upload(paramVector);
301
    map<string, string> replacements;
302
    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
303
    replacements["PARAMS"] = cu.getBondedUtilities().addArgument(params.getDevicePointer(), "float2");
304
305
306
307
    replacements["CUBIC_K"] = cu.doubleToString(force.getAmoebaGlobalInPlaneAngleCubic());
    replacements["QUARTIC_K"] = cu.doubleToString(force.getAmoebaGlobalInPlaneAngleQuartic());
    replacements["PENTIC_K"] = cu.doubleToString(force.getAmoebaGlobalInPlaneAnglePentic());
    replacements["SEXTIC_K"] = cu.doubleToString(force.getAmoebaGlobalInPlaneAngleSextic());
308
309
310
311
312
    replacements["RAD_TO_DEG"] = cu.doubleToString(180/M_PI);
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaAmoebaKernelSources::amoebaInPlaneForce, replacements), force.getForceGroup());
    cu.addForce(new ForceInfo(force));
}

313
double CudaCalcAmoebaInPlaneAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
314
315
316
    return 0.0;
}

317
318
319
320
321
322
323
void CudaCalcAmoebaInPlaneAngleForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaInPlaneAngleForce& force) {
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumAngles()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumAngles()/numContexts;
    if (numAngles != endIndex-startIndex)
        throw OpenMMException("updateParametersInContext: The number of in-plane angles has changed");
324
325
    if (numAngles == 0)
        return;
326
327
328
329
330
331
332
333
334
335
    
    // Record the per-angle parameters.
    
    vector<float2> paramVector(numAngles);
    for (int i = 0; i < numAngles; i++) {
        int atom1, atom2, atom3, atom4;
        double angle, k;
        force.getAngleParameters(startIndex+i, atom1, atom2, atom3, atom4, angle, k);
        paramVector[i] = make_float2((float) angle, (float) k);
    }
336
    params.upload(paramVector);
337
338
339
340
341
342
    
    // Mark that the current reordering may be invalid.
    
    cu.invalidateMolecules();
}

343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
/* -------------------------------------------------------------------------- *
  *                              AmoebaPiTorsion                              *
 * -------------------------------------------------------------------------- */

class CudaCalcAmoebaPiTorsionForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const AmoebaPiTorsionForce& force) : force(force) {
    }
    int getNumParticleGroups() {
        return force.getNumPiTorsions();
    }
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int particle1, particle2, particle3, particle4, particle5, particle6;
        double k;
        force.getPiTorsionParameters(index, particle1, particle2, particle3, particle4, particle5, particle6, k);
        particles.resize(6);
        particles[0] = particle1;
        particles[1] = particle2;
        particles[2] = particle3;
        particles[3] = particle4;
        particles[4] = particle5;
        particles[5] = particle6;
    }
    bool areGroupsIdentical(int group1, int group2) {
        int particle1, particle2, particle3, particle4, particle5, particle6;
        double k1, k2;
        force.getPiTorsionParameters(group1, particle1, particle2, particle3, particle4, particle5, particle6, k1);
        force.getPiTorsionParameters(group2, particle1, particle2, particle3, particle4, particle5, particle6, k2);
        return (k1 == k2);
    }
private:
    const AmoebaPiTorsionForce& force;
};

377
CudaCalcAmoebaPiTorsionForceKernel::CudaCalcAmoebaPiTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
378
         CalcAmoebaPiTorsionForceKernel(name, platform), cu(cu), system(system) {
379
380
381
382
383
384
385
386
387
388
389
}

void CudaCalcAmoebaPiTorsionForceKernel::initialize(const System& system, const AmoebaPiTorsionForce& force) {
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumPiTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumPiTorsions()/numContexts;
    numPiTorsions = endIndex-startIndex;
    if (numPiTorsions == 0)
        return;
    vector<vector<int> > atoms(numPiTorsions, vector<int>(6));
390
    params.initialize<float>(cu, numPiTorsions, "piTorsionParams");
391
392
393
394
395
396
    vector<float> paramVector(numPiTorsions);
    for (int i = 0; i < numPiTorsions; i++) {
        double k;
        force.getPiTorsionParameters(startIndex+i, atoms[i][0], atoms[i][1], atoms[i][2], atoms[i][3], atoms[i][4], atoms[i][5], k);
        paramVector[i] = (float) k;
    }
397
    params.upload(paramVector);
398
    map<string, string> replacements;
399
    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
400
    replacements["PARAMS"] = cu.getBondedUtilities().addArgument(params.getDevicePointer(), "float");
401
402
403
404
405
406
407
408
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaAmoebaKernelSources::amoebaPiTorsionForce, replacements), force.getForceGroup());
    cu.addForce(new ForceInfo(force));
}

double CudaCalcAmoebaPiTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    return 0.0;
}

409
410
411
412
413
414
415
void CudaCalcAmoebaPiTorsionForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaPiTorsionForce& force) {
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumPiTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumPiTorsions()/numContexts;
    if (numPiTorsions != endIndex-startIndex)
        throw OpenMMException("updateParametersInContext: The number of torsions has changed");
416
417
    if (numPiTorsions == 0)
        return;
418
419
420
421
422
423
424
425
426
427
    
    // Record the per-torsion parameters.
    
    vector<float> paramVector(numPiTorsions);
    for (int i = 0; i < numPiTorsions; i++) {
        int atom1, atom2, atom3, atom4, atom5, atom6;
        double k;
        force.getPiTorsionParameters(startIndex+i, atom1, atom2, atom3, atom4, atom5, atom6, k);
        paramVector[i] = (float) k;
    }
428
    params.upload(paramVector);
429
430
431
432
433
434
    
    // Mark that the current reordering may be invalid.
    
    cu.invalidateMolecules();
}

435
436
437
438
439
440
441
442
443
444
445
446
447
/* -------------------------------------------------------------------------- *
 *                           AmoebaStretchBend                                *
 * -------------------------------------------------------------------------- */

class CudaCalcAmoebaStretchBendForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const AmoebaStretchBendForce& force) : force(force) {
    }
    int getNumParticleGroups() {
        return force.getNumStretchBends();
    }
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int particle1, particle2, particle3;
448
449
        double lengthAB, lengthCB, angle, k1, k2;
        force.getStretchBendParameters(index, particle1, particle2, particle3, lengthAB, lengthCB, angle, k1, k2);
450
451
452
453
454
455
456
        particles.resize(3);
        particles[0] = particle1;
        particles[1] = particle2;
        particles[2] = particle3;
    }
    bool areGroupsIdentical(int group1, int group2) {
        int particle1, particle2, particle3;
457
458
459
460
        double lengthAB1, lengthAB2, lengthCB1, lengthCB2, angle1, angle2, k11, k12, k21, k22;
        force.getStretchBendParameters(group1, particle1, particle2, particle3, lengthAB1, lengthCB1, angle1, k11, k12);
        force.getStretchBendParameters(group2, particle1, particle2, particle3, lengthAB2, lengthCB2, angle2, k21, k22);
        return (lengthAB1 == lengthAB2 && lengthCB1 == lengthCB2 && angle1 == angle2 && k11 == k21 && k12 == k22);
461
462
463
464
465
    }
private:
    const AmoebaStretchBendForce& force;
};

466
CudaCalcAmoebaStretchBendForceKernel::CudaCalcAmoebaStretchBendForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
467
                   CalcAmoebaStretchBendForceKernel(name, platform), cu(cu), system(system) {
468
469
470
471
472
473
474
475
476
477
478
}

void CudaCalcAmoebaStretchBendForceKernel::initialize(const System& system, const AmoebaStretchBendForce& force) {
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumStretchBends()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumStretchBends()/numContexts;
    numStretchBends = endIndex-startIndex;
    if (numStretchBends == 0)
        return;
    vector<vector<int> > atoms(numStretchBends, vector<int>(3));
479
480
    params1.initialize<float3>(cu, numStretchBends, "stretchBendParams");
    params2.initialize<float2>(cu, numStretchBends, "stretchBendForceConstants");
481
482
    vector<float3> paramVector(numStretchBends);
    vector<float2> paramVectorK(numStretchBends);
483
    for (int i = 0; i < numStretchBends; i++) {
484
485
486
487
        double lengthAB, lengthCB, angle, k1, k2;
        force.getStretchBendParameters(startIndex+i, atoms[i][0], atoms[i][1], atoms[i][2], lengthAB, lengthCB, angle, k1, k2);
        paramVector[i] = make_float3((float) lengthAB, (float) lengthCB, (float) angle);
        paramVectorK[i] = make_float2((float) k1, (float) k2);
488
    }
489
490
    params1.upload(paramVector);
    params2.upload(paramVectorK);
491
    map<string, string> replacements;
492
    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
493
494
    replacements["PARAMS"] = cu.getBondedUtilities().addArgument(params1.getDevicePointer(), "float3");
    replacements["FORCE_CONSTANTS"] = cu.getBondedUtilities().addArgument(params2.getDevicePointer(), "float2");
495
496
497
498
499
500
    replacements["RAD_TO_DEG"] = cu.doubleToString(180/M_PI);
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaAmoebaKernelSources::amoebaStretchBendForce, replacements), force.getForceGroup());
    cu.addForce(new ForceInfo(force));
}

double CudaCalcAmoebaStretchBendForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
501
    return 0.0;
502
503
}

504
505
506
507
508
509
510
void CudaCalcAmoebaStretchBendForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaStretchBendForce& force) {
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumStretchBends()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumStretchBends()/numContexts;
    if (numStretchBends != endIndex-startIndex)
        throw OpenMMException("updateParametersInContext: The number of bend-stretch terms has changed");
511
512
    if (numStretchBends == 0)
        return;
513
514
515
    
    // Record the per-stretch-bend parameters.
    
Jason Swails's avatar
Jason Swails committed
516
517
    vector<float3> paramVector(numStretchBends);
    vector<float2> paramVector1(numStretchBends);
518
519
    for (int i = 0; i < numStretchBends; i++) {
        int atom1, atom2, atom3;
Jason Swails's avatar
Jason Swails committed
520
521
522
        double lengthAB, lengthCB, angle, k1, k2;
        force.getStretchBendParameters(startIndex+i, atom1, atom2, atom3, lengthAB, lengthCB, angle, k1, k2);
        paramVector[i] = make_float3((float) lengthAB, (float) lengthCB, (float) angle);
Jason Swails's avatar
Jason Swails committed
523
        paramVector1[i] = make_float2((float) k1, (float) k2);
524
    }
525
526
    params1.upload(paramVector);
    params2.upload(paramVector1);
527
528
529
530
531
532
    
    // Mark that the current reordering may be invalid.
    
    cu.invalidateMolecules();
}

533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
/* -------------------------------------------------------------------------- *
 *                           AmoebaOutOfPlaneBend                             *
 * -------------------------------------------------------------------------- */

class CudaCalcAmoebaOutOfPlaneBendForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const AmoebaOutOfPlaneBendForce& force) : force(force) {
    }
    int getNumParticleGroups() {
        return force.getNumOutOfPlaneBends();
    }
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int particle1, particle2, particle3, particle4;
        double k;
        force.getOutOfPlaneBendParameters(index, particle1, particle2, particle3, particle4, k);
        particles.resize(4);
        particles[0] = particle1;
        particles[1] = particle2;
        particles[2] = particle3;
        particles[3] = particle4;
    }
    bool areGroupsIdentical(int group1, int group2) {
        int particle1, particle2, particle3, particle4;
        double k1, k2;
        force.getOutOfPlaneBendParameters(group1, particle1, particle2, particle3, particle4, k1);
        force.getOutOfPlaneBendParameters(group2, particle1, particle2, particle3, particle4, k2);
        return (k1 == k2);
    }
private:
    const AmoebaOutOfPlaneBendForce& force;
};

565
CudaCalcAmoebaOutOfPlaneBendForceKernel::CudaCalcAmoebaOutOfPlaneBendForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
566
          CalcAmoebaOutOfPlaneBendForceKernel(name, platform), cu(cu), system(system) {
567
568
569
570
571
572
573
574
575
576
577
}

void CudaCalcAmoebaOutOfPlaneBendForceKernel::initialize(const System& system, const AmoebaOutOfPlaneBendForce& force) {
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumOutOfPlaneBends()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumOutOfPlaneBends()/numContexts;
    numOutOfPlaneBends = endIndex-startIndex;
    if (numOutOfPlaneBends == 0)
        return;
    vector<vector<int> > atoms(numOutOfPlaneBends, vector<int>(4));
578
    params.initialize<float>(cu, numOutOfPlaneBends, "outOfPlaneParams");
579
580
581
582
583
584
    vector<float> paramVector(numOutOfPlaneBends);
    for (int i = 0; i < numOutOfPlaneBends; i++) {
        double k;
        force.getOutOfPlaneBendParameters(startIndex+i, atoms[i][0], atoms[i][1], atoms[i][2], atoms[i][3], k);
        paramVector[i] = (float) k;
    }
585
    params.upload(paramVector);
586
    map<string, string> replacements;
587
    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
588
    replacements["PARAMS"] = cu.getBondedUtilities().addArgument(params.getDevicePointer(), "float");
589
590
591
592
593
594
595
596
597
598
599
600
601
    replacements["CUBIC_K"] = cu.doubleToString(force.getAmoebaGlobalOutOfPlaneBendCubic());
    replacements["QUARTIC_K"] = cu.doubleToString(force.getAmoebaGlobalOutOfPlaneBendQuartic());
    replacements["PENTIC_K"] = cu.doubleToString(force.getAmoebaGlobalOutOfPlaneBendPentic());
    replacements["SEXTIC_K"] = cu.doubleToString(force.getAmoebaGlobalOutOfPlaneBendSextic());
    replacements["RAD_TO_DEG"] = cu.doubleToString(180/M_PI);
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaAmoebaKernelSources::amoebaOutOfPlaneBendForce, replacements), force.getForceGroup());
    cu.addForce(new ForceInfo(force));
}

double CudaCalcAmoebaOutOfPlaneBendForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    return 0.0;
}

602
603
604
605
606
607
608
void CudaCalcAmoebaOutOfPlaneBendForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaOutOfPlaneBendForce& force) {
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumOutOfPlaneBends()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumOutOfPlaneBends()/numContexts;
    if (numOutOfPlaneBends != endIndex-startIndex)
        throw OpenMMException("updateParametersInContext: The number of out-of-plane bends has changed");
609
610
    if (numOutOfPlaneBends == 0)
        return;
611
612
613
614
615
616
617
618
619
620
    
    // Record the per-bend parameters.
    
    vector<float> paramVector(numOutOfPlaneBends);
    for (int i = 0; i < numOutOfPlaneBends; i++) {
        int atom1, atom2, atom3, atom4;
        double k;
        force.getOutOfPlaneBendParameters(startIndex+i, atom1, atom2, atom3, atom4, k);
        paramVector[i] = (float) k;
    }
621
    params.upload(paramVector);
622
623
624
625
626
627
    
    // Mark that the current reordering may be invalid.
    
    cu.invalidateMolecules();
}

628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
/* -------------------------------------------------------------------------- *
 *                           AmoebaTorsionTorsion                             *
 * -------------------------------------------------------------------------- */

class CudaCalcAmoebaTorsionTorsionForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const AmoebaTorsionTorsionForce& force) : force(force) {
    }
    int getNumParticleGroups() {
        return force.getNumTorsionTorsions();
    }
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        int particle1, particle2, particle3, particle4, particle5, chiralCheckAtomIndex, gridIndex;
        force.getTorsionTorsionParameters(index, particle1, particle2, particle3, particle4, particle5, chiralCheckAtomIndex, gridIndex);
        particles.resize(5);
        particles[0] = particle1;
        particles[1] = particle2;
        particles[2] = particle3;
        particles[3] = particle4;
        particles[4] = particle5;
    }
    bool areGroupsIdentical(int group1, int group2) {
        int particle1, particle2, particle3, particle4, particle5;
        int chiral1, chiral2, grid1, grid2;
        force.getTorsionTorsionParameters(group1, particle1, particle2, particle3, particle4, particle5, chiral1, grid1);
        force.getTorsionTorsionParameters(group2, particle1, particle2, particle3, particle4, particle5, chiral2, grid2);
        return (grid1 == grid2);
    }
private:
    const AmoebaTorsionTorsionForce& force;
};

660
CudaCalcAmoebaTorsionTorsionForceKernel::CudaCalcAmoebaTorsionTorsionForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
661
                CalcAmoebaTorsionTorsionForceKernel(name, platform), cu(cu), system(system) {
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
}

void CudaCalcAmoebaTorsionTorsionForceKernel::initialize(const System& system, const AmoebaTorsionTorsionForce& force) {
    cu.setAsCurrent();
    int numContexts = cu.getPlatformData().contexts.size();
    int startIndex = cu.getContextIndex()*force.getNumTorsionTorsions()/numContexts;
    int endIndex = (cu.getContextIndex()+1)*force.getNumTorsionTorsions()/numContexts;
    numTorsionTorsions = endIndex-startIndex;
    if (numTorsionTorsions == 0)
        return;
    
    // Record torsion parameters.
    
    vector<vector<int> > atoms(numTorsionTorsions, vector<int>(5));
    vector<int2> torsionParamsVec(numTorsionTorsions);
677
    torsionParams.initialize<int2>(cu, numTorsionTorsions, "torsionTorsionParams");
678
679
    for (int i = 0; i < numTorsionTorsions; i++)
        force.getTorsionTorsionParameters(startIndex+i, atoms[i][0], atoms[i][1], atoms[i][2], atoms[i][3], atoms[i][4], torsionParamsVec[i].x, torsionParamsVec[i].y);
680
    torsionParams.upload(torsionParamsVec);
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
    
    // Record the grids.
    
    vector<float4> gridValuesVec;
    vector<float4> gridParamsVec;
    for (int i = 0; i < force.getNumTorsionTorsionGrids(); i++) {
        const TorsionTorsionGrid& initialGrid = force.getTorsionTorsionGrid(i);

        // check if grid needs to be reordered: x-angle should be 'slow' index

        bool reordered = false;
        TorsionTorsionGrid reorderedGrid;
        if (initialGrid[0][0][0] != initialGrid[0][1][0]) {
            AmoebaTorsionTorsionForceImpl::reorderGrid(initialGrid, reorderedGrid);
            reordered = true;
        }
        const TorsionTorsionGrid& grid = (reordered ? reorderedGrid : initialGrid);
        float range = grid[0][grid[0].size()-1][1] - grid[0][0][1];
        gridParamsVec.push_back(make_float4(gridValuesVec.size(), grid[0][0][0], range/(grid.size()-1), grid.size()));
        for (int j = 0; j < grid.size(); j++)
            for (int k = 0; k < grid[j].size(); k++)
                gridValuesVec.push_back(make_float4((float) grid[j][k][2], (float) grid[j][k][3], (float) grid[j][k][4], (float) grid[j][k][5]));
    }
704
705
706
707
    gridValues.initialize<float4>(cu, gridValuesVec.size(), "torsionTorsionGridValues");
    gridParams.initialize<float4>(cu, gridParamsVec.size(), "torsionTorsionGridParams");
    gridValues.upload(gridValuesVec);
    gridParams.upload(gridParamsVec);
708
    map<string, string> replacements;
709
    replacements["APPLY_PERIODIC"] = (force.usesPeriodicBoundaryConditions() ? "1" : "0");
710
711
712
    replacements["GRID_VALUES"] = cu.getBondedUtilities().addArgument(gridValues.getDevicePointer(), "float4");
    replacements["GRID_PARAMS"] = cu.getBondedUtilities().addArgument(gridParams.getDevicePointer(), "float4");
    replacements["TORSION_PARAMS"] = cu.getBondedUtilities().addArgument(torsionParams.getDevicePointer(), "int2");
713
714
715
716
717
718
719
720
721
722
    replacements["RAD_TO_DEG"] = cu.doubleToString(180/M_PI);
    cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CudaAmoebaKernelSources::amoebaTorsionTorsionForce, replacements), force.getForceGroup());
    cu.getBondedUtilities().addPrefixCode(CudaAmoebaKernelSources::bicubic);
    cu.addForce(new ForceInfo(force));
}

double CudaCalcAmoebaTorsionTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    return 0.0;
}

723
724
725
726
727
728
729
730
731
732
733
734
735
736
/* -------------------------------------------------------------------------- *
 *                             AmoebaMultipole                                *
 * -------------------------------------------------------------------------- */

class CudaCalcAmoebaMultipoleForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const AmoebaMultipoleForce& force) : force(force) {
    }
    bool areParticlesIdentical(int particle1, int particle2) {
        double charge1, charge2, thole1, thole2, damping1, damping2, polarity1, polarity2;
        int axis1, axis2, multipole11, multipole12, multipole21, multipole22, multipole31, multipole32;
        vector<double> dipole1, dipole2, quadrupole1, quadrupole2;
        force.getMultipoleParameters(particle1, charge1, dipole1, quadrupole1, axis1, multipole11, multipole21, multipole31, thole1, damping1, polarity1);
        force.getMultipoleParameters(particle2, charge2, dipole2, quadrupole2, axis2, multipole12, multipole22, multipole32, thole2, damping2, polarity2);
737
        if (charge1 != charge2 || thole1 != thole2 || damping1 != damping2 || polarity1 != polarity2 || axis1 != axis2) {
738
739
            return false;
        }
740
741
        for (int i = 0; i < (int) dipole1.size(); ++i) {
            if (dipole1[i] != dipole2[i]) {
742
743
744
                return false;
            }
        }
745
746
        for (int i = 0; i < (int) quadrupole1.size(); ++i) {
            if (quadrupole1[i] != quadrupole2[i]) {
747
748
749
750
751
                return false;
            }
        }
        return true;
    }
752
753
754
755
756
757
758
759
760
761
762
    int getNumParticleGroups() {
        return 7*force.getNumMultipoles();
    }
    void getParticlesInGroup(int index, vector<int>& particles) {
        int particle = index/7;
        int type = index-7*particle;
        force.getCovalentMap(particle, AmoebaMultipoleForce::CovalentType(type), particles);
    }
    bool areGroupsIdentical(int group1, int group2) {
        return ((group1%7) == (group2%7));
    }
763
764
765
766
private:
    const AmoebaMultipoleForce& force;
};

767
CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
Peter Eastman's avatar
Peter Eastman committed
768
        CalcAmoebaMultipoleForceKernel(name, platform), cu(cu), system(system), hasInitializedScaleFactors(false), hasInitializedFFT(false), multipolesAreValid(false), hasCreatedEvent(false),
769
        sort(NULL), gkKernel(NULL) {
770
771
772
773
}

CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
    cu.setAsCurrent();
774
775
776
777
    if (sort != NULL)
        delete sort;
    if (hasInitializedFFT)
        cufftDestroy(fft);
Peter Eastman's avatar
Peter Eastman committed
778
779
    if (hasCreatedEvent)
        cuEventDestroy(syncEvent);
780
781
}

782
783
784
785
786
787
788
void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const AmoebaMultipoleForce& force) {
    cu.setAsCurrent();

    // Initialize multipole parameters.

    numMultipoles = force.getNumMultipoles();
    CudaArray& posq = cu.getPosq();
789
790
791
    vector<double4> temp(posq.getSize());
    float4* posqf = (float4*) &temp[0];
    double4* posqd = (double4*) &temp[0];
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
    vector<float2> dampingAndTholeVec;
    vector<float> polarizabilityVec;
    vector<float> molecularDipolesVec;
    vector<float> molecularQuadrupolesVec;
    vector<int4> multipoleParticlesVec;
    for (int i = 0; i < numMultipoles; i++) {
        double charge, thole, damping, polarity;
        int axisType, atomX, atomY, atomZ;
        vector<double> dipole, quadrupole;
        force.getMultipoleParameters(i, charge, dipole, quadrupole, axisType, atomZ, atomX, atomY, thole, damping, polarity);
        if (cu.getUseDoublePrecision())
            posqd[i] = make_double4(0, 0, 0, charge);
        else
            posqf[i] = make_float4(0, 0, 0, (float) charge);
        dampingAndTholeVec.push_back(make_float2((float) damping, (float) thole));
        polarizabilityVec.push_back((float) polarity);
        multipoleParticlesVec.push_back(make_int4(atomX, atomY, atomZ, axisType));
        for (int j = 0; j < 3; j++)
            molecularDipolesVec.push_back((float) dipole[j]);
811
812
813
814
815
        molecularQuadrupolesVec.push_back((float) quadrupole[0]);
        molecularQuadrupolesVec.push_back((float) quadrupole[1]);
        molecularQuadrupolesVec.push_back((float) quadrupole[2]);
        molecularQuadrupolesVec.push_back((float) quadrupole[4]);
        molecularQuadrupolesVec.push_back((float) quadrupole[5]);
816
    }
817
    hasQuadrupoles = false;
peastman's avatar
peastman committed
818
819
    for (auto q : molecularQuadrupolesVec)
        if (q != 0.0)
820
            hasQuadrupoles = true;
821
822
823
824
825
826
827
    int paddedNumAtoms = cu.getPaddedNumAtoms();
    for (int i = numMultipoles; i < paddedNumAtoms; i++) {
        dampingAndTholeVec.push_back(make_float2(0, 0));
        polarizabilityVec.push_back(0);
        multipoleParticlesVec.push_back(make_int4(0, 0, 0, 0));
        for (int j = 0; j < 3; j++)
            molecularDipolesVec.push_back(0);
828
        for (int j = 0; j < 5; j++)
829
830
            molecularQuadrupolesVec.push_back(0);
    }
831
832
833
834
835
836
837
838
839
840
841
    dampingAndThole.initialize<float2>(cu, paddedNumAtoms, "dampingAndThole");
    polarizability.initialize<float>(cu, paddedNumAtoms, "polarizability");
    multipoleParticles.initialize<int4>(cu, paddedNumAtoms, "multipoleParticles");
    molecularDipoles.initialize<float>(cu, 3*paddedNumAtoms, "molecularDipoles");
    molecularQuadrupoles.initialize<float>(cu, 5*paddedNumAtoms, "molecularQuadrupoles");
    lastPositions.initialize(cu, cu.getPosq().getSize(), cu.getPosq().getElementSize(), "lastPositions");
    dampingAndThole.upload(dampingAndTholeVec);
    polarizability.upload(polarizabilityVec);
    multipoleParticles.upload(multipoleParticlesVec);
    molecularDipoles.upload(molecularDipolesVec);
    molecularQuadrupoles.upload(molecularQuadrupolesVec);
842
    posq.upload(&temp[0]);
843
844
845
    
    // Create workspace arrays.
    
846
    polarizationType = force.getPolarizationType();
847
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
848
849
850
851
852
853
854
855
856
857
858
    labFrameDipoles.initialize(cu, 3*paddedNumAtoms, elementSize, "labFrameDipoles");
    labFrameQuadrupoles.initialize(cu, 5*paddedNumAtoms, elementSize, "labFrameQuadrupoles");
    sphericalDipoles.initialize(cu, 3*paddedNumAtoms, elementSize, "sphericalDipoles");
    sphericalQuadrupoles.initialize(cu, 5*paddedNumAtoms, elementSize, "sphericalQuadrupoles");
    fracDipoles.initialize(cu, 3*paddedNumAtoms, elementSize, "fracDipoles");
    fracQuadrupoles.initialize(cu, 6*paddedNumAtoms, elementSize, "fracQuadrupoles");
    field.initialize(cu, 3*paddedNumAtoms, sizeof(long long), "field");
    fieldPolar.initialize(cu, 3*paddedNumAtoms, sizeof(long long), "fieldPolar");
    torque.initialize(cu, 3*paddedNumAtoms, sizeof(long long), "torque");
    inducedDipole.initialize(cu, 3*paddedNumAtoms, elementSize, "inducedDipole");
    inducedDipolePolar.initialize(cu, 3*paddedNumAtoms, elementSize, "inducedDipolePolar");
859
    if (polarizationType == AmoebaMultipoleForce::Mutual) {
860
861
862
863
864
865
        inducedDipoleErrors.initialize(cu, cu.getNumThreadBlocks(), sizeof(float2), "inducedDipoleErrors");
        prevDipoles.initialize(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipoles");
        prevDipolesPolar.initialize(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesPolar");
        prevErrors.initialize(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevErrors");
        diisMatrix.initialize(cu, MaxPrevDIISDipoles*MaxPrevDIISDipoles, elementSize, "diisMatrix");
        diisCoefficients.initialize(cu, MaxPrevDIISDipoles+1, sizeof(float), "diisMatrix");
Peter Eastman's avatar
Peter Eastman committed
866
867
        CHECK_RESULT(cuEventCreate(&syncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for AmoebaMultipoleForce");
        hasCreatedEvent = true;
868
869
870
    }
    else if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
        int numOrders = force.getExtrapolationCoefficients().size();
871
872
873
874
875
876
        extrapolatedDipole.initialize(cu, 3*numMultipoles*numOrders, elementSize, "extrapolatedDipole");
        extrapolatedDipolePolar.initialize(cu, 3*numMultipoles*numOrders, elementSize, "extrapolatedDipolePolar");
        inducedDipoleFieldGradient.initialize(cu, 6*paddedNumAtoms, sizeof(long long), "inducedDipoleFieldGradient");
        inducedDipoleFieldGradientPolar.initialize(cu, 6*paddedNumAtoms, sizeof(long long), "inducedDipoleFieldGradientPolar");
        extrapolatedDipoleFieldGradient.initialize(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradient");
        extrapolatedDipoleFieldGradientPolar.initialize(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradientPolar");
877
    }
878
879
880
    cu.addAutoclearBuffer(field);
    cu.addAutoclearBuffer(fieldPolar);
    cu.addAutoclearBuffer(torque);
881
882
883
884
885
886
887
888
889
890
891
892
893
    
    // Record which atoms should be flagged as exclusions based on covalent groups, and determine
    // the values for the covalent group flags.
    
    vector<vector<int> > exclusions(numMultipoles);
    for (int i = 0; i < numMultipoles; i++) {
        vector<int> atoms;
        set<int> allAtoms;
        allAtoms.insert(i);
        force.getCovalentMap(i, AmoebaMultipoleForce::Covalent12, atoms);
        allAtoms.insert(atoms.begin(), atoms.end());
        force.getCovalentMap(i, AmoebaMultipoleForce::Covalent13, atoms);
        allAtoms.insert(atoms.begin(), atoms.end());
peastman's avatar
peastman committed
894
895
        for (int atom : allAtoms)
            covalentFlagValues.push_back(make_int3(i, atom, 0));
896
897
        force.getCovalentMap(i, AmoebaMultipoleForce::Covalent14, atoms);
        allAtoms.insert(atoms.begin(), atoms.end());
peastman's avatar
peastman committed
898
899
        for (int atom : atoms)
            covalentFlagValues.push_back(make_int3(i, atom, 1));
900
        force.getCovalentMap(i, AmoebaMultipoleForce::Covalent15, atoms);
peastman's avatar
peastman committed
901
902
        for (int atom : atoms)
            covalentFlagValues.push_back(make_int3(i, atom, 2));
903
904
905
906
        allAtoms.insert(atoms.begin(), atoms.end());
        force.getCovalentMap(i, AmoebaMultipoleForce::PolarizationCovalent11, atoms);
        allAtoms.insert(atoms.begin(), atoms.end());
        exclusions[i].insert(exclusions[i].end(), allAtoms.begin(), allAtoms.end());
907
908
909
910
911
912

        // Workaround for bug in TINKER: if an atom is listed in both the PolarizationCovalent11
        // and PolarizationCovalent12 maps, the latter takes precedence.

        vector<int> atoms12;
        force.getCovalentMap(i, AmoebaMultipoleForce::PolarizationCovalent12, atoms12);
peastman's avatar
peastman committed
913
914
915
        for (int atom : atoms)
            if (find(atoms12.begin(), atoms12.end(), atom) == atoms12.end())
                polarizationFlagValues.push_back(make_int2(i, atom));
916
    }
917
918
919
    set<pair<int, int> > tilesWithExclusions;
    for (int atom1 = 0; atom1 < (int) exclusions.size(); ++atom1) {
        int x = atom1/CudaContext::TileSize;
peastman's avatar
peastman committed
920
        for (int atom2 : exclusions[atom1]) {
921
922
923
924
            int y = atom2/CudaContext::TileSize;
            tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
        }
    }
925
    
926
927
    // Record other options.
    
928
    if (polarizationType == AmoebaMultipoleForce::Mutual) {
929
930
931
932
        maxInducedIterations = force.getMutualInducedMaxIterations();
        inducedEpsilon = force.getMutualInducedTargetEpsilon();
    }
    else
933
        maxInducedIterations = 0;
934
    if (polarizationType != AmoebaMultipoleForce::Direct) {
935
936
        inducedField.initialize(cu, 3*paddedNumAtoms, sizeof(long long), "inducedField");
        inducedFieldPolar.initialize(cu, 3*paddedNumAtoms, sizeof(long long), "inducedFieldPolar");
937
    }
938
    usePME = (force.getNonbondedMethod() == AmoebaMultipoleForce::PME);
939
    
940
941
942
943
944
945
946
    // See whether there's an AmoebaGeneralizedKirkwoodForce in the System.

    const AmoebaGeneralizedKirkwoodForce* gk = NULL;
    for (int i = 0; i < system.getNumForces() && gk == NULL; i++)
        gk = dynamic_cast<const AmoebaGeneralizedKirkwoodForce*>(&system.getForce(i));
    double innerDielectric = (gk == NULL ? 1.0 : gk->getSoluteDielectric());
    
947
948
    // Create the kernels.

949
950
951
    bool useShuffle = (cu.getComputeCapability() >= 3.0 && !cu.getUseDoublePrecision());
    double fixedThreadMemory = 19*elementSize+2*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
    double inducedThreadMemory = 15*elementSize+2*sizeof(float);
952
953
    if (polarizationType == AmoebaMultipoleForce::Extrapolated)
        inducedThreadMemory += 12*elementSize;
954
    double electrostaticsThreadMemory = 0;
Peter Eastman's avatar
Peter Eastman committed
955
    if (!useShuffle)
956
        fixedThreadMemory += 3*elementSize;
957
958
959
960
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(numMultipoles);
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
961
    defines["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456/innerDielectric);
962
    if (polarizationType == AmoebaMultipoleForce::Direct)
963
        defines["DIRECT_POLARIZATION"] = "";
964
965
966
967
    else if (polarizationType == AmoebaMultipoleForce::Mutual)
        defines["MUTUAL_POLARIZATION"] = "";
    else if (polarizationType == AmoebaMultipoleForce::Extrapolated)
        defines["EXTRAPOLATED_POLARIZATION"] = "";
Peter Eastman's avatar
Peter Eastman committed
968
969
    if (useShuffle)
        defines["USE_SHUFFLE"] = "";
970
971
    if (hasQuadrupoles)
        defines["INCLUDE_QUADRUPOLES"] = "";
972
973
974
975
976
977
978
979
    defines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
    int numExclusionTiles = tilesWithExclusions.size();
    defines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
    int numContexts = cu.getPlatformData().contexts.size();
    int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
    int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
    defines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
    defines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
980
981
982
983
984
985
986
987
    maxExtrapolationOrder = force.getExtrapolationCoefficients().size();
    defines["MAX_EXTRAPOLATION_ORDER"] = cu.intToString(maxExtrapolationOrder);
    stringstream coefficients;
    for (int i = 0; i < maxExtrapolationOrder; i++) {
        if (i > 0)
            coefficients << ",";
        double sum = 0;
        for (int j = i; j < maxExtrapolationOrder; j++)
988
            sum += force.getExtrapolationCoefficients()[j];
989
990
991
        coefficients << cu.doubleToString(sum);
    }
    defines["EXTRAPOLATION_COEFFICIENTS_SUM"] = coefficients.str();
992
    if (usePME) {
peastman's avatar
peastman committed
993
994
995
        int nx, ny, nz;
        force.getPMEParameters(alpha, nx, ny, nz);
        if (nx == 0 || alpha == 0.0) {
996
997
998
            NonbondedForce nb;
            nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
            nb.setCutoffDistance(force.getCutoffDistance());
999
            NonbondedForceImpl::calcPMEParameters(system, nb, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
1000
1001
1002
            gridSizeX = CudaFFT3D::findLegalDimension(gridSizeX);
            gridSizeY = CudaFFT3D::findLegalDimension(gridSizeY);
            gridSizeZ = CudaFFT3D::findLegalDimension(gridSizeZ);
1003
        } else {
peastman's avatar
peastman committed
1004
1005
1006
            gridSizeX = CudaFFT3D::findLegalDimension(nx);
            gridSizeY = CudaFFT3D::findLegalDimension(ny);
            gridSizeZ = CudaFFT3D::findLegalDimension(nz);
1007
1008
1009
1010
1011
1012
1013
1014
        }
        defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
        defines["SQRT_PI"] = cu.doubleToString(sqrt(M_PI));
        defines["USE_EWALD"] = "";
        defines["USE_CUTOFF"] = "";
        defines["USE_PERIODIC"] = "";
        defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
    }
1015
1016
1017
1018
1019
1020
1021
    if (gk != NULL) {
        defines["USE_GK"] = "";
        defines["GK_C"] = cu.doubleToString(2.455);
        double solventDielectric = gk->getSolventDielectric();
        defines["GK_FC"] = cu.doubleToString(1*(1-solventDielectric)/(0+1*solventDielectric));
        defines["GK_FD"] = cu.doubleToString(2*(1-solventDielectric)/(1+2*solventDielectric));
        defines["GK_FQ"] = cu.doubleToString(3*(1-solventDielectric)/(2+3*solventDielectric));
1022
1023
        fixedThreadMemory += 4*elementSize;
        inducedThreadMemory += 13*elementSize;
1024
        if (polarizationType == AmoebaMultipoleForce::Mutual) {
1025
1026
            prevDipolesGk.initialize(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesGk");
            prevDipolesGkPolar.initialize(cu, 3*numMultipoles*MaxPrevDIISDipoles, elementSize, "prevDipolesGkPolar");
1027
1028
        }
        else if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
1029
            inducedThreadMemory += 12*elementSize;
1030
            int numOrders = force.getExtrapolationCoefficients().size();
1031
1032
1033
1034
1035
1036
            extrapolatedDipoleGk.initialize(cu, 3*numMultipoles*numOrders, elementSize, "extrapolatedDipoleGk");
            extrapolatedDipoleGkPolar.initialize(cu, 3*numMultipoles*numOrders, elementSize, "extrapolatedDipoleGkPolar");
            inducedDipoleFieldGradientGk.initialize(cu, 6*numMultipoles, elementSize, "inducedDipoleFieldGradientGk");
            inducedDipoleFieldGradientGkPolar.initialize(cu, 6*numMultipoles, elementSize, "inducedDipoleFieldGradientGkPolar");
            extrapolatedDipoleFieldGradientGk.initialize(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradientGk");
            extrapolatedDipoleFieldGradientGkPolar.initialize(cu, 6*numMultipoles*(numOrders-1), elementSize, "extrapolatedDipoleFieldGradientGkPolar");
1037
        }
1038
    }
1039
1040
1041
    int maxThreads = cu.getNonbondedUtilities().getForceThreadBlockSize();
    fixedFieldThreads = min(maxThreads, cu.computeThreadBlockSize(fixedThreadMemory));
    inducedFieldThreads = min(maxThreads, cu.computeThreadBlockSize(inducedThreadMemory));
1042
1043
    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoles, defines);
    computeMomentsKernel = cu.getKernel(module, "computeLabFrameMoments");
1044
    recordInducedDipolesKernel = cu.getKernel(module, "recordInducedDipoles");
1045
    mapTorqueKernel = cu.getKernel(module, "mapTorqueToForce");
1046
    computePotentialKernel = cu.getKernel(module, "computePotentialAtPoints");
1047
    defines["THREAD_BLOCK_SIZE"] = cu.intToString(fixedFieldThreads);
1048
1049
    module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleFixedField, defines);
    computeFixedFieldKernel = cu.getKernel(module, "computeFixedField");
1050
    if (polarizationType != AmoebaMultipoleForce::Direct) {
1051
        defines["THREAD_BLOCK_SIZE"] = cu.intToString(inducedFieldThreads);
peastman's avatar
peastman committed
1052
        defines["MAX_PREV_DIIS_DIPOLES"] = cu.intToString(MaxPrevDIISDipoles);
1053
1054
        module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipoleInducedField, defines);
        computeInducedFieldKernel = cu.getKernel(module, "computeInducedField");
peastman's avatar
peastman committed
1055
1056
1057
        updateInducedFieldKernel = cu.getKernel(module, "updateInducedFieldByDIIS");
        recordDIISDipolesKernel = cu.getKernel(module, "recordInducedDipolesForDIIS");
        buildMatrixKernel = cu.getKernel(module, "computeDIISMatrix");
Peter Eastman's avatar
Peter Eastman committed
1058
        solveMatrixKernel = cu.getKernel(module, "solveDIISMatrix");
1059
1060
1061
        initExtrapolatedKernel = cu.getKernel(module, "initExtrapolatedDipoles");
        iterateExtrapolatedKernel = cu.getKernel(module, "iterateExtrapolatedDipoles");
        computeExtrapolatedKernel = cu.getKernel(module, "computeExtrapolatedDipoles");
1062
        addExtrapolatedGradientKernel = cu.getKernel(module, "addExtrapolatedFieldGradientToForce");
1063
    }
1064
    stringstream electrostaticsSource;
1065
1066
1067
    electrostaticsSource << CudaKernelSources::vectorOps;
    electrostaticsSource << CudaAmoebaKernelSources::sphericalMultipoles;
    if (usePME)
1068
        electrostaticsSource << CudaAmoebaKernelSources::pmeMultipoleElectrostatics;
1069
    else
1070
        electrostaticsSource << CudaAmoebaKernelSources::multipoleElectrostatics;
1071
    electrostaticsThreadMemory = 24*elementSize+3*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
1072
1073
    electrostaticsThreads = min(maxThreads, cu.computeThreadBlockSize(electrostaticsThreadMemory));
    defines["THREAD_BLOCK_SIZE"] = cu.intToString(electrostaticsThreads);
1074
1075
    module = cu.createModule(electrostaticsSource.str(), defines);
    electrostaticsKernel = cu.getKernel(module, "computeElectrostatics");
1076
1077
1078

    // Set up PME.
    
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
    if (usePME) {
        // Create the PME kernels.

        map<string, string> pmeDefines;
        pmeDefines["EWALD_ALPHA"] = cu.doubleToString(alpha);
        pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
        pmeDefines["NUM_ATOMS"] = cu.intToString(numMultipoles);
        pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
        pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(138.9354558456);
        pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
        pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
        pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
        pmeDefines["M_PI"] = cu.doubleToString(M_PI);
1092
        pmeDefines["SQRT_PI"] = cu.doubleToString(sqrt(M_PI));
1093
        if (polarizationType == AmoebaMultipoleForce::Direct)
1094
            pmeDefines["DIRECT_POLARIZATION"] = "";
1095
1096
        else if (polarizationType == AmoebaMultipoleForce::Mutual)
            pmeDefines["MUTUAL_POLARIZATION"] = "";
1097
1098
        else if (polarizationType == AmoebaMultipoleForce::Extrapolated)
            pmeDefines["EXTRAPOLATED_POLARIZATION"] = "";
1099
        CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
1100
        pmeTransformMultipolesKernel = cu.getKernel(module, "transformMultipolesToFractionalCoordinates");
1101
        pmeTransformPotentialKernel = cu.getKernel(module, "transformPotentialToCartesianCoordinates");
1102
        pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles");
1103
        pmeSpreadInducedDipolesKernel = cu.getKernel(module, "gridSpreadInducedDipoles");
1104
        pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
1105
1106
        pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
        pmeFixedPotentialKernel = cu.getKernel(module, "computeFixedPotentialFromGrid");
1107
        pmeInducedPotentialKernel = cu.getKernel(module, "computeInducedPotentialFromGrid");
1108
        pmeFixedForceKernel = cu.getKernel(module, "computeFixedMultipoleForceAndEnergy");
1109
1110
        pmeInducedForceKernel = cu.getKernel(module, "computeInducedDipoleForceAndEnergy");
        pmeRecordInducedFieldDipolesKernel = cu.getKernel(module, "recordInducedFieldDipoles");
1111
1112
1113
1114
        cuFuncSetCacheConfig(pmeSpreadFixedMultipolesKernel, CU_FUNC_CACHE_PREFER_L1);
        cuFuncSetCacheConfig(pmeSpreadInducedDipolesKernel, CU_FUNC_CACHE_PREFER_L1);
        cuFuncSetCacheConfig(pmeFixedPotentialKernel, CU_FUNC_CACHE_PREFER_L1);
        cuFuncSetCacheConfig(pmeInducedPotentialKernel, CU_FUNC_CACHE_PREFER_L1);
1115
1116
1117
1118

        // Create required data structures.

        int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
        pmeGrid.initialize(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
        cu.addAutoclearBuffer(pmeGrid);
        pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
        pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
        pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
        pmeIgrid.initialize<int4>(cu, numMultipoles, "pmeIgrid");
        pmePhi.initialize(cu, 20*numMultipoles, elementSize, "pmePhi");
        pmePhid.initialize(cu, 10*numMultipoles, elementSize, "pmePhid");
        pmePhip.initialize(cu, 10*numMultipoles, elementSize, "pmePhip");
        pmePhidp.initialize(cu, 20*numMultipoles, elementSize, "pmePhidp");
        pmeCphi.initialize(cu, 10*numMultipoles, elementSize, "pmeCphi");
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
        sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
        cufftResult result = cufftPlan3d(&fft, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2Z : CUFFT_C2C);
        if (result != CUFFT_SUCCESS)
            throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
        hasInitializedFFT = true;

        // Initialize the b-spline moduli.

        double data[PmeOrder];
        double x = 0.0;
        data[0] = 1.0 - x;
        data[1] = x;
        for (int i = 2; i < PmeOrder; i++) {
            double denom = 1.0/i;
            data[i] = x*data[i-1]*denom;
            for (int j = 1; j < i; j++)
                data[i-j] = ((x+j)*data[i-j-1] + ((i-j+1)-x)*data[i-j])*denom;
            data[0] = (1.0-x)*data[0]*denom;
        }
        int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
        vector<double> bsplines_data(maxSize+1, 0.0);
        for (int i = 2; i <= PmeOrder+1; i++)
            bsplines_data[i] = data[i-2];
        for (int dim = 0; dim < 3; dim++) {
            int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
            vector<double> moduli(ndata);

            // get the modulus of the discrete Fourier transform

            double factor = 2.0*M_PI/ndata;
            for (int i = 0; i < ndata; i++) {
                double sc = 0.0;
                double ss = 0.0;
                for (int j = 1; j <= ndata; j++) {
                    double arg = factor*i*(j-1);
                    sc += bsplines_data[j]*cos(arg);
                    ss += bsplines_data[j]*sin(arg);
                }
                moduli[i] = sc*sc+ss*ss;
            }

            // Fix for exponential Euler spline interpolation failure.

            double eps = 1.0e-7;
            if (moduli[0] < eps)
                moduli[0] = 0.9*moduli[1];
            for (int i = 1; i < ndata-1; i++)
                if (moduli[i] < eps)
                    moduli[i] = 0.9*(moduli[i-1]+moduli[i+1]);
            if (moduli[ndata-1] < eps)
                moduli[ndata-1] = 0.9*moduli[ndata-2];

            // Compute and apply the optimal zeta coefficient.

            int jcut = 50;
            for (int i = 1; i <= ndata; i++) {
                int k = i - 1;
                if (i > ndata/2)
                    k = k - ndata;
                double zeta;
                if (k == 0)
                    zeta = 1.0;
                else {
                    double sum1 = 1.0;
                    double sum2 = 1.0;
                    factor = M_PI*k/ndata;
                    for (int j = 1; j <= jcut; j++) {
                        double arg = factor/(factor+M_PI*j);
                        sum1 += pow(arg, PmeOrder);
                        sum2 += pow(arg, 2*PmeOrder);
                    }
                    for (int j = 1; j <= jcut; j++) {
                        double arg = factor/(factor-M_PI*j);
                        sum1 += pow(arg, PmeOrder);
                        sum2 += pow(arg, 2*PmeOrder);
                    }
                    zeta = sum2/sum1;
                }
                moduli[i-1] = moduli[i-1]*zeta*zeta;
            }
            if (cu.getUseDoublePrecision()) {
                if (dim == 0)
1212
                    pmeBsplineModuliX.upload(moduli);
1213
                else if (dim == 1)
1214
                    pmeBsplineModuliY.upload(moduli);
1215
                else
1216
                    pmeBsplineModuliZ.upload(moduli);
1217
1218
1219
1220
1221
1222
            }
            else {
                vector<float> modulif(ndata);
                for (int i = 0; i < ndata; i++)
                    modulif[i] = (float) moduli[i];
                if (dim == 0)
1223
                    pmeBsplineModuliX.upload(modulif);
1224
                else if (dim == 1)
1225
                    pmeBsplineModuliY.upload(modulif);
1226
                else
1227
                    pmeBsplineModuliZ.upload(modulif);
1228
1229
1230
            }
        }
    }
1231
1232
1233
1234
1235

    // Add an interaction to the default nonbonded kernel.  This doesn't actually do any calculations.  It's
    // just so that CudaNonbondedUtilities will build the exclusion flags and maintain the neighbor list.
    
    cu.getNonbondedUtilities().addInteraction(usePME, usePME, true, force.getCutoffDistance(), exclusions, "", force.getForceGroup());
1236
    cu.getNonbondedUtilities().setUsePadding(false);
1237
1238
1239
1240
1241
1242
1243
1244
    cu.addForce(new ForceInfo(force));
}

void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() {
    hasInitializedScaleFactors = true;
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    
    // Figure out the covalent flag values to use for each atom pair.
1245
1246
1247
1248
1249
1250
1251
1252

    vector<ushort2> exclusionTiles;
    nb.getExclusionTiles().download(exclusionTiles);
    map<pair<int, int>, int> exclusionTileMap;
    for (int i = 0; i < (int) exclusionTiles.size(); i++) {
        ushort2 tile = exclusionTiles[i];
        exclusionTileMap[make_pair(tile.x, tile.y)] = i;
    }
1253
    covalentFlags.initialize<uint2>(cu, nb.getExclusions().getSize(), "covalentFlags");
1254
    vector<uint2> covalentFlagsVec(nb.getExclusions().getSize(), make_uint2(0, 0));
peastman's avatar
peastman committed
1255
1256
1257
1258
    for (int3 values : covalentFlagValues) {
        int atom1 = values.x;
        int atom2 = values.y;
        int value = values.z;
1259
1260
1261
1262
1263
1264
        int x = atom1/CudaContext::TileSize;
        int offset1 = atom1-x*CudaContext::TileSize;
        int y = atom2/CudaContext::TileSize;
        int offset2 = atom2-y*CudaContext::TileSize;
        int f1 = (value == 0 || value == 1 ? 1 : 0);
        int f2 = (value == 0 || value == 2 ? 1 : 0);
1265
        if (x == y) {
1266
            int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
1267
1268
1269
1270
1271
1272
            covalentFlagsVec[index+offset1].x |= f1<<offset2;
            covalentFlagsVec[index+offset1].y |= f2<<offset2;
            covalentFlagsVec[index+offset2].x |= f1<<offset1;
            covalentFlagsVec[index+offset2].y |= f2<<offset1;
        }
        else if (x > y) {
1273
            int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
1274
1275
1276
1277
            covalentFlagsVec[index+offset1].x |= f1<<offset2;
            covalentFlagsVec[index+offset1].y |= f2<<offset2;
        }
        else {
1278
            int index = exclusionTileMap[make_pair(y, x)]*CudaContext::TileSize;
1279
1280
1281
1282
            covalentFlagsVec[index+offset2].x |= f1<<offset1;
            covalentFlagsVec[index+offset2].y |= f2<<offset1;
        }
    }
1283
    covalentFlags.upload(covalentFlagsVec);
1284
1285
1286
    
    // Do the same for the polarization flags.
    
1287
    polarizationGroupFlags.initialize<unsigned int>(cu, nb.getExclusions().getSize(), "polarizationGroupFlags");
1288
    vector<unsigned int> polarizationGroupFlagsVec(nb.getExclusions().getSize(), 0);
peastman's avatar
peastman committed
1289
1290
1291
    for (int2 values : polarizationFlagValues) {
        int atom1 = values.x;
        int atom2 = values.y;
1292
1293
1294
1295
        int x = atom1/CudaContext::TileSize;
        int offset1 = atom1-x*CudaContext::TileSize;
        int y = atom2/CudaContext::TileSize;
        int offset2 = atom2-y*CudaContext::TileSize;
1296
        if (x == y) {
1297
            int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
1298
1299
1300
1301
            polarizationGroupFlagsVec[index+offset1] |= 1<<offset2;
            polarizationGroupFlagsVec[index+offset2] |= 1<<offset1;
        }
        else if (x > y) {
1302
            int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
1303
1304
1305
            polarizationGroupFlagsVec[index+offset1] |= 1<<offset2;
        }
        else {
1306
            int index = exclusionTileMap[make_pair(y, x)]*CudaContext::TileSize;
1307
1308
1309
            polarizationGroupFlagsVec[index+offset2] |= 1<<offset1;
        }
    }
1310
    polarizationGroupFlags.upload(polarizationGroupFlagsVec);
1311
1312
1313
}

double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
1314
    if (!hasInitializedScaleFactors) {
1315
        initializeScaleFactors();
peastman's avatar
peastman committed
1316
1317
1318
        for (auto impl : context.getForceImpls()) {
            AmoebaGeneralizedKirkwoodForceImpl* gkImpl = dynamic_cast<AmoebaGeneralizedKirkwoodForceImpl*>(impl);
            if (gkImpl != NULL) {
1319
                gkKernel = dynamic_cast<CudaCalcAmoebaGeneralizedKirkwoodForceKernel*>(&gkImpl->getKernel().getImpl());
peastman's avatar
peastman committed
1320
1321
                break;
            }
1322
1323
        }
    }
1324
1325
1326
1327
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    
    // Compute the lab frame moments.

1328
1329
1330
1331
    void* computeMomentsArgs[] = {&cu.getPosq().getDevicePointer(), &multipoleParticles.getDevicePointer(),
        &molecularDipoles.getDevicePointer(), &molecularQuadrupoles.getDevicePointer(),
        &labFrameDipoles.getDevicePointer(), &labFrameQuadrupoles.getDevicePointer(),
        &sphericalDipoles.getDevicePointer(), &sphericalQuadrupoles.getDevicePointer()};
1332
    cu.executeKernel(computeMomentsKernel, computeMomentsArgs, cu.getNumAtoms());
1333
1334
1335
    int startTileIndex = nb.getStartTileIndex();
    int numTileIndices = nb.getNumTiles();
    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
1336
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
1337
    if (!pmeGrid.isInitialized()) {
1338
1339
        // Compute induced dipoles.
        
1340
        if (gkKernel == NULL) {
1341
1342
1343
            void* computeFixedFieldArgs[] = {&field.getDevicePointer(), &fieldPolar.getDevicePointer(), &cu.getPosq().getDevicePointer(),
                &covalentFlags.getDevicePointer(), &polarizationGroupFlags.getDevicePointer(), &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
                &labFrameDipoles.getDevicePointer(), &labFrameQuadrupoles.getDevicePointer(), &dampingAndThole.getDevicePointer()};
1344
            cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
1345
1346
            void* recordInducedDipolesArgs[] = {&field.getDevicePointer(), &fieldPolar.getDevicePointer(),
                &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &polarizability.getDevicePointer()};
1347
1348
1349
1350
            cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
        }
        else {
            gkKernel->computeBornRadii();
1351
1352
1353
1354
            void* computeFixedFieldArgs[] = {&field.getDevicePointer(), &fieldPolar.getDevicePointer(), &cu.getPosq().getDevicePointer(),
                &covalentFlags.getDevicePointer(), &polarizationGroupFlags.getDevicePointer(), &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
                &gkKernel->getBornRadii().getDevicePointer(), &gkKernel->getField().getDevicePointer(),
                &labFrameDipoles.getDevicePointer(), &labFrameQuadrupoles.getDevicePointer(), &dampingAndThole.getDevicePointer()};
1355
            cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
1356
1357
1358
1359
            void* recordInducedDipolesArgs[] = {&field.getDevicePointer(), &fieldPolar.getDevicePointer(),
                &gkKernel->getField().getDevicePointer(), &gkKernel->getInducedDipoles().getDevicePointer(),
                &gkKernel->getInducedDipolesPolar().getDevicePointer(), &inducedDipole.getDevicePointer(),
                &inducedDipolePolar.getDevicePointer(), &polarizability.getDevicePointer()};
1360
1361
            cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
        }
1362
1363
1364
        
        // Iterate until the dipoles converge.
        
1365
1366
        if (polarizationType == AmoebaMultipoleForce::Extrapolated)
            computeExtrapolatedDipoles(NULL);
1367
        for (int i = 0; i < maxInducedIterations; i++) {
1368
            computeInducedField(NULL);
peastman's avatar
peastman committed
1369
1370
            bool converged = iterateDipolesByDIIS(i);
            if (converged)
1371
                break;
1372
        }
1373
1374
1375
        
        // Compute electrostatic force.
        
1376
1377
        void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &torque.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
            &cu.getPosq().getDevicePointer(), &covalentFlags.getDevicePointer(), &polarizationGroupFlags.getDevicePointer(),
1378
            &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
1379
1380
            &sphericalDipoles.getDevicePointer(), &sphericalQuadrupoles.getDevicePointer(),
            &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &dampingAndThole.getDevicePointer()};
1381
        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
1382
        if (gkKernel != NULL)
1383
            gkKernel->finishComputation(torque, labFrameDipoles, labFrameQuadrupoles, inducedDipole, inducedDipolePolar, dampingAndThole, covalentFlags, polarizationGroupFlags);
1384
    }
1385
    else {
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
        // Compute reciprocal box vectors.
        
        Vec3 boxVectors[3];
        cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
        double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
        double scale = 1.0/determinant;
        double3 recipBoxVectors[3];
        recipBoxVectors[0] = make_double3(boxVectors[1][1]*boxVectors[2][2]*scale, 0, 0);
        recipBoxVectors[1] = make_double3(-boxVectors[1][0]*boxVectors[2][2]*scale, boxVectors[0][0]*boxVectors[2][2]*scale, 0);
        recipBoxVectors[2] = make_double3((boxVectors[1][0]*boxVectors[2][1]-boxVectors[1][1]*boxVectors[2][0])*scale, -boxVectors[0][0]*boxVectors[2][1]*scale, boxVectors[0][0]*boxVectors[1][1]*scale);
        float3 recipBoxVectorsFloat[3];
        void* recipBoxVectorPointer[3];
        if (cu.getUseDoublePrecision()) {
            recipBoxVectorPointer[0] = &recipBoxVectors[0];
            recipBoxVectorPointer[1] = &recipBoxVectors[1];
            recipBoxVectorPointer[2] = &recipBoxVectors[2];
        }
        else {
            recipBoxVectorsFloat[0] = make_float3((float) recipBoxVectors[0].x, 0, 0);
            recipBoxVectorsFloat[1] = make_float3((float) recipBoxVectors[1].x, (float) recipBoxVectors[1].y, 0);
            recipBoxVectorsFloat[2] = make_float3((float) recipBoxVectors[2].x, (float) recipBoxVectors[2].y, (float) recipBoxVectors[2].z);
            recipBoxVectorPointer[0] = &recipBoxVectorsFloat[0];
            recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
            recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
        }

1412
        // Reciprocal space calculation.
1413
1414
        
        unsigned int maxTiles = nb.getInteractingTiles().getSize();
1415
1416
        void* pmeTransformMultipolesArgs[] = {&labFrameDipoles.getDevicePointer(), &labFrameQuadrupoles.getDevicePointer(),
            &fracDipoles.getDevicePointer(), &fracQuadrupoles.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1417
        cu.executeKernel(pmeTransformMultipolesKernel, pmeTransformMultipolesArgs, cu.getNumAtoms());
1418
1419
        void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &fracDipoles.getDevicePointer(), &fracQuadrupoles.getDevicePointer(),
            &pmeGrid.getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
1420
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1421
        cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
1422
        void* finishSpreadArgs[] = {&pmeGrid.getDevicePointer()};
1423
        if (cu.getUseDoublePrecision())
1424
            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid.getSize());
1425
        if (cu.getUseDoublePrecision())
1426
            cufftExecZ2Z(fft, (double2*) pmeGrid.getDevicePointer(), (double2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
1427
        else
1428
1429
1430
            cufftExecC2C(fft, (float2*) pmeGrid.getDevicePointer(), (float2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
        void* pmeConvolutionArgs[] = {&pmeGrid.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(),
            &pmeBsplineModuliZ.getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
Peter Eastman's avatar
Peter Eastman committed
1431
        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
1432
        if (cu.getUseDoublePrecision())
1433
            cufftExecZ2Z(fft, (double2*) pmeGrid.getDevicePointer(), (double2*) pmeGrid.getDevicePointer(), CUFFT_INVERSE);
1434
        else
1435
1436
1437
            cufftExecC2C(fft, (float2*) pmeGrid.getDevicePointer(), (float2*) pmeGrid.getDevicePointer(), CUFFT_INVERSE);
        void* pmeFixedPotentialArgs[] = {&pmeGrid.getDevicePointer(), &pmePhi.getDevicePointer(), &field.getDevicePointer(),
            &fieldPolar .getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles.getDevicePointer(),
1438
            cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
Peter Eastman's avatar
Peter Eastman committed
1439
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1440
        cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms());
1441
        void* pmeTransformFixedPotentialArgs[] = {&pmePhi.getDevicePointer(), &pmeCphi.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1442
        cu.executeKernel(pmeTransformPotentialKernel, pmeTransformFixedPotentialArgs, cu.getNumAtoms());
1443
1444
1445
        void* pmeFixedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque.getDevicePointer(),
            &cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles.getDevicePointer(), &labFrameQuadrupoles.getDevicePointer(),
            &fracDipoles.getDevicePointer(), &fracQuadrupoles.getDevicePointer(), &pmePhi.getDevicePointer(), &pmeCphi.getDevicePointer(),
1446
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1447
        cu.executeKernel(pmeFixedForceKernel, pmeFixedForceArgs, cu.getNumAtoms());
1448
1449
1450
        
        // Direct space calculation.
        
1451
1452
        void* computeFixedFieldArgs[] = {&field.getDevicePointer(), &fieldPolar.getDevicePointer(), &cu.getPosq().getDevicePointer(),
            &covalentFlags.getDevicePointer(), &polarizationGroupFlags.getDevicePointer(), &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
1453
            &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
1454
1455
            cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
1456
            &labFrameDipoles.getDevicePointer(), &labFrameQuadrupoles.getDevicePointer(), &dampingAndThole.getDevicePointer()};
1457
        cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
1458
1459
        void* recordInducedDipolesArgs[] = {&field.getDevicePointer(), &fieldPolar.getDevicePointer(),
            &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &polarizability.getDevicePointer()};
1460
        cu.executeKernel(recordInducedDipolesKernel, recordInducedDipolesArgs, cu.getNumAtoms());
1461
1462
1463

        // Reciprocal space calculation for the induced dipoles.

1464
1465
1466
        cu.clearBuffer(pmeGrid);
        void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(),
            &pmeGrid.getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
1467
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1468
        cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
1469
        if (cu.getUseDoublePrecision())
1470
            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid.getSize());
1471
        if (cu.getUseDoublePrecision())
1472
            cufftExecZ2Z(fft, (double2*) pmeGrid.getDevicePointer(), (double2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
1473
        else
1474
            cufftExecC2C(fft, (float2*) pmeGrid.getDevicePointer(), (float2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
Peter Eastman's avatar
Peter Eastman committed
1475
        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
1476
        if (cu.getUseDoublePrecision())
1477
            cufftExecZ2Z(fft, (double2*) pmeGrid.getDevicePointer(), (double2*) pmeGrid.getDevicePointer(), CUFFT_INVERSE);
1478
        else
1479
1480
1481
            cufftExecC2C(fft, (float2*) pmeGrid.getDevicePointer(), (float2*) pmeGrid.getDevicePointer(), CUFFT_INVERSE);
        void* pmeInducedPotentialArgs[] = {&pmeGrid.getDevicePointer(), &pmePhid.getDevicePointer(), &pmePhip.getDevicePointer(),
            &pmePhidp.getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
Peter Eastman's avatar
Peter Eastman committed
1482
            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1483
        cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
1484
        
1485
        // Iterate until the dipoles converge.
1486
        
1487
1488
        if (polarizationType == AmoebaMultipoleForce::Extrapolated)
            computeExtrapolatedDipoles(recipBoxVectorPointer);
1489
        for (int i = 0; i < maxInducedIterations; i++) {
1490
            computeInducedField(recipBoxVectorPointer);
peastman's avatar
peastman committed
1491
1492
            bool converged = iterateDipolesByDIIS(i);
            if (converged)
1493
1494
                break;
        }
1495
1496
1497
        
        // Compute electrostatic force.
        
1498
1499
        void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &torque.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
            &cu.getPosq().getDevicePointer(), &covalentFlags.getDevicePointer(), &polarizationGroupFlags.getDevicePointer(),
1500
            &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
1501
            &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(),
1502
1503
            cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
            &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
1504
1505
            &sphericalDipoles.getDevicePointer(), &sphericalQuadrupoles.getDevicePointer(),
            &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &dampingAndThole.getDevicePointer()};
1506
        cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
1507
        void* pmeTransformInducedPotentialArgs[] = {&pmePhidp.getDevicePointer(), &pmeCphi.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1508
        cu.executeKernel(pmeTransformPotentialKernel, pmeTransformInducedPotentialArgs, cu.getNumAtoms());
1509
1510
1511
1512
1513
        void* pmeInducedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque.getDevicePointer(),
            &cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles.getDevicePointer(), &labFrameQuadrupoles.getDevicePointer(),
            &fracDipoles.getDevicePointer(), &fracQuadrupoles.getDevicePointer(),
            &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &pmePhi.getDevicePointer(), &pmePhid.getDevicePointer(),
            &pmePhip.getDevicePointer(), &pmePhidp.getDevicePointer(), &pmeCphi.getDevicePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1514
        cu.executeKernel(pmeInducedForceKernel, pmeInducedForceArgs, cu.getNumAtoms());
1515
    }
1516
1517
1518
1519
    
    // If using extrapolated polarization, add in force contributions from µ(m) T µ(n).
    
    if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
1520
        if (gkKernel == NULL) {
1521
1522
            void* extrapolatedArgs[] = {&cu.getForce().getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
                &extrapolatedDipolePolar.getDevicePointer(), &extrapolatedDipoleFieldGradient.getDevicePointer(), &extrapolatedDipoleFieldGradientPolar.getDevicePointer()};
1523
1524
1525
            cu.executeKernel(addExtrapolatedGradientKernel, extrapolatedArgs, numMultipoles);
        }
        else {
1526
1527
1528
1529
            void* extrapolatedArgs[] = {&cu.getForce().getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
                &extrapolatedDipolePolar.getDevicePointer(), &extrapolatedDipoleFieldGradient.getDevicePointer(), &extrapolatedDipoleFieldGradientPolar.getDevicePointer(),
                &extrapolatedDipoleGk.getDevicePointer(), &extrapolatedDipoleGkPolar.getDevicePointer(),
                &extrapolatedDipoleFieldGradientGk.getDevicePointer(), &extrapolatedDipoleFieldGradientGkPolar.getDevicePointer()};
1530
1531
            cu.executeKernel(addExtrapolatedGradientKernel, extrapolatedArgs, numMultipoles);
        }
1532
    }
Peter Eastman's avatar
Peter Eastman committed
1533
1534
1535

    // Map torques to force.

1536
1537
    void* mapTorqueArgs[] = {&cu.getForce().getDevicePointer(), &torque.getDevicePointer(),
        &cu.getPosq().getDevicePointer(), &multipoleParticles.getDevicePointer()};
Peter Eastman's avatar
Peter Eastman committed
1538
    cu.executeKernel(mapTorqueKernel, mapTorqueArgs, cu.getNumAtoms());
1539
1540
1541
    
    // Record the current atom positions so we can tell later if they have changed.
    
1542
    cu.getPosq().copyTo(lastPositions);
1543
    multipolesAreValid = true;
1544
1545
1546
    return 0.0;
}

1547
1548
1549
1550
1551
void CudaCalcAmoebaMultipoleForceKernel::computeInducedField(void** recipBoxVectorPointer) {
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    int startTileIndex = nb.getStartTileIndex();
    int numTileIndices = nb.getNumTiles();
    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
1552
1553
    unsigned int maxTiles = 0;
    vector<void*> computeInducedFieldArgs;
1554
1555
    computeInducedFieldArgs.push_back(&inducedField.getDevicePointer());
    computeInducedFieldArgs.push_back(&inducedFieldPolar.getDevicePointer());
1556
1557
    computeInducedFieldArgs.push_back(&cu.getPosq().getDevicePointer());
    computeInducedFieldArgs.push_back(&nb.getExclusionTiles().getDevicePointer());
1558
1559
    computeInducedFieldArgs.push_back(&inducedDipole.getDevicePointer());
    computeInducedFieldArgs.push_back(&inducedDipolePolar.getDevicePointer());
1560
1561
1562
    computeInducedFieldArgs.push_back(&startTileIndex);
    computeInducedFieldArgs.push_back(&numTileIndices);
    if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
1563
1564
        computeInducedFieldArgs.push_back(&inducedDipoleFieldGradient.getDevicePointer());
        computeInducedFieldArgs.push_back(&inducedDipoleFieldGradientPolar.getDevicePointer());
1565
    }
1566
    if (pmeGrid.isInitialized()) {
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
        computeInducedFieldArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
        computeInducedFieldArgs.push_back(&nb.getInteractionCount().getDevicePointer());
        computeInducedFieldArgs.push_back(cu.getPeriodicBoxSizePointer());
        computeInducedFieldArgs.push_back(cu.getInvPeriodicBoxSizePointer());
        computeInducedFieldArgs.push_back(cu.getPeriodicBoxVecXPointer());
        computeInducedFieldArgs.push_back(cu.getPeriodicBoxVecYPointer());
        computeInducedFieldArgs.push_back(cu.getPeriodicBoxVecZPointer());
        computeInducedFieldArgs.push_back(&maxTiles);
        computeInducedFieldArgs.push_back(&nb.getBlockCenters().getDevicePointer());
        computeInducedFieldArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
    }
    if (gkKernel != NULL) {
1579
1580
1581
1582
1583
        computeInducedFieldArgs.push_back(&gkKernel->getInducedField().getDevicePointer());
        computeInducedFieldArgs.push_back(&gkKernel->getInducedFieldPolar().getDevicePointer());
        computeInducedFieldArgs.push_back(&gkKernel->getInducedDipoles().getDevicePointer());
        computeInducedFieldArgs.push_back(&gkKernel->getInducedDipolesPolar().getDevicePointer());
        computeInducedFieldArgs.push_back(&gkKernel->getBornRadii().getDevicePointer());
1584
        if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
1585
1586
            computeInducedFieldArgs.push_back(&inducedDipoleFieldGradientGk.getDevicePointer());
            computeInducedFieldArgs.push_back(&inducedDipoleFieldGradientGkPolar.getDevicePointer());
1587
        }
1588
    }
1589
1590
1591
    computeInducedFieldArgs.push_back(&dampingAndThole.getDevicePointer());
    cu.clearBuffer(inducedField);
    cu.clearBuffer(inducedFieldPolar);
1592
    if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
1593
1594
        cu.clearBuffer(inducedDipoleFieldGradient);
        cu.clearBuffer(inducedDipoleFieldGradientPolar);
1595
1596
    }
    if (gkKernel != NULL) {
1597
1598
        cu.clearBuffer(gkKernel->getInducedField());
        cu.clearBuffer(gkKernel->getInducedFieldPolar());
1599
        if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
1600
1601
            cu.clearBuffer(inducedDipoleFieldGradientGk);
            cu.clearBuffer(inducedDipoleFieldGradientGkPolar);
1602
1603
        }
    }
1604
    if (!pmeGrid.isInitialized())
1605
        cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
1606
    else {
1607
1608
        maxTiles = nb.getInteractingTiles().getSize();
        cu.executeKernel(computeInducedFieldKernel, &computeInducedFieldArgs[0], numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
1609
1610
1611
        cu.clearBuffer(pmeGrid);
        void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(),
            &pmeGrid.getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
1612
1613
1614
            recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
        cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
        if (cu.getUseDoublePrecision()) {
1615
1616
            void* finishSpreadArgs[] = {&pmeGrid.getDevicePointer()};
            cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid.getSize());
1617
1618
        }
        if (cu.getUseDoublePrecision())
1619
            cufftExecZ2Z(fft, (double2*) pmeGrid.getDevicePointer(), (double2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
1620
        else
1621
1622
1623
            cufftExecC2C(fft, (float2*) pmeGrid.getDevicePointer(), (float2*) pmeGrid.getDevicePointer(), CUFFT_FORWARD);
        void* pmeConvolutionArgs[] = {&pmeGrid.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(),
            &pmeBsplineModuliZ.getDevicePointer(), cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
Peter Eastman's avatar
Peter Eastman committed
1624
        cu.executeKernel(pmeConvolutionKernel, pmeConvolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
1625
        if (cu.getUseDoublePrecision())
1626
            cufftExecZ2Z(fft, (double2*) pmeGrid.getDevicePointer(), (double2*) pmeGrid.getDevicePointer(), CUFFT_INVERSE);
1627
        else
1628
1629
1630
            cufftExecC2C(fft, (float2*) pmeGrid.getDevicePointer(), (float2*) pmeGrid.getDevicePointer(), CUFFT_INVERSE);
        void* pmeInducedPotentialArgs[] = {&pmeGrid.getDevicePointer(), &pmePhid.getDevicePointer(), &pmePhip.getDevicePointer(),
            &pmePhidp.getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(),
Peter Eastman's avatar
Peter Eastman committed
1631
            cu.getPeriodicBoxVecZPointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1632
        cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
1633
        if (polarizationType == AmoebaMultipoleForce::Extrapolated) {
1634
1635
1636
            void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid.getDevicePointer(), &pmePhip.getDevicePointer(),
                &inducedField.getDevicePointer(), &inducedFieldPolar.getDevicePointer(), &inducedDipole.getDevicePointer(),
                &inducedDipolePolar.getDevicePointer(), &inducedDipoleFieldGradient.getDevicePointer(), &inducedDipoleFieldGradientPolar.getDevicePointer(),
1637
                recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1638
1639
1640
            cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
        }
        else {
1641
1642
            void* pmeRecordInducedFieldDipolesArgs[] = {&pmePhid.getDevicePointer(), &pmePhip.getDevicePointer(),
                &inducedField.getDevicePointer(), &inducedFieldPolar.getDevicePointer(), &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(),
1643
                recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
1644
1645
            cu.executeKernel(pmeRecordInducedFieldDipolesKernel, pmeRecordInducedFieldDipolesArgs, cu.getNumAtoms());
        }
1646
1647
1648
    }
}

peastman's avatar
peastman committed
1649
bool CudaCalcAmoebaMultipoleForceKernel::iterateDipolesByDIIS(int iteration) {
peastman's avatar
peastman committed
1650
1651
1652
1653
    void* npt = NULL;
    bool trueValue = true, falseValue = false;
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
    
peastman's avatar
peastman committed
1654
    // Record the dipoles and errors into the lists of previous dipoles.
peastman's avatar
peastman committed
1655
1656
    
    if (gkKernel != NULL) {
1657
1658
1659
1660
        void* recordDIISDipolesGkArgs[] = {&field.getDevicePointer(), &fieldPolar.getDevicePointer(), &gkKernel->getField().getDevicePointer(), &gkKernel->getInducedField().getDevicePointer(),
            &gkKernel->getInducedFieldPolar().getDevicePointer(), &gkKernel->getInducedDipoles().getDevicePointer(), &gkKernel->getInducedDipolesPolar().getDevicePointer(), 
            &polarizability.getDevicePointer(), &inducedDipoleErrors.getDevicePointer(), &prevDipolesGk.getDevicePointer(),
            &prevDipolesGkPolar.getDevicePointer(), &prevErrors.getDevicePointer(), &iteration, &falseValue, &diisMatrix.getDevicePointer()};
peastman's avatar
peastman committed
1661
1662
        cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesGkArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
    }
1663
1664
1665
1666
    void* recordDIISDipolesArgs[] = {&field.getDevicePointer(), &fieldPolar.getDevicePointer(), &npt, &inducedField.getDevicePointer(),
        &inducedFieldPolar.getDevicePointer(), &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(),
        &polarizability.getDevicePointer(), &inducedDipoleErrors.getDevicePointer(), &prevDipoles.getDevicePointer(),
        &prevDipolesPolar.getDevicePointer(), &prevErrors.getDevicePointer(), &iteration, &trueValue, &diisMatrix.getDevicePointer()};
peastman's avatar
peastman committed
1667
1668
    cu.executeKernel(recordDIISDipolesKernel, recordDIISDipolesArgs, cu.getNumThreadBlocks()*cu.ThreadBlockSize, cu.ThreadBlockSize, cu.ThreadBlockSize*elementSize*2);
    float2* errors = (float2*) cu.getPinnedBuffer();
1669
    inducedDipoleErrors.download(errors, false);
Peter Eastman's avatar
Peter Eastman committed
1670
    cuEventRecord(syncEvent, cu.getCurrentStream());
peastman's avatar
peastman committed
1671
    
peastman's avatar
peastman committed
1672
    // Build the DIIS matrix.
peastman's avatar
peastman committed
1673
1674
    
    int numPrev = (iteration+1 < MaxPrevDIISDipoles ? iteration+1 : MaxPrevDIISDipoles);
1675
    void* buildMatrixArgs[] = {&prevErrors.getDevicePointer(), &iteration, &diisMatrix.getDevicePointer()};
1676
    int threadBlocks = min(numPrev, cu.getNumThreadBlocks());
Peter Eastman's avatar
Peter Eastman committed
1677
1678
    int blockSize = 512;
    cu.executeKernel(buildMatrixKernel, buildMatrixArgs, threadBlocks*blockSize, blockSize, blockSize*elementSize);
Peter Eastman's avatar
Peter Eastman committed
1679
1680
1681
    
    // Solve the matrix.

1682
    void* solveMatrixArgs[] = {&iteration, &diisMatrix.getDevicePointer(), &diisCoefficients.getDevicePointer()};
Peter Eastman's avatar
Peter Eastman committed
1683
    cu.executeKernel(solveMatrixKernel, solveMatrixArgs, 32, 32);
peastman's avatar
peastman committed
1684
1685
1686
    
    // Determine whether the iteration has converged.
    
Peter Eastman's avatar
Peter Eastman committed
1687
    cuEventSynchronize(syncEvent);
peastman's avatar
peastman committed
1688
    double total1 = 0.0, total2 = 0.0;
1689
    for (int j = 0; j < inducedDipoleErrors.getSize(); j++) {
peastman's avatar
peastman committed
1690
1691
1692
1693
1694
        total1 += errors[j].x;
        total2 += errors[j].y;
    }
    if (48.033324*sqrt(max(total1, total2)/cu.getNumAtoms()) < inducedEpsilon)
        return true;
peastman's avatar
peastman committed
1695
1696
1697
    
    // Compute the dipoles.
    
1698
1699
    void* updateInducedFieldArgs[] = {&inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(),
        &prevDipoles.getDevicePointer(), &prevDipolesPolar.getDevicePointer(), &diisCoefficients.getDevicePointer(), &numPrev};
Peter Eastman's avatar
Peter Eastman committed
1700
    cu.executeKernel(updateInducedFieldKernel, updateInducedFieldArgs, 3*cu.getNumAtoms(), 256);
peastman's avatar
peastman committed
1701
    if (gkKernel != NULL) {
1702
1703
        void* updateInducedFieldGkArgs[] = {&gkKernel->getInducedDipoles().getDevicePointer(), &gkKernel->getInducedDipolesPolar().getDevicePointer(),
            &prevDipolesGk.getDevicePointer(), &prevDipolesGkPolar.getDevicePointer(), &diisCoefficients.getDevicePointer(), &numPrev};
Peter Eastman's avatar
Peter Eastman committed
1704
        cu.executeKernel(updateInducedFieldKernel, updateInducedFieldGkArgs, 3*cu.getNumAtoms(), 256);
peastman's avatar
peastman committed
1705
    }
peastman's avatar
peastman committed
1706
    return false;
peastman's avatar
peastman committed
1707
1708
}

1709
1710
1711
void CudaCalcAmoebaMultipoleForceKernel::computeExtrapolatedDipoles(void** recipBoxVectorPointer) {
    // Start by storing the direct dipoles as PT0

1712
    if (gkKernel == NULL) {
1713
1714
1715
        void* initArgs[] = {&inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
            &extrapolatedDipolePolar.getDevicePointer(), &inducedDipoleFieldGradient.getDevicePointer(), &inducedDipoleFieldGradientPolar.getDevicePointer()};
        cu.executeKernel(initExtrapolatedKernel, initArgs, extrapolatedDipole.getSize());
1716
1717
    }
    else {
1718
1719
1720
1721
1722
        void* initArgs[] = {&inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
            &extrapolatedDipolePolar.getDevicePointer(), &inducedDipoleFieldGradient.getDevicePointer(), &inducedDipoleFieldGradientPolar.getDevicePointer(),
            &gkKernel->getInducedDipoles().getDevicePointer(), &gkKernel->getInducedDipolesPolar().getDevicePointer(), &extrapolatedDipoleGk.getDevicePointer(),
            &extrapolatedDipoleGkPolar.getDevicePointer(), &inducedDipoleFieldGradientGk.getDevicePointer(), &inducedDipoleFieldGradientGkPolar.getDevicePointer()};
        cu.executeKernel(initExtrapolatedKernel, initArgs, extrapolatedDipole.getSize());
1723
    }
1724
1725
1726
1727
1728

    // Recursively apply alpha.Tau to the µ_(n) components to generate µ_(n+1), and store the result

    for (int order = 1; order < maxExtrapolationOrder; ++order) {
        computeInducedField(recipBoxVectorPointer);
1729
        if (gkKernel == NULL) {
1730
1731
1732
1733
1734
            void* iterateArgs[] = {&order, &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
                &extrapolatedDipolePolar.getDevicePointer(), &inducedDipoleFieldGradient.getDevicePointer(), &inducedDipoleFieldGradientPolar.getDevicePointer(),
                &inducedField.getDevicePointer(), &inducedFieldPolar.getDevicePointer(), &extrapolatedDipoleFieldGradient.getDevicePointer(), &extrapolatedDipoleFieldGradientPolar.getDevicePointer(),
                &polarizability.getDevicePointer()};
            cu.executeKernel(iterateExtrapolatedKernel, iterateArgs, extrapolatedDipole.getSize());
1735
1736
        }
        else {
1737
1738
1739
1740
1741
1742
1743
1744
1745
            void* iterateArgs[] = {&order, &inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
                &extrapolatedDipolePolar.getDevicePointer(), &inducedDipoleFieldGradient.getDevicePointer(), &inducedDipoleFieldGradientPolar.getDevicePointer(),
                &inducedField.getDevicePointer(), &inducedFieldPolar.getDevicePointer(), &extrapolatedDipoleFieldGradient.getDevicePointer(), &extrapolatedDipoleFieldGradientPolar.getDevicePointer(),
                &gkKernel->getInducedDipoles().getDevicePointer(), &gkKernel->getInducedDipolesPolar().getDevicePointer(), &extrapolatedDipoleGk.getDevicePointer(),
                &extrapolatedDipoleGkPolar.getDevicePointer(), &inducedDipoleFieldGradientGk.getDevicePointer(), &inducedDipoleFieldGradientGkPolar.getDevicePointer(),
                &gkKernel->getInducedField().getDevicePointer(), &gkKernel->getInducedFieldPolar().getDevicePointer(),
                &extrapolatedDipoleFieldGradientGk.getDevicePointer(), &extrapolatedDipoleFieldGradientGkPolar.getDevicePointer(),
                &polarizability.getDevicePointer()};
            cu.executeKernel(iterateExtrapolatedKernel, iterateArgs, extrapolatedDipole.getSize());
1746
        }
1747
1748
1749
1750
    }
    
    // Take a linear combination of the µ_(n) components to form the total dipole

1751
    if (gkKernel == NULL) {
1752
1753
1754
        void* computeArgs[] = {&inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
                &extrapolatedDipolePolar.getDevicePointer()};
        cu.executeKernel(computeExtrapolatedKernel, computeArgs, extrapolatedDipole.getSize());
1755
1756
    }
    else {
1757
1758
1759
1760
        void* computeArgs[] = {&inducedDipole.getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &extrapolatedDipole.getDevicePointer(),
                &extrapolatedDipolePolar.getDevicePointer(), &gkKernel->getInducedDipoles().getDevicePointer(), &gkKernel->getInducedDipolesPolar().getDevicePointer(),
                &extrapolatedDipoleGk.getDevicePointer(), &extrapolatedDipoleGkPolar.getDevicePointer()};
        cu.executeKernel(computeExtrapolatedKernel, computeArgs, extrapolatedDipole.getSize());
1761
    }
1762
    computeInducedField(recipBoxVectorPointer);
1763
1764
}

1765
1766
1767
1768
1769
1770
void CudaCalcAmoebaMultipoleForceKernel::ensureMultipolesValid(ContextImpl& context) {
    if (multipolesAreValid) {
        int numParticles = cu.getNumAtoms();
        if (cu.getUseDoublePrecision()) {
            vector<double4> pos1, pos2;
            cu.getPosq().download(pos1);
1771
            lastPositions.download(pos2);
1772
1773
1774
1775
1776
1777
1778
1779
1780
            for (int i = 0; i < numParticles; i++)
                if (pos1[i].x != pos2[i].x || pos1[i].y != pos2[i].y || pos1[i].z != pos2[i].z) {
                    multipolesAreValid = false;
                    break;
                }
        }
        else {
            vector<float4> pos1, pos2;
            cu.getPosq().download(pos1);
1781
            lastPositions.download(pos2);
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
            for (int i = 0; i < numParticles; i++)
                if (pos1[i].x != pos2[i].x || pos1[i].y != pos2[i].y || pos1[i].z != pos2[i].z) {
                    multipolesAreValid = false;
                    break;
                }
        }
    }
    if (!multipolesAreValid)
        context.calcForcesAndEnergy(false, false, -1);
}

1793
1794
1795
1796
1797
1798
1799

void CudaCalcAmoebaMultipoleForceKernel::getLabFramePermanentDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
    ensureMultipolesValid(context);
    int numParticles = cu.getNumAtoms();
    dipoles.resize(numParticles);
    const vector<int>& order = cu.getAtomIndex();
    if (cu.getUseDoublePrecision()) {
1800
        vector<double> labDipoleVec;
1801
        labFrameDipoles.download(labDipoleVec);
1802
        for (int i = 0; i < numParticles; i++)
1803
            dipoles[order[i]] = Vec3(labDipoleVec[3*i], labDipoleVec[3*i+1], labDipoleVec[3*i+2]);
1804
1805
    }
    else {
1806
        vector<float> labDipoleVec;
1807
        labFrameDipoles.download(labDipoleVec);
1808
        for (int i = 0; i < numParticles; i++)
1809
            dipoles[order[i]] = Vec3(labDipoleVec[3*i], labDipoleVec[3*i+1], labDipoleVec[3*i+2]);
1810
1811
1812
1813
    }
}


1814
1815
1816
1817
void CudaCalcAmoebaMultipoleForceKernel::getInducedDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
    ensureMultipolesValid(context);
    int numParticles = cu.getNumAtoms();
    dipoles.resize(numParticles);
peastman's avatar
peastman committed
1818
    const vector<int>& order = cu.getAtomIndex();
1819
1820
    if (cu.getUseDoublePrecision()) {
        vector<double> d;
1821
        inducedDipole.download(d);
1822
        for (int i = 0; i < numParticles; i++)
peastman's avatar
peastman committed
1823
            dipoles[order[i]] = Vec3(d[3*i], d[3*i+1], d[3*i+2]);
1824
1825
1826
    }
    else {
        vector<float> d;
1827
        inducedDipole.download(d);
1828
        for (int i = 0; i < numParticles; i++)
peastman's avatar
peastman committed
1829
            dipoles[order[i]] = Vec3(d[3*i], d[3*i+1], d[3*i+2]);
1830
1831
1832
    }
}

1833
1834
1835
1836
1837
1838
1839

void CudaCalcAmoebaMultipoleForceKernel::getTotalDipoles(ContextImpl& context, vector<Vec3>& dipoles) {
    ensureMultipolesValid(context);
    int numParticles = cu.getNumAtoms();
    dipoles.resize(numParticles);
    const vector<int>& order = cu.getAtomIndex();
    if (cu.getUseDoublePrecision()) {
1840
        vector<double4> posqVec;
1841
1842
        vector<double> labDipoleVec;
        vector<double> inducedDipoleVec;
1843
1844
1845
        double totalDipoleVecX;
        double totalDipoleVecY;
        double totalDipoleVecZ;
1846
1847
        inducedDipole.download(inducedDipoleVec);
        labFrameDipoles.download(labDipoleVec);
1848
1849
        cu.getPosq().download(posqVec);
        for (int i = 0; i < numParticles; i++) {
1850
1851
1852
            totalDipoleVecX = labDipoleVec[3*i] + inducedDipoleVec[3*i];
            totalDipoleVecY = labDipoleVec[3*i+1] + inducedDipoleVec[3*i+1];
            totalDipoleVecZ = labDipoleVec[3*i+2] + inducedDipoleVec[3*i+2];
1853
            dipoles[order[i]] = Vec3(totalDipoleVecX, totalDipoleVecY, totalDipoleVecZ);
1854
1855
1856
        }
    }
    else {
1857
        vector<float4> posqVec;
1858
1859
        vector<float> labDipoleVec;
        vector<float> inducedDipoleVec;
1860
1861
1862
        float totalDipoleVecX;
        float totalDipoleVecY;
        float totalDipoleVecZ;
1863
1864
        inducedDipole.download(inducedDipoleVec);
        labFrameDipoles.download(labDipoleVec);
1865
1866
        cu.getPosq().download(posqVec);
        for (int i = 0; i < numParticles; i++) {
1867
1868
1869
            totalDipoleVecX = labDipoleVec[3*i] + inducedDipoleVec[3*i];
            totalDipoleVecY = labDipoleVec[3*i+1] + inducedDipoleVec[3*i+1];
            totalDipoleVecZ = labDipoleVec[3*i+2] + inducedDipoleVec[3*i+2];
1870
            dipoles[order[i]] = Vec3(totalDipoleVecX, totalDipoleVecY, totalDipoleVecZ);
1871
1872
1873
1874
        }
    }
}

1875
void CudaCalcAmoebaMultipoleForceKernel::getElectrostaticPotential(ContextImpl& context, const vector<Vec3>& inputGrid, vector<double>& outputElectrostaticPotential) {
1876
    ensureMultipolesValid(context);
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
    int numPoints = inputGrid.size();
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
    CudaArray points(cu, numPoints, 4*elementSize, "points");
    CudaArray potential(cu, numPoints, elementSize, "potential");
    
    // Copy the grid points to the GPU.
    
    if (cu.getUseDoublePrecision()) {
        vector<double4> p(numPoints);
        for (int i = 0; i < numPoints; i++)
            p[i] = make_double4(inputGrid[i][0], inputGrid[i][1], inputGrid[i][2], 0);
        points.upload(p);
    }
    else {
        vector<float4> p(numPoints);
        for (int i = 0; i < numPoints; i++)
            p[i] = make_float4((float) inputGrid[i][0], (float) inputGrid[i][1], (float) inputGrid[i][2], 0);
        points.upload(p);
    }
    
    // Compute the potential.
    
1899
1900
    void* computePotentialArgs[] = {&cu.getPosq().getDevicePointer(), &labFrameDipoles.getDevicePointer(),
        &labFrameQuadrupoles.getDevicePointer(), &inducedDipole.getDevicePointer(), &points.getDevicePointer(),
1901
1902
        &potential.getDevicePointer(), &numPoints, cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(),
        cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer()};
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
    int blockSize = 128;
    cu.executeKernel(computePotentialKernel, computePotentialArgs, numPoints, blockSize, blockSize*15*elementSize);
    outputElectrostaticPotential.resize(numPoints);
    if (cu.getUseDoublePrecision())
        potential.download(outputElectrostaticPotential);
    else {
        vector<float> p(numPoints);
        potential.download(p);
        for (int i = 0; i < numPoints; i++)
            outputElectrostaticPotential[i] = p[i];
    }
}

1916
template <class T, class T4, class M4>
Lee-Ping Wang's avatar
Lee-Ping Wang committed
1917
void CudaCalcAmoebaMultipoleForceKernel::computeSystemMultipoleMoments(ContextImpl& context, vector<double>& outputMultipoleMoments) {
1918
1919
    // Compute the local coordinates relative to the center of mass.
    int numAtoms = cu.getNumAtoms();
1920
1921
    vector<T4> posq;
    vector<M4> velm;
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
    cu.getPosq().download(posq);
    cu.getVelm().download(velm);
    double totalMass = 0.0;
    Vec3 centerOfMass(0, 0, 0);
    for (int i = 0; i < numAtoms; i++) {
        double mass = (velm[i].w > 0 ? 1.0/velm[i].w : 0.0);
        totalMass += mass;
        centerOfMass[0] += mass*posq[i].x;
        centerOfMass[1] += mass*posq[i].y;
        centerOfMass[2] += mass*posq[i].z;
    }
    if (totalMass > 0.0) {
        centerOfMass[0] /= totalMass;
        centerOfMass[1] /= totalMass;
        centerOfMass[2] /= totalMass;
    }
    vector<double4> posqLocal(numAtoms);
    for (int i = 0; i < numAtoms; i++) {
        posqLocal[i].x = posq[i].x - centerOfMass[0];
        posqLocal[i].y = posq[i].y - centerOfMass[1];
        posqLocal[i].z = posq[i].z - centerOfMass[2];
        posqLocal[i].w = posq[i].w;
    }

    // Compute the multipole moments.
    
    double totalCharge = 0.0;
    double xdpl = 0.0;
    double ydpl = 0.0;
    double zdpl = 0.0;
    double xxqdp = 0.0;
    double xyqdp = 0.0;
    double xzqdp = 0.0;
    double yxqdp = 0.0;
    double yyqdp = 0.0;
    double yzqdp = 0.0;
    double zxqdp = 0.0;
    double zyqdp = 0.0;
    double zzqdp = 0.0;
    vector<T> labDipoleVec, inducedDipoleVec, quadrupoleVec;
1962
1963
1964
    labFrameDipoles.download(labDipoleVec);
    inducedDipole.download(inducedDipoleVec);
    labFrameQuadrupoles.download(quadrupoleVec);
1965
1966
    for (int i = 0; i < numAtoms; i++) {
        totalCharge += posqLocal[i].w;
Lee-Ping Wang's avatar
Lee-Ping Wang committed
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
        double netDipoleX = (labDipoleVec[3*i]    + inducedDipoleVec[3*i]);
        double netDipoleY = (labDipoleVec[3*i+1]  + inducedDipoleVec[3*i+1]);
        double netDipoleZ = (labDipoleVec[3*i+2]  + inducedDipoleVec[3*i+2]);
        xdpl += posqLocal[i].x*posqLocal[i].w + netDipoleX;
        ydpl += posqLocal[i].y*posqLocal[i].w + netDipoleY;
        zdpl += posqLocal[i].z*posqLocal[i].w + netDipoleZ;
        xxqdp += posqLocal[i].x*posqLocal[i].x*posqLocal[i].w + 2*posqLocal[i].x*netDipoleX;
        xyqdp += posqLocal[i].x*posqLocal[i].y*posqLocal[i].w + posqLocal[i].x*netDipoleY + posqLocal[i].y*netDipoleX;
        xzqdp += posqLocal[i].x*posqLocal[i].z*posqLocal[i].w + posqLocal[i].x*netDipoleZ + posqLocal[i].z*netDipoleX;
        yxqdp += posqLocal[i].y*posqLocal[i].x*posqLocal[i].w + posqLocal[i].y*netDipoleX + posqLocal[i].x*netDipoleY;
        yyqdp += posqLocal[i].y*posqLocal[i].y*posqLocal[i].w + 2*posqLocal[i].y*netDipoleY;
        yzqdp += posqLocal[i].y*posqLocal[i].z*posqLocal[i].w + posqLocal[i].y*netDipoleZ + posqLocal[i].z*netDipoleY;
        zxqdp += posqLocal[i].z*posqLocal[i].x*posqLocal[i].w + posqLocal[i].z*netDipoleX + posqLocal[i].x*netDipoleZ;
        zyqdp += posqLocal[i].z*posqLocal[i].y*posqLocal[i].w + posqLocal[i].z*netDipoleY + posqLocal[i].y*netDipoleZ;
        zzqdp += posqLocal[i].z*posqLocal[i].z*posqLocal[i].w + 2*posqLocal[i].z*netDipoleZ;
1982
1983
1984
1985
    }

    // Convert the quadrupole from traced to traceless form.
 
Lee-Ping Wang's avatar
Lee-Ping Wang committed
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
    double qave = (xxqdp + yyqdp + zzqdp)/3;
    xxqdp = 1.5*(xxqdp-qave);
    xyqdp = 1.5*xyqdp;
    xzqdp = 1.5*xzqdp;
    yxqdp = 1.5*yxqdp;
    yyqdp = 1.5*(yyqdp-qave);
    yzqdp = 1.5*yzqdp;
    zxqdp = 1.5*zxqdp;
    zyqdp = 1.5*zyqdp;
    zzqdp = 1.5*(zzqdp-qave);
1996
1997
1998

    // Add the traceless atomic quadrupoles to the total quadrupole moment.

Lee-Ping Wang's avatar
Lee-Ping Wang committed
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
    for (int i = 0; i < numAtoms; i++) {
        xxqdp = xxqdp + 3*quadrupoleVec[5*i];
        xyqdp = xyqdp + 3*quadrupoleVec[5*i+1];
        xzqdp = xzqdp + 3*quadrupoleVec[5*i+2];
        yxqdp = yxqdp + 3*quadrupoleVec[5*i+1];
        yyqdp = yyqdp + 3*quadrupoleVec[5*i+3];
        yzqdp = yzqdp + 3*quadrupoleVec[5*i+4];
        zxqdp = zxqdp + 3*quadrupoleVec[5*i+2];
        zyqdp = zyqdp + 3*quadrupoleVec[5*i+4];
        zzqdp = zzqdp + -3*(quadrupoleVec[5*i]+quadrupoleVec[5*i+3]);
    }
 
    double debye = 4.80321;
2012
2013
    outputMultipoleMoments.resize(13);
    outputMultipoleMoments[0] = totalCharge;
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
    outputMultipoleMoments[1] = 10.0*xdpl*debye;
    outputMultipoleMoments[2] = 10.0*ydpl*debye;
    outputMultipoleMoments[3] = 10.0*zdpl*debye;
    outputMultipoleMoments[4] = 100.0*xxqdp*debye;
    outputMultipoleMoments[5] = 100.0*xyqdp*debye;
    outputMultipoleMoments[6] = 100.0*xzqdp*debye;
    outputMultipoleMoments[7] = 100.0*yxqdp*debye;
    outputMultipoleMoments[8] = 100.0*yyqdp*debye;
    outputMultipoleMoments[9] = 100.0*yzqdp*debye;
    outputMultipoleMoments[10] = 100.0*zxqdp*debye;
    outputMultipoleMoments[11] = 100.0*zyqdp*debye;
    outputMultipoleMoments[12] = 100.0*zzqdp*debye;
2026
2027
}

2028

Lee-Ping Wang's avatar
Lee-Ping Wang committed
2029
void CudaCalcAmoebaMultipoleForceKernel::getSystemMultipoleMoments(ContextImpl& context, vector<double>& outputMultipoleMoments) {
2030
    ensureMultipolesValid(context);
2031
    if (cu.getUseDoublePrecision())
Lee-Ping Wang's avatar
Lee-Ping Wang committed
2032
        computeSystemMultipoleMoments<double, double4, double4>(context, outputMultipoleMoments);
2033
    else if (cu.getUseMixedPrecision())
Lee-Ping Wang's avatar
Lee-Ping Wang committed
2034
        computeSystemMultipoleMoments<float, float4, double4>(context, outputMultipoleMoments);
2035
    else
Lee-Ping Wang's avatar
Lee-Ping Wang committed
2036
        computeSystemMultipoleMoments<float, float4, float4>(context, outputMultipoleMoments);
2037
2038
}

2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
void CudaCalcAmoebaMultipoleForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaMultipoleForce& force) {
    // Make sure the new parameters are acceptable.
    
    cu.setAsCurrent();
    if (force.getNumMultipoles() != cu.getNumAtoms())
        throw OpenMMException("updateParametersInContext: The number of multipoles has changed");
    
    // Record the per-multipole parameters.
    
    cu.getPosq().download(cu.getPinnedBuffer());
    float4* posqf = (float4*) cu.getPinnedBuffer();
    double4* posqd = (double4*) cu.getPinnedBuffer();
    vector<float2> dampingAndTholeVec;
    vector<float> polarizabilityVec;
    vector<float> molecularDipolesVec;
    vector<float> molecularQuadrupolesVec;
    vector<int4> multipoleParticlesVec;
    for (int i = 0; i < force.getNumMultipoles(); i++) {
        double charge, thole, damping, polarity;
        int axisType, atomX, atomY, atomZ;
        vector<double> dipole, quadrupole;
        force.getMultipoleParameters(i, charge, dipole, quadrupole, axisType, atomZ, atomX, atomY, thole, damping, polarity);
        if (cu.getUseDoublePrecision())
            posqd[i].w = charge;
        else
            posqf[i].w = (float) charge;
        dampingAndTholeVec.push_back(make_float2((float) damping, (float) thole));
        polarizabilityVec.push_back((float) polarity);
        multipoleParticlesVec.push_back(make_int4(atomX, atomY, atomZ, axisType));
        for (int j = 0; j < 3; j++)
            molecularDipolesVec.push_back((float) dipole[j]);
        molecularQuadrupolesVec.push_back((float) quadrupole[0]);
        molecularQuadrupolesVec.push_back((float) quadrupole[1]);
        molecularQuadrupolesVec.push_back((float) quadrupole[2]);
        molecularQuadrupolesVec.push_back((float) quadrupole[4]);
        molecularQuadrupolesVec.push_back((float) quadrupole[5]);
    }
2076
    if (!hasQuadrupoles) {
peastman's avatar
peastman committed
2077
2078
        for (auto q : molecularQuadrupolesVec)
            if (q != 0.0)
2079
2080
                throw OpenMMException("updateParametersInContext: Cannot set a non-zero quadrupole moment, because quadrupoles were excluded from the kernel");
    }
2081
2082
2083
2084
2085
2086
2087
2088
2089
    for (int i = force.getNumMultipoles(); i < cu.getPaddedNumAtoms(); i++) {
        dampingAndTholeVec.push_back(make_float2(0, 0));
        polarizabilityVec.push_back(0);
        multipoleParticlesVec.push_back(make_int4(0, 0, 0, 0));
        for (int j = 0; j < 3; j++)
            molecularDipolesVec.push_back(0);
        for (int j = 0; j < 5; j++)
            molecularQuadrupolesVec.push_back(0);
    }
2090
2091
2092
2093
2094
    dampingAndThole.upload(dampingAndTholeVec);
    polarizability.upload(polarizabilityVec);
    multipoleParticles.upload(multipoleParticlesVec);
    molecularDipoles.upload(molecularDipolesVec);
    molecularQuadrupoles.upload(molecularQuadrupolesVec);
2095
2096
    cu.getPosq().upload(cu.getPinnedBuffer());
    cu.invalidateMolecules();
2097
    multipolesAreValid = false;
2098
2099
}

2100
2101
2102
2103
2104
2105
2106
2107
2108
void CudaCalcAmoebaMultipoleForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    if (!usePME)
        throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
    alpha = this->alpha;
    nx = gridSizeX;
    ny = gridSizeY;
    nz = gridSizeZ;
}

2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
/* -------------------------------------------------------------------------- *
 *                       AmoebaGeneralizedKirkwood                            *
 * -------------------------------------------------------------------------- */

class CudaCalcAmoebaGeneralizedKirkwoodForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const AmoebaGeneralizedKirkwoodForce& force) : force(force) {
    }
    bool areParticlesIdentical(int particle1, int particle2) {
        double charge1, charge2, radius1, radius2, scale1, scale2;
        force.getParticleParameters(particle1, charge1, radius1, scale1);
        force.getParticleParameters(particle2, charge2, radius2, scale2);
        return (charge1 == charge2 && radius1 == radius2 && scale1 == scale2);
    }
private:
    const AmoebaGeneralizedKirkwoodForce& force;
};

2127
CudaCalcAmoebaGeneralizedKirkwoodForceKernel::CudaCalcAmoebaGeneralizedKirkwoodForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
2128
           CalcAmoebaGeneralizedKirkwoodForceKernel(name, platform), cu(cu), system(system), hasInitializedKernels(false) {
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
}

void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& system, const AmoebaGeneralizedKirkwoodForce& force) {
    cu.setAsCurrent();
    if (cu.getPlatformData().contexts.size() > 1)
        throw OpenMMException("AmoebaGeneralizedKirkwoodForce does not support using multiple CUDA devices");
    const AmoebaMultipoleForce* multipoles = NULL;
    for (int i = 0; i < system.getNumForces() && multipoles == NULL; i++)
        multipoles = dynamic_cast<const AmoebaMultipoleForce*>(&system.getForce(i));
    if (multipoles == NULL)
        throw OpenMMException("AmoebaGeneralizedKirkwoodForce requires the System to also contain an AmoebaMultipoleForce");
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    int paddedNumAtoms = cu.getPaddedNumAtoms();
    int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
2143
2144
2145
2146
2147
2148
2149
    params.initialize<float2>(cu, paddedNumAtoms, "amoebaGkParams");
    bornRadii .initialize(cu, paddedNumAtoms, elementSize, "bornRadii");
    field .initialize(cu, 3*paddedNumAtoms, sizeof(long long), "gkField");
    bornSum.initialize<long long>(cu, paddedNumAtoms, "bornSum");
    bornForce.initialize<long long>(cu, paddedNumAtoms, "bornForce");
    inducedDipoleS .initialize(cu, 3*paddedNumAtoms, elementSize, "inducedDipoleS");
    inducedDipolePolarS .initialize(cu, 3*paddedNumAtoms, elementSize, "inducedDipolePolarS");
2150
2151
    polarizationType = multipoles->getPolarizationType();
    if (polarizationType != AmoebaMultipoleForce::Direct) {
2152
2153
        inducedField .initialize(cu, 3*paddedNumAtoms, sizeof(long long), "gkInducedField");
        inducedFieldPolar .initialize(cu, 3*paddedNumAtoms, sizeof(long long), "gkInducedFieldPolar");
2154
    }
2155
2156
2157
    cu.addAutoclearBuffer(field);
    cu.addAutoclearBuffer(bornSum);
    cu.addAutoclearBuffer(bornForce);
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
    vector<float2> paramsVector(paddedNumAtoms);
    for (int i = 0; i < force.getNumParticles(); i++) {
        double charge, radius, scalingFactor;
        force.getParticleParameters(i, charge, radius, scalingFactor);
        paramsVector[i] = make_float2((float) radius, (float) (scalingFactor*radius));
        
        // Make sure the charge matches the one specified by the AmoebaMultipoleForce.
        
        double charge2, thole, damping, polarity;
        int axisType, atomX, atomY, atomZ;
        vector<double> dipole, quadrupole;
        multipoles->getMultipoleParameters(i, charge2, dipole, quadrupole, axisType, atomZ, atomX, atomY, thole, damping, polarity);
        if (charge != charge2)
            throw OpenMMException("AmoebaGeneralizedKirkwoodForce and AmoebaMultipoleForce must specify the same charge for every atom");
    }
2173
    params.upload(paramsVector);
2174
    
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
    // Select the number of threads for each kernel.
    
    double computeBornSumThreadMemory = 4*elementSize+3*sizeof(float);
    double gkForceThreadMemory = 24*elementSize;
    double chainRuleThreadMemory = 10*elementSize;
    double ediffThreadMemory = 28*elementSize+2*sizeof(float)+3*sizeof(int)/(double) cu.TileSize;
    int maxThreads = cu.getNonbondedUtilities().getForceThreadBlockSize();
    computeBornSumThreads = min(maxThreads, cu.computeThreadBlockSize(computeBornSumThreadMemory));
    gkForceThreads = min(maxThreads, cu.computeThreadBlockSize(gkForceThreadMemory));
    chainRuleThreads = min(maxThreads, cu.computeThreadBlockSize(chainRuleThreadMemory));
    ediffThreads = min(maxThreads, cu.computeThreadBlockSize(ediffThreadMemory));
    
2187
    // Set preprocessor macros we will use when we create the kernels.
2188
2189
2190
    
    defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
    defines["PADDED_NUM_ATOMS"] = cu.intToString(paddedNumAtoms);
2191
2192
2193
2194
    defines["BORN_SUM_THREAD_BLOCK_SIZE"] = cu.intToString(computeBornSumThreads);
    defines["GK_FORCE_THREAD_BLOCK_SIZE"] = cu.intToString(gkForceThreads);
    defines["CHAIN_RULE_THREAD_BLOCK_SIZE"] = cu.intToString(chainRuleThreads);
    defines["EDIFF_THREAD_BLOCK_SIZE"] = cu.intToString(ediffThreads);
2195
2196
2197
2198
2199
2200
    defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
    defines["GK_C"] = cu.doubleToString(2.455);
    double solventDielectric = force.getSolventDielectric();
    defines["GK_FC"] = cu.doubleToString(1*(1-solventDielectric)/(0+1*solventDielectric));
    defines["GK_FD"] = cu.doubleToString(2*(1-solventDielectric)/(1+2*solventDielectric));
    defines["GK_FQ"] = cu.doubleToString(3*(1-solventDielectric)/(2+3*solventDielectric));
2201
    defines["EPSILON_FACTOR"] = cu.doubleToString(138.9354558456);
Peter Eastman's avatar
Peter Eastman committed
2202
    defines["M_PI"] = cu.doubleToString(M_PI);
2203
    defines["ENERGY_SCALE_FACTOR"] = cu.doubleToString(138.9354558456/force.getSoluteDielectric());
2204
    if (polarizationType == AmoebaMultipoleForce::Direct)
2205
        defines["DIRECT_POLARIZATION"] = "";
2206
2207
2208
2209
    else if (polarizationType == AmoebaMultipoleForce::Mutual)
        defines["MUTUAL_POLARIZATION"] = "";
    else if (polarizationType == AmoebaMultipoleForce::Extrapolated)
        defines["EXTRAPOLATED_POLARIZATION"] = "";
2210
2211
2212
2213
2214
2215
    includeSurfaceArea = force.getIncludeCavityTerm();
    if (includeSurfaceArea) {
        defines["SURFACE_AREA_FACTOR"] = cu.doubleToString(force.getSurfaceAreaFactor());
        defines["PROBE_RADIUS"] = cu.doubleToString(force.getProbeRadius());
        defines["DIELECTRIC_OFFSET"] = cu.doubleToString(0.009);
    }
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
    cu.addForce(new ForceInfo(force));
}

double CudaCalcAmoebaGeneralizedKirkwoodForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    // Since GK is so tightly entwined with the electrostatics, this method does nothing, and the force calculation
    // is driven by AmoebaMultipoleForce.
    return 0.0;
}

void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::computeBornRadii() {
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
    if (!hasInitializedKernels) {
        hasInitializedKernels = true;
        
        // Create the kernels.
        
        int numExclusionTiles = cu.getNonbondedUtilities().getExclusionTiles().getSize();
        defines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
        int numContexts = cu.getPlatformData().contexts.size();
        int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
        int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
        defines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
        defines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
        stringstream forceSource;
        forceSource << CudaKernelSources::vectorOps;
        forceSource << CudaAmoebaKernelSources::amoebaGk;
        forceSource << "#define F1\n";
        forceSource << CudaAmoebaKernelSources::gkPairForce1;
        forceSource << CudaAmoebaKernelSources::gkPairForce2;
        forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
        forceSource << "#undef F1\n";
        forceSource << "#define F2\n";
        forceSource << CudaAmoebaKernelSources::gkPairForce1;
        forceSource << CudaAmoebaKernelSources::gkPairForce2;
        forceSource << "#undef F2\n";
        forceSource << "#define T1\n";
        forceSource << CudaAmoebaKernelSources::gkPairForce1;
        forceSource << CudaAmoebaKernelSources::gkPairForce2;
        forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
        forceSource << "#undef T1\n";
        forceSource << "#define T2\n";
        forceSource << CudaAmoebaKernelSources::gkPairForce1;
        forceSource << CudaAmoebaKernelSources::gkPairForce2;
        forceSource << "#undef T2\n";
        forceSource << "#define T3\n";
        forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
        forceSource << "#undef T3\n";
        forceSource << "#define B1\n";
        forceSource << "#define B2\n";
        forceSource << CudaAmoebaKernelSources::gkPairForce1;
        forceSource << CudaAmoebaKernelSources::gkPairForce2;
        CUmodule module = cu.createModule(forceSource.str(), defines);
        computeBornSumKernel = cu.getKernel(module, "computeBornSum");
        reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
        gkForceKernel = cu.getKernel(module, "computeGKForces");
        chainRuleKernel = cu.getKernel(module, "computeChainRuleForce");
        ediffKernel = cu.getKernel(module, "computeEDiffForce");
        if (includeSurfaceArea)
            surfaceAreaKernel = cu.getKernel(module, "computeSurfaceAreaForce");
    }
2275
2276
2277
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    int numTiles = nb.getNumTiles();
    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
2278
2279
    void* computeBornSumArgs[] = {&bornSum.getDevicePointer(), &cu.getPosq().getDevicePointer(),
        &params.getDevicePointer(), &numTiles};
2280
    cu.executeKernel(computeBornSumKernel, computeBornSumArgs, numForceThreadBlocks*computeBornSumThreads, computeBornSumThreads);
2281
    void* reduceBornSumArgs[] = {&bornSum.getDevicePointer(), &params.getDevicePointer(), &bornRadii.getDevicePointer()};
2282
2283
2284
2285
2286
2287
2288
2289
2290
    cu.executeKernel(reduceBornSumKernel, reduceBornSumArgs, cu.getNumAtoms());
}

void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::finishComputation(CudaArray& torque, CudaArray& labFrameDipoles, CudaArray& labFrameQuadrupoles,
            CudaArray& inducedDipole, CudaArray& inducedDipolePolar, CudaArray& dampingAndThole, CudaArray& covalentFlags, CudaArray& polarizationGroupFlags) {
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    int startTileIndex = nb.getStartTileIndex();
    int numTileIndices = nb.getNumTiles();
    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
2291
2292
2293
    
    // Compute the GK force.
    
2294
2295
    void* gkForceArgs[] = {&cu.getForce().getDevicePointer(), &torque.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
        &cu.getPosq().getDevicePointer(), &startTileIndex, &numTileIndices, &labFrameDipoles.getDevicePointer(),
2296
2297
        &labFrameQuadrupoles.getDevicePointer(), &inducedDipoleS.getDevicePointer(), &inducedDipolePolarS.getDevicePointer(),
        &bornRadii.getDevicePointer(), &bornForce.getDevicePointer()};
2298
    cu.executeKernel(gkForceKernel, gkForceArgs, numForceThreadBlocks*gkForceThreads, gkForceThreads);
2299

2300
2301
2302
    // Compute the surface area force.
    
    if (includeSurfaceArea) {
2303
        void* surfaceAreaArgs[] = {&bornForce.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &params.getDevicePointer(), &bornRadii.getDevicePointer()};
2304
2305
2306
2307
        cu.executeKernel(surfaceAreaKernel, surfaceAreaArgs, cu.getNumAtoms());
    }
    
    // Apply the remaining terms.
2308
2309
    
    void* chainRuleArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &startTileIndex, &numTileIndices,
2310
        &params.getDevicePointer(), &bornRadii.getDevicePointer(), &bornForce.getDevicePointer()};
2311
    cu.executeKernel(chainRuleKernel, chainRuleArgs, numForceThreadBlocks*chainRuleThreads, chainRuleThreads);    
2312
    void* ediffArgs[] = {&cu.getForce().getDevicePointer(), &torque.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
2313
2314
        &cu.getPosq().getDevicePointer(), &covalentFlags.getDevicePointer(), &polarizationGroupFlags.getDevicePointer(),
        &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
2315
        &labFrameDipoles.getDevicePointer(), &labFrameQuadrupoles.getDevicePointer(), &inducedDipole.getDevicePointer(),
2316
        &inducedDipolePolar.getDevicePointer(), &inducedDipoleS.getDevicePointer(), &inducedDipolePolarS.getDevicePointer(),
2317
        &dampingAndThole.getDevicePointer()};
2318
    cu.executeKernel(ediffKernel, ediffArgs, numForceThreadBlocks*ediffThreads, ediffThreads);
2319
}
2320

2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaGeneralizedKirkwoodForce& force) {
    // Make sure the new parameters are acceptable.
    
    cu.setAsCurrent();
    if (force.getNumParticles() != cu.getNumAtoms())
        throw OpenMMException("updateParametersInContext: The number of particles has changed");
    
    // Record the per-particle parameters.
    
    vector<float2> paramsVector(cu.getPaddedNumAtoms());
    for (int i = 0; i < force.getNumParticles(); i++) {
        double charge, radius, scalingFactor;
        force.getParticleParameters(i, charge, radius, scalingFactor);
        paramsVector[i] = make_float2((float) radius, (float) (scalingFactor*radius));
    }
2336
    params.upload(paramsVector);
2337
2338
2339
    cu.invalidateMolecules();
}

2340
2341
2342
2343
2344
2345
2346
2347
2348
/* -------------------------------------------------------------------------- *
 *                           AmoebaVdw                                        *
 * -------------------------------------------------------------------------- */

class CudaCalcAmoebaVdwForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const AmoebaVdwForce& force) : force(force) {
    }
    bool areParticlesIdentical(int particle1, int particle2) {
2349
        int iv1, iv2;
2350
        double sigma1, sigma2, epsilon1, epsilon2, reduction1, reduction2;
2351
2352
2353
        force.getParticleParameters(particle1, iv1, sigma1, epsilon1, reduction1);
        force.getParticleParameters(particle2, iv2, sigma2, epsilon2, reduction2);
        return (sigma1 == sigma2 && epsilon1 == epsilon2 && reduction1 == reduction2);
2354
2355
2356
2357
2358
    }
private:
    const AmoebaVdwForce& force;
};

2359
CudaCalcAmoebaVdwForceKernel::CudaCalcAmoebaVdwForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) :
2360
        CalcAmoebaVdwForceKernel(name, platform), cu(cu), system(system), hasInitializedNonbonded(false), nonbonded(NULL) {
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
}

CudaCalcAmoebaVdwForceKernel::~CudaCalcAmoebaVdwForceKernel() {
    cu.setAsCurrent();
    if (nonbonded != NULL)
        delete nonbonded;
}

void CudaCalcAmoebaVdwForceKernel::initialize(const System& system, const AmoebaVdwForce& force) {
    cu.setAsCurrent();
2371
2372
2373
2374
2375
    sigmaEpsilon.initialize<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
    bondReductionAtoms.initialize<int>(cu, cu.getPaddedNumAtoms(), "bondReductionAtoms");
    bondReductionFactors.initialize<float>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
    tempPosq.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4), "tempPosq");
    tempForces.initialize<long long>(cu, 3*cu.getPaddedNumAtoms(), "tempForces");
2376
2377
2378
2379
2380
2381
2382
2383
    
    // Record atom parameters.
    
    vector<float2> sigmaEpsilonVec(cu.getPaddedNumAtoms(), make_float2(0, 1));
    vector<int> bondReductionAtomsVec(cu.getPaddedNumAtoms(), 0);
    vector<float> bondReductionFactorsVec(cu.getPaddedNumAtoms(), 0);
    vector<vector<int> > exclusions(cu.getNumAtoms());
    for (int i = 0; i < force.getNumParticles(); i++) {
2384
        int ivIndex;
2385
        double sigma, epsilon, reductionFactor;
2386
        force.getParticleParameters(i, ivIndex, sigma, epsilon, reductionFactor);
2387
2388
2389
2390
2391
2392
        sigmaEpsilonVec[i] = make_float2((float) sigma, (float) epsilon);
        bondReductionAtomsVec[i] = ivIndex;
        bondReductionFactorsVec[i] = (float) reductionFactor;
        force.getParticleExclusions(i, exclusions[i]);
        exclusions[i].push_back(i);
    }
2393
2394
2395
    sigmaEpsilon.upload(sigmaEpsilonVec);
    bondReductionAtoms.upload(bondReductionAtomsVec);
    bondReductionFactors.upload(bondReductionFactorsVec);
2396
2397
2398
2399
    if (force.getUseDispersionCorrection())
        dispersionCoefficient = AmoebaVdwForceImpl::calcDispersionCorrection(system, force);
    else
        dispersionCoefficient = 0.0;               
2400
2401
2402
2403
2404
2405
 
    // This force is applied based on modified atom positions, where hydrogens have been moved slightly
    // closer to their parent atoms.  We therefore create a separate CudaNonbondedUtilities just for
    // this force, so it will have its own neighbor list and interaction kernel.
    
    nonbonded = new CudaNonbondedUtilities(cu);
2406
    nonbonded->addParameter(CudaNonbondedUtilities::ParameterInfo("sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon.getDevicePointer()));
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
    
    // Create the interaction kernel.
    
    map<string, string> replacements;
    string sigmaCombiningRule = force.getSigmaCombiningRule();
    if (sigmaCombiningRule == "ARITHMETIC")
        replacements["SIGMA_COMBINING_RULE"] = "1";
    else if (sigmaCombiningRule == "GEOMETRIC")
        replacements["SIGMA_COMBINING_RULE"] = "2";
    else if (sigmaCombiningRule == "CUBIC-MEAN")
        replacements["SIGMA_COMBINING_RULE"] = "3";
    else
        throw OpenMMException("Illegal combining rule for sigma: "+sigmaCombiningRule);
    string epsilonCombiningRule = force.getEpsilonCombiningRule();
    if (epsilonCombiningRule == "ARITHMETIC")
2422
        replacements["EPSILON_COMBINING_RULE"] = "1";
2423
    else if (epsilonCombiningRule == "GEOMETRIC")
2424
        replacements["EPSILON_COMBINING_RULE"] = "2";
2425
    else if (epsilonCombiningRule == "HARMONIC")
2426
        replacements["EPSILON_COMBINING_RULE"] = "3";
2427
    else if (epsilonCombiningRule == "HHG")
2428
        replacements["EPSILON_COMBINING_RULE"] = "4";
2429
2430
    else
        throw OpenMMException("Illegal combining rule for sigma: "+sigmaCombiningRule);
peastman's avatar
peastman committed
2431
    double cutoff = force.getCutoffDistance();
2432
    double taperCutoff = cutoff*0.9;
peastman's avatar
peastman committed
2433
    replacements["CUTOFF_DISTANCE"] = cu.doubleToString(force.getCutoffDistance());
2434
    replacements["TAPER_CUTOFF"] = cu.doubleToString(taperCutoff);
2435
2436
2437
    replacements["TAPER_C3"] = cu.doubleToString(10/pow(taperCutoff-cutoff, 3.0));
    replacements["TAPER_C4"] = cu.doubleToString(15/pow(taperCutoff-cutoff, 4.0));
    replacements["TAPER_C5"] = cu.doubleToString(6/pow(taperCutoff-cutoff, 5.0));
2438
    bool useCutoff = (force.getNonbondedMethod() != AmoebaVdwForce::NoCutoff);
peastman's avatar
peastman committed
2439
    nonbonded->addInteraction(useCutoff, useCutoff, true, force.getCutoffDistance(), exclusions,
2440
        cu.replaceStrings(CudaAmoebaKernelSources::amoebaVdwForce2, replacements), 0);
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
    
    // Create the other kernels.
    
    map<string, string> defines;
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    CUmodule module = cu.createModule(CudaAmoebaKernelSources::amoebaVdwForce1, defines);
    prepareKernel = cu.getKernel(module, "prepareToComputeForce");
    spreadKernel = cu.getKernel(module, "spreadForces");
    cu.addForce(new ForceInfo(force));
}

double CudaCalcAmoebaVdwForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    if (!hasInitializedNonbonded) {
        hasInitializedNonbonded = true;
        nonbonded->initialize(system);
    }
2457
2458
2459
2460
    cu.getPosq().copyTo(tempPosq);
    cu.getForce().copyTo(tempForces);
    void* prepareArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &tempPosq.getDevicePointer(),
        &bondReductionAtoms.getDevicePointer(), &bondReductionFactors.getDevicePointer()};
2461
    cu.executeKernel(prepareKernel, prepareArgs, cu.getPaddedNumAtoms());
2462
    nonbonded->prepareInteractions(1);
2463
    nonbonded->computeInteractions(1, includeForces, includeEnergy);
2464
    void* spreadArgs[] = {&cu.getForce().getDevicePointer(), &tempForces.getDevicePointer(), &bondReductionAtoms.getDevicePointer(), &bondReductionFactors.getDevicePointer()};
2465
    cu.executeKernel(spreadKernel, spreadArgs, cu.getPaddedNumAtoms());
2466
2467
    tempPosq.copyTo(cu.getPosq());
    tempForces.copyTo(cu.getForce());
2468
2469
    double4 box = cu.getPeriodicBoxSize();
    return dispersionCoefficient/(box.x*box.y*box.z);
2470
2471
}

2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
void CudaCalcAmoebaVdwForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaVdwForce& force) {
    // Make sure the new parameters are acceptable.
    
    cu.setAsCurrent();
    if (force.getNumParticles() != cu.getNumAtoms())
        throw OpenMMException("updateParametersInContext: The number of particles has changed");
    
    // Record the per-particle parameters.
    
    vector<float2> sigmaEpsilonVec(cu.getPaddedNumAtoms(), make_float2(0, 1));
    vector<int> bondReductionAtomsVec(cu.getPaddedNumAtoms(), 0);
    vector<float> bondReductionFactorsVec(cu.getPaddedNumAtoms(), 0);
    for (int i = 0; i < force.getNumParticles(); i++) {
        int ivIndex;
        double sigma, epsilon, reductionFactor;
        force.getParticleParameters(i, ivIndex, sigma, epsilon, reductionFactor);
        sigmaEpsilonVec[i] = make_float2((float) sigma, (float) epsilon);
        bondReductionAtomsVec[i] = ivIndex;
        bondReductionFactorsVec[i] = (float) reductionFactor;
    }
2492
2493
2494
    sigmaEpsilon.upload(sigmaEpsilonVec);
    bondReductionAtoms.upload(bondReductionAtomsVec);
    bondReductionFactors.upload(bondReductionFactorsVec);
2495
2496
2497
2498
2499
2500
2501
    if (force.getUseDispersionCorrection())
        dispersionCoefficient = AmoebaVdwForceImpl::calcDispersionCorrection(system, force);
    else
        dispersionCoefficient = 0.0;               
    cu.invalidateMolecules();
}

2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
/* -------------------------------------------------------------------------- *
 *                           AmoebaWcaDispersion                              *
 * -------------------------------------------------------------------------- */

class CudaCalcAmoebaWcaDispersionForceKernel::ForceInfo : public CudaForceInfo {
public:
    ForceInfo(const AmoebaWcaDispersionForce& force) : force(force) {
    }
    bool areParticlesIdentical(int particle1, int particle2) {
        double radius1, radius2, epsilon1, epsilon2;
        force.getParticleParameters(particle1, radius1, epsilon1);
        force.getParticleParameters(particle2, radius2, epsilon2);
        return (radius1 == radius2 && epsilon1 == epsilon2);
    }
private:
    const AmoebaWcaDispersionForce& force;
};

2520
CudaCalcAmoebaWcaDispersionForceKernel::CudaCalcAmoebaWcaDispersionForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : 
2521
           CalcAmoebaWcaDispersionForceKernel(name, platform), cu(cu), system(system) {
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
}

void CudaCalcAmoebaWcaDispersionForceKernel::initialize(const System& system, const AmoebaWcaDispersionForce& force) {
    int numParticles = system.getNumParticles();
    int paddedNumAtoms = cu.getPaddedNumAtoms();
    
    // Record parameters.
    
    vector<float2> radiusEpsilonVec(paddedNumAtoms, make_float2(0, 0));
    for (int i = 0; i < numParticles; i++) {
        double radius, epsilon;
        force.getParticleParameters(i, radius, epsilon);
        radiusEpsilonVec[i] = make_float2((float) radius, (float) epsilon);
    }
2536
2537
    radiusEpsilon.initialize<float2>(cu, paddedNumAtoms, "radiusEpsilon");
    radiusEpsilon.upload(radiusEpsilonVec);
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
    
    // Create the kernel.
    
    map<string, string> defines;
    defines["NUM_ATOMS"] = cu.intToString(numParticles);
    defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
    defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
    defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
    defines["EPSO"] = cu.doubleToString(force.getEpso());
    defines["EPSH"] = cu.doubleToString(force.getEpsh());
    defines["RMINO"] = cu.doubleToString(force.getRmino());
    defines["RMINH"] = cu.doubleToString(force.getRminh());
    defines["AWATER"] = cu.doubleToString(force.getAwater());
2551
    defines["SHCTD"] = cu.doubleToString(force.getShctd());
Peter Eastman's avatar
Peter Eastman committed
2552
    defines["M_PI"] = cu.doubleToString(M_PI);
2553
2554
    CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::amoebaWcaForce, defines);
    forceKernel = cu.getKernel(module, "computeWCAForce");
2555
    totalMaximumDispersionEnergy = AmoebaWcaDispersionForceImpl::getTotalMaximumDispersionEnergy(force);
2556
2557
2558
2559
2560

    // Add an interaction to the default nonbonded kernel.  This doesn't actually do any calculations.  It's
    // just so that CudaNonbondedUtilities will keep track of the tiles.
    
    vector<vector<int> > exclusions;
2561
    cu.getNonbondedUtilities().addInteraction(false, false, false, 1.0, exclusions, "", force.getForceGroup());
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
    cu.addForce(new ForceInfo(force));
}

double CudaCalcAmoebaWcaDispersionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
    CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
    int startTileIndex = nb.getStartTileIndex();
    int numTileIndices = nb.getNumTiles();
    int numForceThreadBlocks = nb.getNumForceThreadBlocks();
    int forceThreadBlockSize = nb.getForceThreadBlockSize();
    void* forceArgs[] = {&cu.getForce().getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
2572
        &cu.getPosq().getDevicePointer(), &startTileIndex, &numTileIndices, &radiusEpsilon.getDevicePointer()};
2573
2574
2575
    cu.executeKernel(forceKernel, forceArgs, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
    return totalMaximumDispersionEnergy;
}
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591

void CudaCalcAmoebaWcaDispersionForceKernel::copyParametersToContext(ContextImpl& context, const AmoebaWcaDispersionForce& force) {
    // Make sure the new parameters are acceptable.
    
    cu.setAsCurrent();
    if (force.getNumParticles() != cu.getNumAtoms())
        throw OpenMMException("updateParametersInContext: The number of particles has changed");
    
    // Record the per-particle parameters.
    
    vector<float2> radiusEpsilonVec(cu.getPaddedNumAtoms(), make_float2(0, 0));
    for (int i = 0; i < cu.getNumAtoms(); i++) {
        double radius, epsilon;
        force.getParticleParameters(i, radius, epsilon);
        radiusEpsilonVec[i] = make_float2((float) radius, (float) epsilon);
    }
2592
    radiusEpsilon.upload(radiusEpsilonVec);
2593
    totalMaximumDispersionEnergy = AmoebaWcaDispersionForceImpl::getTotalMaximumDispersionEnergy(force);
2594
2595
    cu.invalidateMolecules();
}