OpenCLContext.cpp 57.9 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2009-2015 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

27
28
29
30
#ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include <cmath>
31
32
#include "OpenCLContext.h"
#include "OpenCLArray.h"
Peter Eastman's avatar
Peter Eastman committed
33
#include "OpenCLBondedUtilities.h"
34
#include "OpenCLForceInfo.h"
35
#include "OpenCLIntegrationUtilities.h"
36
#include "OpenCLKernelSources.h"
37
#include "OpenCLNonbondedUtilities.h"
38
#include "hilbert.h"
39
#include "openmm/Platform.h"
40
#include "openmm/System.h"
41
#include "openmm/VirtualSite.h"
42
#include "openmm/internal/ContextImpl.h"
Peter Eastman's avatar
Peter Eastman committed
43
#include <algorithm>
44
45
#include <fstream>
#include <iostream>
46
#include <sstream>
47
#include <typeinfo>
48
49

using namespace OpenMM;
50
using namespace std;
51

52
53
54
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#endif
55
56
57
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
  #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#endif
58

59
60
61
const int OpenCLContext::ThreadBlockSize = 64;
const int OpenCLContext::TileSize = 32;

62
static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_info, size_t cb, void* user_data) {
63
64
65
    string skip = "OpenCL Build Warning : Compiler build log:";
    if (strncmp(errinfo, skip.c_str(), skip.length()) == 0)
        return; // OS X Lion insists on calling this for every build warning, even though they aren't errors.
66
67
68
    std::cerr << "OpenCL internal error: " << errinfo << std::endl;
}

69
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
70
        system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), atomsWereReordered(false), posq(NULL),
Peter Eastman's avatar
Bug fix  
Peter Eastman committed
71
        posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
72
        expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
73
74
75
76
77
78
79
80
81
82
83
84
85
86
    if (precision == "single") {
        useDoublePrecision = false;
        useMixedPrecision = false;
    }
    else if (precision == "mixed") {
        useDoublePrecision = false;
        useMixedPrecision = true;
    }
    else if (precision == "double") {
        useDoublePrecision = true;
        useMixedPrecision = false;
    }
    else
        throw OpenMMException("Illegal value for OpenCLPrecision: "+precision);
87
    try {
88
        contextIndex = platformData.contexts.size();
89
90
        std::vector<cl::Platform> platforms;
        cl::Platform::get(&platforms);
Robert McGibbon's avatar
Robert McGibbon committed
91
        const int minThreadBlockSize = 32;
92

Robert McGibbon's avatar
Robert McGibbon committed
93
94
95
        int bestSpeed = -1;
        int bestDevice = -1;
        int bestPlatform = -1;
Robert McGibbon's avatar
Robert McGibbon committed
96
        for (int j = 0; j < platforms.size(); j++) {
Robert McGibbon's avatar
Robert McGibbon committed
97
            // if they supplied a valid platformIndex, we only look through that platform
Robert McGibbon's avatar
Robert McGibbon committed
98
            if (j != platformIndex && platformIndex >= 0 && platformIndex < (int) platforms.size())
Robert McGibbon's avatar
Robert McGibbon committed
99
100
101
102
103
104
                continue;

            string platformVendor = platforms[j].getInfo<CL_PLATFORM_VENDOR>();
            vector<cl::Device> devices;
            platforms[j].getDevices(CL_DEVICE_TYPE_ALL, &devices);

105
            for (int i = 0; i < (int) devices.size(); i++) {
Robert McGibbon's avatar
Robert McGibbon committed
106
107
108
109
                // if they supplied a valid deviceIndex, we only look through that one
                if (i != deviceIndex && deviceIndex >= 0 && deviceIndex < (int) devices.size())
                    continue;

110
111
                if (platformVendor == "Apple" && (devices[i].getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU || devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD"))
                    continue; // The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
112
                int maxSize = devices[i].getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0];
113
114
115
116
117
                int processingElementsPerComputeUnit = 8;
                if (devices[i].getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
                    processingElementsPerComputeUnit = 1;
                }
                else if (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_nv_device_attribute_query") != string::npos) {
118
119
120
121
                    cl_uint computeCapabilityMajor;
                    clGetDeviceInfo(devices[i](), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
                    processingElementsPerComputeUnit = (computeCapabilityMajor < 2 ? 8 : 32);
                }
122
123
124
125
                else if (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_amd_device_attribute_query") != string::npos) {
                    // This attribute does not ensure that all queries are supported by the runtime (it may be an older runtime,
                    // or the CPU device) so still have to check for errors.
                    try {
126
#ifdef CL_DEVICE_SIMD_WIDTH_AMD
127
128
129
130
131
132
133
134
135
136
137
138
139
                        processingElementsPerComputeUnit =
                            // AMD GPUs either have a single VLIW SIMD or multiple scalar SIMDs.
                            // The SIMD width is the number of threads the SIMD executes per cycle.
                            // This will be less than the wavefront width since it takes several
                            // cycles to execute the full wavefront.
                            // The SIMD instruction width is the VLIW instruction width (or 1 for scalar),
                            // this is the number of ALUs that can be executing per instruction per thread. 
                            devices[i].getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>() *
                            devices[i].getInfo<CL_DEVICE_SIMD_WIDTH_AMD>() *
                            devices[i].getInfo<CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD>();
                        // Just in case any of the queries return 0.
                        if (processingElementsPerComputeUnit <= 0)
                            processingElementsPerComputeUnit = 1;
140
#endif
141
142
143
144
145
                    }
                    catch (cl::Error err) {
                        // Runtime does not support the queries so use default.
                    }
                }
146
                int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
147
                if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
Robert McGibbon's avatar
Robert McGibbon committed
148
                    bestDevice = i;
149
                    bestSpeed = speed;
Robert McGibbon's avatar
Robert McGibbon committed
150
                    bestPlatform = j;
151
                }
152
            }
153
        }
Robert McGibbon's avatar
Robert McGibbon committed
154
155
156
157
158

        if (bestPlatform == -1)
            throw OpenMMException("No compatible OpenCL platform is available");

        if (bestDevice == -1)
159
            throw OpenMMException("No compatible OpenCL device is available");
Robert McGibbon's avatar
Robert McGibbon committed
160
161
162

        vector<cl::Device> devices;
        platforms[bestPlatform].getDevices(CL_DEVICE_TYPE_ALL, &devices);
Robert McGibbon's avatar
Robert McGibbon committed
163
        string platformVendor = platforms[bestPlatform].getInfo<CL_PLATFORM_VENDOR>();
Robert McGibbon's avatar
Robert McGibbon committed
164
165
166
        device = devices[bestDevice];

        this->deviceIndex = bestDevice;
Robert McGibbon's avatar
Robert McGibbon committed
167
        this->platformIndex = bestPlatform;
168
        if (device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>() < minThreadBlockSize)
169
            throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
170
        compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
Peter Eastman's avatar
Peter Eastman committed
171
        if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
172
173
174
            defaultOptimizationOptions = "";
        else
            defaultOptimizationOptions = "-cl-fast-relaxed-math";
175
        supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
176
        supportsDoublePrecision = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
177
178
        if ((useDoublePrecision || useMixedPrecision) && !supportsDoublePrecision)
            throw OpenMMException("This device does not support double precision");
179
        string vendor = device.getInfo<CL_DEVICE_VENDOR>();
180
        int numThreadBlocksPerComputeUnit = 6;
181
        if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
182
            compilationDefines["WARPS_ARE_ATOMIC"] = "";
183
            simdWidth = 32;
184
185
            if (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_nv_device_attribute_query") != string::npos) {
                // Compute level 1.2 and later Nvidia GPUs support 64 bit atomics, even though they don't list the
186
187
                // proper extension as supported.  We only use them on compute level 2.0 or later, since they're very
                // slow on earlier GPUs.
188

189
                cl_uint computeCapabilityMajor;
190
                clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
191
                if (computeCapabilityMajor > 1)
192
193
                    supports64BitGlobalAtomics = true;
            }
194
        }
195
        else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
196
197
198
199
200
201
202
203
204
205
206
207
208
            if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
                /// \todo Is 6 a good value for the OpenCL CPU device?
                // numThreadBlocksPerComputeUnit = ?;
                simdWidth = 1;
            }
            else {
                bool amdPostSdk2_4 = false;
                // Default to 1 which will use the default kernels.
                simdWidth = 1;
                if (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_amd_device_attribute_query") != string::npos) {
                    // This attribute does not ensure that all queries are supported by the runtime so still have to
                    // check for errors.
                    try {
Peter Eastman's avatar
Peter Eastman committed
209
#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
210
211
212
                        // Must catch cl:Error as will fail if runtime does not support queries.

                        cl_uint simdPerComputeUnit = device.getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>();
213
214
                        simdWidth = device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>();

215
216
217
218
219
220
221
222
                        // If the GPU has multiple SIMDs per compute unit then it is uses the scalar instruction
                        // set instead of the VLIW instruction set. It therefore needs more thread blocks per
                        // compute unit to hide memory latency.
                        if (simdPerComputeUnit > 1)
                            numThreadBlocksPerComputeUnit = 4 * simdPerComputeUnit;

                        // If the queries are supported then must be newer than SDK 2.4.
                        amdPostSdk2_4 = true;
Peter Eastman's avatar
Peter Eastman committed
223
#endif
224
225
226
227
228
229
230
231
232
233
                    }
                    catch (cl::Error err) {
                        // Runtime does not support the query so is unlikely to be the newer scalar GPU.
                        // Stay with the default simdWidth and numThreadBlocksPerComputeUnit.
                    }
                }
                // AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around. This is fixed after SDK 2.4.
                if (!amdPostSdk2_4)
                    compilationDefines["AMD_ATOMIC_WORK_AROUND"] = "";
            }
234
        }
235
236
        else
            simdWidth = 1;
Peter Eastman's avatar
Peter Eastman committed
237
        if (platformVendor == "Apple" && vendor == "AMD")
238
            compilationDefines["MAC_AMD_WORKAROUND"] = "";
239
        if (supports64BitGlobalAtomics)
240
            compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
241
242
        if (supportsDoublePrecision)
            compilationDefines["SUPPORTS_DOUBLE_PRECISION"] = "";
243
244
245
246
        if (simdWidth >= 32)
            compilationDefines["SYNC_WARPS"] = "";
        else
            compilationDefines["SYNC_WARPS"] = "barrier(CLK_LOCAL_MEM_FENCE)";
247
248
        vector<cl::Device> contextDevices;
        contextDevices.push_back(device);
Robert McGibbon's avatar
Robert McGibbon committed
249
        cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[bestPlatform](), 0};
250
        context = cl::Context(contextDevices, cprops, errorCallback);
251
252
        defaultQueue = cl::CommandQueue(context, device);
        currentQueue = defaultQueue;
Peter Eastman's avatar
Peter Eastman committed
253
254
        numAtoms = system.getNumParticles();
        paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
255
        numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
256
        numThreadBlocks = numThreadBlocksPerComputeUnit*device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
        if (useDoublePrecision) {
            posq = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "posq");
            velm = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "velm");
            compilationDefines["USE_DOUBLE_PRECISION"] = "1";
            compilationDefines["convert_real4"] = "convert_double4";
            compilationDefines["convert_mixed4"] = "convert_double4";
        }
        else if (useMixedPrecision) {
            posq = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq");
            posqCorrection = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq");
            velm = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms, "velm");
            compilationDefines["USE_MIXED_PRECISION"] = "1";
            compilationDefines["convert_real4"] = "convert_float4";
            compilationDefines["convert_mixed4"] = "convert_double4";
        }
        else {
            posq = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "posq");
            velm = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms, "velm");
            compilationDefines["convert_real4"] = "convert_float4";
            compilationDefines["convert_mixed4"] = "convert_float4";
        }
278
        posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
279
280
281
282
283
    }
    catch (cl::Error err) {
        std::stringstream str;
        str<<"Error initializing context: "<<err.what()<<" ("<<err.err()<<")";
        throw OpenMMException(str.str());
284
    }
285
286
287

    // Create utility kernels that are used in multiple places.

Peter Eastman's avatar
Peter Eastman committed
288
    cl::Program utilities = createProgram(OpenCLKernelSources::utilities);
289
    clearBufferKernel = cl::Kernel(utilities, "clearBuffer");
290
291
292
    clearTwoBuffersKernel = cl::Kernel(utilities, "clearTwoBuffers");
    clearThreeBuffersKernel = cl::Kernel(utilities, "clearThreeBuffers");
    clearFourBuffersKernel = cl::Kernel(utilities, "clearFourBuffers");
293
294
    clearFiveBuffersKernel = cl::Kernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = cl::Kernel(utilities, "clearSixBuffers");
295
    reduceReal4Kernel = cl::Kernel(utilities, "reduceReal4Buffer");
296
297
    if (supports64BitGlobalAtomics)
        reduceForcesKernel = cl::Kernel(utilities, "reduceForces");
298
299
300

    // Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.

301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
    if (!useDoublePrecision) {
        cl::Kernel accuracyKernel(utilities, "determineNativeAccuracy");
        OpenCLArray valuesArray(*this, 20, sizeof(mm_float8), "values");
        vector<mm_float8> values(valuesArray.getSize());
        float nextValue = 1e-4f;
        for (int i = 0; i < (int) values.size(); ++i) {
            values[i].s0 = nextValue;
            nextValue *= (float) M_PI;
        }
        valuesArray.upload(values);
        accuracyKernel.setArg<cl::Buffer>(0, valuesArray.getDeviceBuffer());
        accuracyKernel.setArg<cl_int>(1, values.size());
        executeKernel(accuracyKernel, values.size());
        valuesArray.download(values);
        double maxSqrtError = 0.0, maxRsqrtError = 0.0, maxRecipError = 0.0, maxExpError = 0.0, maxLogError = 0.0;
        for (int i = 0; i < (int) values.size(); ++i) {
            double v = values[i].s0;
            double correctSqrt = sqrt(v);
            maxSqrtError = max(maxSqrtError, fabs(correctSqrt-values[i].s1)/correctSqrt);
            maxRsqrtError = max(maxRsqrtError, fabs(1.0/correctSqrt-values[i].s2)*correctSqrt);
            maxRecipError = max(maxRecipError, fabs(1.0/v-values[i].s3)/values[i].s3);
            maxExpError = max(maxExpError, fabs(exp(v)-values[i].s4)/values[i].s4);
            maxLogError = max(maxLogError, fabs(log(v)-values[i].s5)/values[i].s5);
        }
        compilationDefines["SQRT"] = (maxSqrtError < 1e-6) ? "native_sqrt" : "sqrt";
        compilationDefines["RSQRT"] = (maxRsqrtError < 1e-6) ? "native_rsqrt" : "rsqrt";
        compilationDefines["RECIP"] = (maxRecipError < 1e-6) ? "native_recip" : "1.0f/";
        compilationDefines["EXP"] = (maxExpError < 1e-6) ? "native_exp" : "exp";
        compilationDefines["LOG"] = (maxLogError < 1e-6) ? "native_log" : "log";
    }
    else {
        compilationDefines["SQRT"] = "sqrt";
        compilationDefines["RSQRT"] = "rsqrt";
        compilationDefines["RECIP"] = "1.0/";
        compilationDefines["EXP"] = "exp";
        compilationDefines["LOG"] = "log";
    }
338
    
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
    // Set defines for applying periodic boundary conditions.
    
    Vec3 boxVectors[3];
    system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
    boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
                      boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 ||
                      boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0);
    if (boxIsTriclinic) {
        compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
            "{"
            "real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f); \\\n"
            "delta.xyz -= scale3*periodicBoxVecZ.xyz; \\\n"
            "real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f); \\\n"
            "delta.xy -= scale2*periodicBoxVecY.xy; \\\n"
            "real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f); \\\n"
            "delta.x -= scale1*periodicBoxVecX.x;}";
        compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
            "{"
            "real scale3 = floor(pos.z*invPeriodicBoxSize.z); \\\n"
            "pos.xyz -= scale3*periodicBoxVecZ.xyz; \\\n"
            "real scale2 = floor(pos.y*invPeriodicBoxSize.y); \\\n"
            "pos.xy -= scale2*periodicBoxVecY.xy; \\\n"
            "real scale1 = floor(pos.x*invPeriodicBoxSize.x); \\\n"
            "pos.x -= scale1*periodicBoxVecX.x;}";
        compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
            "{"
            "real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f); \\\n"
            "pos.x -= scale3*periodicBoxVecZ.x; \\\n"
            "pos.y -= scale3*periodicBoxVecZ.y; \\\n"
            "pos.z -= scale3*periodicBoxVecZ.z; \\\n"
            "real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f); \\\n"
            "pos.x -= scale2*periodicBoxVecY.x; \\\n"
            "pos.y -= scale2*periodicBoxVecY.y; \\\n"
            "real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f); \\\n"
            "pos.x -= scale1*periodicBoxVecX.x;}";
    }
    else {
        compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
            "delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;";
        compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
            "pos.xyz -= floor(pos.xyz*invPeriodicBoxSize.xyz)*periodicBoxSize.xyz;";
        compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
            "{"
            "pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
            "pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
            "pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
    }

387
388
389
    // Create the work thread used for parallelization when running on multiple devices.
    
    thread = new WorkThread();
Peter Eastman's avatar
Peter Eastman committed
390
    
391
    // Create utilities objects.
Peter Eastman's avatar
Peter Eastman committed
392
    
393
394
    bonded = new OpenCLBondedUtilities(*this);
    nonbonded = new OpenCLNonbondedUtilities(*this);
Peter Eastman's avatar
Peter Eastman committed
395
    integration = new OpenCLIntegrationUtilities(*this, system);
396
    expression = new OpenCLExpressionUtilities(*this);
397
398
399
}

OpenCLContext::~OpenCLContext() {
400
401
    for (int i = 0; i < (int) forces.size(); i++)
        delete forces[i];
402
403
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        delete reorderListeners[i];
404
405
406
407
    for (int i = 0; i < (int) preComputations.size(); i++)
        delete preComputations[i];
    for (int i = 0; i < (int) postComputations.size(); i++)
        delete postComputations[i];
408
409
    if (pinnedBuffer != NULL)
        delete pinnedBuffer;
410
411
    if (posq != NULL)
        delete posq;
Peter Eastman's avatar
Bug fix  
Peter Eastman committed
412
413
    if (posqCorrection != NULL)
        delete posqCorrection;
414
415
416
417
418
419
    if (velm != NULL)
        delete velm;
    if (force != NULL)
        delete force;
    if (forceBuffers != NULL)
        delete forceBuffers;
420
421
    if (longForceBuffer != NULL)
        delete longForceBuffer;
422
423
    if (energyBuffer != NULL)
        delete energyBuffer;
424
425
    if (atomIndexDevice != NULL)
        delete atomIndexDevice;
426
427
    if (integration != NULL)
        delete integration;
428
429
    if (expression != NULL)
        delete expression;
Peter Eastman's avatar
Peter Eastman committed
430
431
    if (bonded != NULL)
        delete bonded;
432
433
    if (nonbonded != NULL)
        delete nonbonded;
434
435
    if (thread != NULL)
        delete thread;
436
437
}

438
void OpenCLContext::initialize() {
Peter Eastman's avatar
Peter Eastman committed
439
    bonded->initialize(system);
440
    numForceBuffers = platformData.contexts.size();
Peter Eastman's avatar
Peter Eastman committed
441
    numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
442
443
    for (int i = 0; i < (int) forces.size(); i++)
        numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
444
445
446
447
448
449
450
451
452
453
    if (useDoublePrecision) {
        forceBuffers = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
        force = OpenCLArray::create<mm_double4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
        energyBuffer = OpenCLArray::create<cl_double>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
    }
    else {
        forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
        force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
        energyBuffer = OpenCLArray::create<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
    }
454
    if (supports64BitGlobalAtomics) {
455
        longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer");
456
457
458
459
        reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
        reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
        reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
        reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
460
        addAutoclearBuffer(*longForceBuffer);
461
    }
462
463
    addAutoclearBuffer(*forceBuffers);
    addAutoclearBuffer(*energyBuffer);
464
    int bufferBytes = max(velm->getSize()*velm->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize());
465
    pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
466
    pinnedMemory = currentQueue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
467
468
469
470
471
472
473
474
    for (int i = 0; i < numAtoms; i++) {
        double mass = system.getParticleMass(i);
        if (useDoublePrecision || useMixedPrecision)
            ((mm_double4*) pinnedMemory)[i] = mm_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
        else
            ((mm_float4*) pinnedMemory)[i] = mm_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (cl_float) (1.0/mass));
    }
    velm->upload(pinnedMemory);
475
476
    atomIndexDevice = OpenCLArray::create<cl_int>(*this, paddedNumAtoms, "atomIndexDevice");
    atomIndex.resize(paddedNumAtoms);
477
    for (int i = 0; i < paddedNumAtoms; ++i)
478
479
        atomIndex[i] = i;
    atomIndexDevice->upload(atomIndex);
480
    findMoleculeGroups();
481
    nonbonded->initialize(system);
482
483
484
485
486
487
}

void OpenCLContext::addForce(OpenCLForceInfo* force) {
    forces.push_back(force);
}

488
489
string OpenCLContext::replaceStrings(const string& input, const std::map<std::string, std::string>& replacements) const {
    string result = input;
490
491
492
    for (map<string, string>::const_iterator iter = replacements.begin(); iter != replacements.end(); iter++) {
        int index = -1;
        do {
493
494
495
496
            index = result.find(iter->first);
            if (index != result.npos)
                result.replace(index, iter->first.size(), iter->second);
        } while (index != result.npos);
497
    }
498
    return result;
499
500
}

501
502
cl::Program OpenCLContext::createProgram(const string source, const char* optimizationFlags) {
    return createProgram(source, map<string, string>(), optimizationFlags);
503
504
}

505
cl::Program OpenCLContext::createProgram(const string source, const map<string, string>& defines, const char* optimizationFlags) {
Peter Eastman's avatar
Peter Eastman committed
506
    string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
507
508
509
510
511
512
513
514
515
516
517
    stringstream src;
    if (!options.empty())
        src << "// Compilation Options: " << options << endl << endl;
    for (map<string, string>::const_iterator iter = compilationDefines.begin(); iter != compilationDefines.end(); ++iter) {
        src << "#define " << iter->first;
        if (!iter->second.empty())
            src << " " << iter->second;
        src << endl;
    }
    if (!compilationDefines.empty())
        src << endl;
518
519
520
521
522
    if (supportsDoublePrecision)
        src << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
    if (useDoublePrecision) {
        src << "typedef double real;\n";
        src << "typedef double2 real2;\n";
523
        src << "typedef double3 real3;\n";
524
525
526
527
528
        src << "typedef double4 real4;\n";
    }
    else {
        src << "typedef float real;\n";
        src << "typedef float2 real2;\n";
529
        src << "typedef float3 real3;\n";
530
531
532
533
534
        src << "typedef float4 real4;\n";
    }
    if (useDoublePrecision || useMixedPrecision) {
        src << "typedef double mixed;\n";
        src << "typedef double2 mixed2;\n";
535
        src << "typedef double3 mixed3;\n";
536
537
538
539
540
        src << "typedef double4 mixed4;\n";
    }
    else {
        src << "typedef float mixed;\n";
        src << "typedef float2 mixed2;\n";
541
        src << "typedef float3 mixed3;\n";
542
543
        src << "typedef float4 mixed4;\n";
    }
544
545
546
547
548
549
550
551
552
553
554
555
556
    for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
        src << "#define " << iter->first;
        if (!iter->second.empty())
            src << " " << iter->second;
        src << endl;
    }
    if (!defines.empty())
        src << endl;
    src << source << endl;
    // Get length before using c_str() to avoid length() call invalidating the c_str() value.
    string src_string = src.str();
    ::size_t src_length = src_string.length();
    cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
557
558
    cl::Program program(context, sources);
    try {
559
        program.build(vector<cl::Device>(1, device), options.c_str());
560
561
562
563
564
565
    } catch (cl::Error err) {
        throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
    }
    return program;
}

566
567
568
569
570
571
572
573
574
575
576
577
cl::CommandQueue& OpenCLContext::getQueue() {
    return currentQueue;
}

void OpenCLContext::setQueue(cl::CommandQueue& queue) {
    currentQueue = queue;
}

void OpenCLContext::restoreDefaultQueue() {
    currentQueue = defaultQueue;
}

578
string OpenCLContext::doubleToString(double value) const {
579
580
581
582
583
584
585
586
    stringstream s;
    s.precision(useDoublePrecision ? 16 : 8);
    s << scientific << value;
    if (!useDoublePrecision)
        s << "f";
    return s.str();
}

587
string OpenCLContext::intToString(int value) const {
588
589
590
591
592
    stringstream s;
    s << value;
    return s.str();
}

593
594
595
596
void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
    if (blockSize == -1)
        blockSize = ThreadBlockSize;
    int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
597
    try {
598
        currentQueue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
599
600
601
    }
    catch (cl::Error err) {
        stringstream str;
602
        str<<"Error invoking kernel "<<kernel.getInfo<CL_KERNEL_FUNCTION_NAME>()<<": "<<err.what()<<" ("<<err.err()<<")";
603
604
605
606
        throw OpenMMException(str.str());
    }
}

607
void OpenCLContext::clearBuffer(OpenCLArray& array) {
608
    clearBuffer(array.getDeviceBuffer(), array.getSize()*array.getElementSize());
609
610
}

611
void OpenCLContext::clearBuffer(cl::Memory& memory, int size) {
612
    int words = size/4;
613
    clearBufferKernel.setArg<cl::Memory>(0, memory);
614
615
616
617
618
619
    clearBufferKernel.setArg<cl_int>(1, words);
    executeKernel(clearBufferKernel, words, 128);
}

void OpenCLContext::addAutoclearBuffer(OpenCLArray& array) {
    addAutoclearBuffer(array.getDeviceBuffer(), array.getSize()*array.getElementSize());
620
621
}

622
623
void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) {
    autoclearBuffers.push_back(&memory);
624
    autoclearBufferSizes.push_back(size/4);
625
626
627
628
629
}

void OpenCLContext::clearAutoclearBuffers() {
    int base = 0;
    int total = autoclearBufferSizes.size();
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
    while (total-base >= 6) {
        clearSixBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
        clearSixBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
        clearSixBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
        clearSixBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
        clearSixBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
        clearSixBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
        clearSixBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
        clearSixBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
        clearSixBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
        clearSixBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
        clearSixBuffersKernel.setArg<cl::Memory>(10, *autoclearBuffers[base+5]);
        clearSixBuffersKernel.setArg<cl_int>(11, autoclearBufferSizes[base+5]);
        executeKernel(clearSixBuffersKernel, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
        base += 6;
    }
    if (total-base == 5) {
        clearFiveBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
        clearFiveBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
        clearFiveBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
        clearFiveBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
        clearFiveBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
        clearFiveBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
        clearFiveBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
        clearFiveBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
        clearFiveBuffersKernel.setArg<cl::Memory>(8, *autoclearBuffers[base+4]);
        clearFiveBuffersKernel.setArg<cl_int>(9, autoclearBufferSizes[base+4]);
        executeKernel(clearFiveBuffersKernel, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
    }
    else if (total-base == 4) {
660
661
662
663
664
665
666
667
        clearFourBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
        clearFourBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
        clearFourBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
        clearFourBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
        clearFourBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
        clearFourBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
        clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
        clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
668
        executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
669
    }
670
    else if (total-base == 3) {
671
672
673
674
675
676
        clearThreeBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
        clearThreeBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
        clearThreeBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
        clearThreeBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
        clearThreeBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
        clearThreeBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
677
        executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
678
679
680
681
682
683
    }
    else if (total-base == 2) {
        clearTwoBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
        clearTwoBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
        clearTwoBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
        clearTwoBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
684
        executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
685
686
687
688
689
690
    }
    else if (total-base == 1) {
        clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
    }
}

691
692
693
694
695
696
697
void OpenCLContext::reduceForces() {
    if (supports64BitGlobalAtomics)
        executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
    else
        reduceBuffer(*forceBuffers, numForceBuffers);
}

698
void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
699
    int bufferSize = array.getSize()/numBuffers;
700
701
702
703
    reduceReal4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
    reduceReal4Kernel.setArg<cl_int>(1, bufferSize);
    reduceReal4Kernel.setArg<cl_int>(2, numBuffers);
    executeKernel(reduceReal4Kernel, bufferSize, 128);
704
}
705

706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
/**
 * This class ensures that atom reordering doesn't break virtual sites.
 */
class OpenCLContext::VirtualSiteInfo : public OpenCLForceInfo {
public:
    VirtualSiteInfo(const System& system) : OpenCLForceInfo(0) {
        for (int i = 0; i < system.getNumParticles(); i++) {
            if (system.isVirtualSite(i)) {
                siteTypes.push_back(&typeid(system.getVirtualSite(i)));
                vector<int> particles;
                particles.push_back(i);
                for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
                    particles.push_back(system.getVirtualSite(i).getParticle(j));
                siteParticles.push_back(particles);
                vector<double> weights;
                if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
                    // A two particle average.

                    const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
                    weights.push_back(site.getWeight(0));
                    weights.push_back(site.getWeight(1));
                }
                else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
                    // A three particle average.

                    const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
                    weights.push_back(site.getWeight(0));
                    weights.push_back(site.getWeight(1));
                    weights.push_back(site.getWeight(2));
                }
                else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
                    // An out of plane site.

                    const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
                    weights.push_back(site.getWeight12());
                    weights.push_back(site.getWeight13());
                    weights.push_back(site.getWeightCross());
                }
                siteWeights.push_back(weights);
            }
        }
    }
    int getNumParticleGroups() {
        return siteTypes.size();
    }
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        particles = siteParticles[index];
    }
    bool areGroupsIdentical(int group1, int group2) {
        if (siteTypes[group1] != siteTypes[group2])
            return false;
        int numParticles = siteWeights[group1].size();
        if (siteWeights[group2].size() != numParticles)
            return false;
        for (int i = 0; i < numParticles; i++)
            if (siteWeights[group1][i] != siteWeights[group2][i])
                return false;
        return true;
    }
private:
    vector<const type_info*> siteTypes;
    vector<vector<int> > siteParticles;
    vector<vector<double> > siteWeights;
};


772
773
void OpenCLContext::findMoleculeGroups() {
    // The first time this is called, we need to identify all the molecules in the system.
774
    
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
    if (moleculeGroups.size() == 0) {
        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.

        addForce(new VirtualSiteInfo(system));

        // First make a list of every other atom to which each atom is connect by a constraint or force group.

        vector<vector<int> > atomBonds(system.getNumParticles());
        for (int i = 0; i < system.getNumConstraints(); i++) {
            int particle1, particle2;
            double distance;
            system.getConstraintParameters(i, particle1, particle2, distance);
            atomBonds[particle1].push_back(particle2);
            atomBonds[particle2].push_back(particle1);
        }
        for (int i = 0; i < (int) forces.size(); i++) {
            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
                vector<int> particles;
                forces[i]->getParticlesInGroup(j, particles);
                for (int k = 0; k < (int) particles.size(); k++)
                    for (int m = 0; m < (int) particles.size(); m++)
                        if (k != m)
                            atomBonds[particles[k]].push_back(particles[m]);
            }
799
800
        }

801
        // Now identify atoms by which molecule they belong to.
802

803
804
805
806
807
808
        vector<vector<int> > atomIndices = ContextImpl::findMolecules(numAtoms, atomBonds);
        int numMolecules = atomIndices.size();
        vector<int> atomMolecule(numAtoms);
        for (int i = 0; i < (int) atomIndices.size(); i++)
            for (int j = 0; j < (int) atomIndices[i].size(); j++)
                atomMolecule[atomIndices[i][j]] = i;
809

810
        // Construct a description of each molecule.
811

812
813
814
815
        molecules.resize(numMolecules);
        for (int i = 0; i < numMolecules; i++) {
            molecules[i].atoms = atomIndices[i];
            molecules[i].groups.resize(forces.size());
816
        }
817
818
819
820
821
822
823
824
825
826
827
828
829
        for (int i = 0; i < system.getNumConstraints(); i++) {
            int particle1, particle2;
            double distance;
            system.getConstraintParameters(i, particle1, particle2, distance);
            molecules[atomMolecule[particle1]].constraints.push_back(i);
        }
        for (int i = 0; i < (int) forces.size(); i++)
            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
                vector<int> particles;
                forces[i]->getParticlesInGroup(j, particles);
                molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
            }
    }
830
831
832
833
834

    // Sort them into groups of identical molecules.

    vector<Molecule> uniqueMolecules;
    vector<vector<int> > moleculeInstances;
835
    vector<vector<int> > moleculeOffsets;
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
    for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
        Molecule& mol = molecules[molIndex];

        // See if it is identical to another molecule.

        bool isNew = true;
        for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
            Molecule& mol2 = uniqueMolecules[j];
            bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());

            // See if the atoms are identical.

            int atomOffset = mol2.atoms[0]-mol.atoms[0];
            for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
                if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
                    identical = false;
852
                for (int k = 0; k < (int) forces.size(); k++)
853
854
855
856
857
858
859
860
861
862
863
                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
                        identical = false;
            }
            
            // See if the constraints are identical.

            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
                int c1particle1, c1particle2, c2particle1, c2particle2;
                double distance1, distance2;
                system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
                system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
864
                if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
865
866
867
868
869
                    identical = false;
            }

            // See if the force groups are identical.

870
            for (int i = 0; i < (int) forces.size() && identical; i++) {
871
872
                if (mol.groups[i].size() != mol2.groups[i].size())
                    identical = false;
873
                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
874
875
876
877
                    if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
                        identical = false;
            }
            if (identical) {
878
879
                moleculeInstances[j].push_back(molIndex);
                moleculeOffsets[j].push_back(mol.atoms[0]);
880
881
882
883
884
885
                isNew = false;
            }
        }
        if (isNew) {
            uniqueMolecules.push_back(mol);
            moleculeInstances.push_back(vector<int>());
886
887
888
            moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
            moleculeOffsets.push_back(vector<int>());
            moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
889
890
891
892
893
894
        }
    }
    moleculeGroups.resize(moleculeInstances.size());
    for (int i = 0; i < (int) moleculeInstances.size(); i++)
    {
        moleculeGroups[i].instances = moleculeInstances[i];
895
        moleculeGroups[i].offsets = moleculeOffsets[i];
896
897
898
899
900
901
902
        vector<int>& atoms = uniqueMolecules[i].atoms;
        moleculeGroups[i].atoms.resize(atoms.size());
        for (int j = 0; j < (int) atoms.size(); j++)
            moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
    }
}

903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
void OpenCLContext::invalidateMolecules() {
    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
        return;
    bool valid = true;
    for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
        MoleculeGroup& mol = moleculeGroups[group];
        vector<int>& instances = mol.instances;
        vector<int>& offsets = mol.offsets;
        vector<int>& atoms = mol.atoms;
        int numMolecules = instances.size();
        Molecule& m1 = molecules[instances[0]];
        int offset1 = offsets[0];
        for (int j = 1; valid && j < numMolecules; j++) {
            // See if the atoms are identical.

            Molecule& m2 = molecules[instances[j]];
            int offset2 = offsets[j];
            for (int i = 0; i < (int) atoms.size() && valid; i++) {
                for (int k = 0; k < (int) forces.size(); k++)
                    if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
                        valid = false;
            }

            // See if the force groups are identical.

            for (int i = 0; i < (int) forces.size() && valid; i++) {
                for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
                    if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
                        valid = false;
            }
        }
    }
    if (valid)
        return;
    
    // The list of which molecules are identical is no longer valid.  We need to restore the
    // atoms to their original order, rebuild the list of identical molecules, and sort them
    // again.
    
    vector<mm_int4> newCellOffsets(numAtoms);
943
944
    if (useDoublePrecision) {
        vector<mm_double4> oldPosq(paddedNumAtoms);
945
        vector<mm_double4> newPosq(paddedNumAtoms, mm_double4(0,0,0,0));
946
        vector<mm_double4> oldVelm(paddedNumAtoms);
947
        vector<mm_double4> newVelm(paddedNumAtoms, mm_double4(0,0,0,0));
948
949
950
951
952
953
954
955
956
957
958
959
960
        posq->download(oldPosq);
        velm->download(oldVelm);
        for (int i = 0; i < numAtoms; i++) {
            int index = atomIndex[i];
            newPosq[index] = oldPosq[i];
            newVelm[index] = oldVelm[i];
            newCellOffsets[index] = posCellOffsets[i];
        }
        posq->upload(newPosq);
        velm->upload(newVelm);
    }
    else if (useMixedPrecision) {
        vector<mm_float4> oldPosq(paddedNumAtoms);
961
        vector<mm_float4> newPosq(paddedNumAtoms, mm_float4(0,0,0,0));
962
        vector<mm_float4> oldPosqCorrection(paddedNumAtoms);
963
        vector<mm_float4> newPosqCorrection(paddedNumAtoms, mm_float4(0,0,0,0));
964
        vector<mm_double4> oldVelm(paddedNumAtoms);
965
        vector<mm_double4> newVelm(paddedNumAtoms, mm_double4(0,0,0,0));
966
967
968
969
970
971
972
973
974
975
        posq->download(oldPosq);
        velm->download(oldVelm);
        for (int i = 0; i < numAtoms; i++) {
            int index = atomIndex[i];
            newPosq[index] = oldPosq[i];
            newPosqCorrection[index] = oldPosqCorrection[i];
            newVelm[index] = oldVelm[i];
            newCellOffsets[index] = posCellOffsets[i];
        }
        posq->upload(newPosq);
Peter Eastman's avatar
Peter Eastman committed
976
        posqCorrection->upload(newPosqCorrection);
977
978
979
980
        velm->upload(newVelm);
    }
    else {
        vector<mm_float4> oldPosq(paddedNumAtoms);
981
        vector<mm_float4> newPosq(paddedNumAtoms, mm_float4(0,0,0,0));
982
        vector<mm_float4> oldVelm(paddedNumAtoms);
983
        vector<mm_float4> newVelm(paddedNumAtoms, mm_float4(0,0,0,0));
984
985
986
987
988
989
990
991
992
993
        posq->download(oldPosq);
        velm->download(oldVelm);
        for (int i = 0; i < numAtoms; i++) {
            int index = atomIndex[i];
            newPosq[index] = oldPosq[i];
            newVelm[index] = oldVelm[i];
            newCellOffsets[index] = posCellOffsets[i];
        }
        posq->upload(newPosq);
        velm->upload(newVelm);
994
995
    }
    for (int i = 0; i < numAtoms; i++) {
996
        atomIndex[i] = i;
997
998
        posCellOffsets[i] = newCellOffsets[i];
    }
999
    atomIndexDevice->upload(atomIndex);
1000
    findMoleculeGroups();
1001
1002
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        reorderListeners[i]->execute();
1003
    reorderAtoms();
1004
1005
}

1006
1007
1008
1009
void OpenCLContext::reorderAtoms() {
    atomsWereReordered = false;
    if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff() || stepsSinceReorder < 100) {
        stepsSinceReorder++;
1010
        return;
1011
    }
Peter Eastman's avatar
Peter Eastman committed
1012
    atomsWereReordered = true;
1013
    stepsSinceReorder = 0;
1014
    if (useDoublePrecision)
1015
        reorderAtomsImpl<cl_double, mm_double4, cl_double, mm_double4>();
1016
    else if (useMixedPrecision)
1017
        reorderAtomsImpl<cl_float, mm_float4, cl_double, mm_double4>();
1018
    else
1019
1020
        reorderAtomsImpl<cl_float, mm_float4, cl_float, mm_float4>();
    nonbonded->updateNeighborListSize();
1021
1022
1023
}

template <class Real, class Real4, class Mixed, class Mixed4>
1024
void OpenCLContext::reorderAtomsImpl() {
1025
1026
1027

    // Find the range of positions and the number of bins along each axis.

1028
1029
1030
    vector<Real4> oldPosq(paddedNumAtoms);
    vector<Real4> oldPosqCorrection(paddedNumAtoms);
    vector<Mixed4> oldVelm(paddedNumAtoms);
1031
1032
    posq->download(oldPosq);
    velm->download(oldVelm);
1033
1034
1035
1036
1037
    if (useMixedPrecision)
        posqCorrection->download(oldPosqCorrection);
    Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
    Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
    Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
1038
1039
    if (nonbonded->getUsePeriodic()) {
        minx = miny = minz = 0.0;
1040
1041
1042
        maxx = periodicBoxSizeDouble.x;
        maxy = periodicBoxSizeDouble.y;
        maxz = periodicBoxSizeDouble.z;
1043
1044
1045
    }
    else {
        for (int i = 1; i < numAtoms; i++) {
1046
            const Real4& pos = oldPosq[i];
1047
1048
1049
1050
1051
1052
            minx = min(minx, pos.x);
            maxx = max(maxx, pos.x);
            miny = min(miny, pos.y);
            maxy = max(maxy, pos.y);
            minz = min(minz, pos.z);
            maxz = max(maxz, pos.z);
1053
1054
1055
1056
1057
1058
        }
    }

    // Loop over each group of identical molecules and reorder them.

    vector<int> originalIndex(numAtoms);
1059
1060
1061
    vector<Real4> newPosq(paddedNumAtoms, Real4(0,0,0,0));
    vector<Real4> newPosqCorrection(paddedNumAtoms, Real4(0,0,0,0));
    vector<Mixed4> newVelm(paddedNumAtoms, Mixed4(0,0,0,0));
1062
1063
1064
1065
1066
    vector<mm_int4> newCellOffsets(numAtoms);
    for (int group = 0; group < (int) moleculeGroups.size(); group++) {
        // Find the center of each molecule.

        MoleculeGroup& mol = moleculeGroups[group];
1067
        int numMolecules = mol.offsets.size();
1068
        vector<int>& atoms = mol.atoms;
1069
1070
        vector<Real4> molPos(numMolecules);
        Real invNumAtoms = (Real) (1.0/atoms.size());
1071
1072
1073
1074
1075
        for (int i = 0; i < numMolecules; i++) {
            molPos[i].x = 0.0f;
            molPos[i].y = 0.0f;
            molPos[i].z = 0.0f;
            for (int j = 0; j < (int)atoms.size(); j++) {
1076
                int atom = atoms[j]+mol.offsets[i];
1077
                const Real4& pos = oldPosq[atom];
1078
1079
1080
                molPos[i].x += pos.x;
                molPos[i].y += pos.y;
                molPos[i].z += pos.z;
1081
            }
1082
1083
1084
            molPos[i].x *= invNumAtoms;
            molPos[i].y *= invNumAtoms;
            molPos[i].z *= invNumAtoms;
1085
1086
1087
1088
1089
        }
        if (nonbonded->getUsePeriodic()) {
            // Move each molecule position into the same box.

            for (int i = 0; i < numMolecules; i++) {
Peter Eastman's avatar
Peter Eastman committed
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
                Real4 center = molPos[i];
                int zcell = (int) floor(center.z*invPeriodicBoxSize.z);
                center.x -= zcell*periodicBoxVecZ.x;
                center.y -= zcell*periodicBoxVecZ.y;
                center.z -= zcell*periodicBoxVecZ.z;
                int ycell = (int) floor(center.y*invPeriodicBoxSize.y);
                center.x -= ycell*periodicBoxVecY.x;
                center.y -= ycell*periodicBoxVecY.y;
                int xcell = (int) floor(center.x*invPeriodicBoxSize.x);
                center.x -= xcell*periodicBoxVecX.x;
                if (xcell != 0 || ycell != 0 || zcell != 0) {
                    Real dx = molPos[i].x-center.x;
                    Real dy = molPos[i].y-center.y;
                    Real dz = molPos[i].z-center.z;
                    molPos[i] = center;
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
                    for (int j = 0; j < (int) atoms.size(); j++) {
                        int atom = atoms[j]+mol.offsets[i];
                        Real4 p = oldPosq[atom];
                        p.x -= dx;
                        p.y -= dy;
                        p.z -= dz;
                        oldPosq[atom] = p;
                        posCellOffsets[atom].x -= xcell;
                        posCellOffsets[atom].y -= ycell;
                        posCellOffsets[atom].z -= zcell;
1115
1116
1117
1118
1119
1120
1121
1122
                    }
                }
            }
        }

        // Select a bin for each molecule, then sort them by bin.

        bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
1123
        Real binWidth;
1124
        if (useHilbert)
1125
            binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
1126
        else
1127
1128
            binWidth = (Real) (0.2*nonbonded->getCutoffDistance());
        Real invBinWidth = (Real) (1.0/binWidth);
1129
1130
        int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
        int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
1131
1132
1133
        vector<pair<int, int> > molBins(numMolecules);
        bitmask_t coords[3];
        for (int i = 0; i < numMolecules; i++) {
1134
1135
1136
            int x = (int) ((molPos[i].x-minx)*invBinWidth);
            int y = (int) ((molPos[i].y-miny)*invBinWidth);
            int z = (int) ((molPos[i].z-minz)*invBinWidth);
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
            int bin;
            if (useHilbert) {
                coords[0] = x;
                coords[1] = y;
                coords[2] = z;
                bin = (int) hilbert_c2i(3, 8, coords);
            }
            else {
                int yodd = y&1;
                int zodd = z&1;
                bin = z*xbins*ybins;
                bin += (zodd ? ybins-y : y)*xbins;
                bin += (yodd ? xbins-x : x);
            }
            molBins[i] = pair<int, int>(bin, i);
        }
        sort(molBins.begin(), molBins.end());

        // Reorder the atoms.

        for (int i = 0; i < numMolecules; i++) {
            for (int j = 0; j < (int)atoms.size(); j++) {
1159
1160
                int oldIndex = mol.offsets[molBins[i].second]+atoms[j];
                int newIndex = mol.offsets[i]+atoms[j];
1161
1162
                originalIndex[newIndex] = atomIndex[oldIndex];
                newPosq[newIndex] = oldPosq[oldIndex];
1163
1164
                if (useMixedPrecision)
                    newPosqCorrection[newIndex] = oldPosqCorrection[oldIndex];
1165
                newVelm[newIndex] = oldVelm[oldIndex];
1166
1167
1168
1169
1170
1171
1172
1173
                newCellOffsets[newIndex] = posCellOffsets[oldIndex];
            }
        }
    }

    // Update the streams.

    for (int i = 0; i < numAtoms; i++) {
1174
        atomIndex[i] = originalIndex[i];
1175
1176
        posCellOffsets[i] = newCellOffsets[i];
    }
1177
    posq->upload(newPosq);
1178
1179
    if (useMixedPrecision)
        posqCorrection->upload(newPosqCorrection);
1180
1181
    velm->upload(newVelm);
    atomIndexDevice->upload(atomIndex);
1182
1183
1184
1185
1186
1187
    for (int i = 0; i < (int) reorderListeners.size(); i++)
        reorderListeners[i]->execute();
}

void OpenCLContext::addReorderListener(ReorderListener* listener) {
    reorderListeners.push_back(listener);
1188
}
1189

1190
1191
1192
1193
1194
1195
1196
1197
void OpenCLContext::addPreComputation(ForcePreComputation* computation) {
    preComputations.push_back(computation);
}

void OpenCLContext::addPostComputation(ForcePostComputation* computation) {
    postComputations.push_back(computation);
}

1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
struct OpenCLContext::WorkThread::ThreadData {
    ThreadData(std::queue<OpenCLContext::WorkTask*>& tasks, bool& waiting,  bool& finished,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :
        tasks(tasks), waiting(waiting), finished(finished), queueLock(queueLock),
        waitForTaskCondition(waitForTaskCondition), queueEmptyCondition(queueEmptyCondition) {
    }
    std::queue<OpenCLContext::WorkTask*>& tasks;
    bool& waiting;
    bool& finished;
    pthread_mutex_t& queueLock;
    pthread_cond_t& waitForTaskCondition;
    pthread_cond_t& queueEmptyCondition;
};

static void* threadBody(void* args) {
    OpenCLContext::WorkThread::ThreadData& data = *reinterpret_cast<OpenCLContext::WorkThread::ThreadData*>(args);
    while (!data.finished || data.tasks.size() > 0) {
        pthread_mutex_lock(&data.queueLock);
        while (data.tasks.empty() && !data.finished) {
            data.waiting = true;
            pthread_cond_signal(&data.queueEmptyCondition);
            pthread_cond_wait(&data.waitForTaskCondition, &data.queueLock);
        }
        OpenCLContext::WorkTask* task = NULL;
        if (!data.tasks.empty()) {
            data.waiting = false;
            task = data.tasks.front();
            data.tasks.pop();
        }
        pthread_mutex_unlock(&data.queueLock);
        if (task != NULL) {
            task->execute();
            delete task;
        }
    }
    data.waiting = true;
    pthread_cond_signal(&data.queueEmptyCondition);
    delete &data;
    return 0;
}

OpenCLContext::WorkThread::WorkThread() : waiting(true), finished(false) {
    pthread_mutex_init(&queueLock, NULL);
    pthread_cond_init(&waitForTaskCondition, NULL);
    pthread_cond_init(&queueEmptyCondition, NULL);
    ThreadData* data = new ThreadData(tasks, waiting, finished, queueLock, waitForTaskCondition, queueEmptyCondition);
    pthread_create(&thread, NULL, threadBody, data);
}

OpenCLContext::WorkThread::~WorkThread() {
    pthread_mutex_lock(&queueLock);
    finished = true;
    pthread_cond_broadcast(&waitForTaskCondition);
    pthread_mutex_unlock(&queueLock);
    pthread_join(thread, NULL);
    pthread_mutex_destroy(&queueLock);
    pthread_cond_destroy(&waitForTaskCondition);
    pthread_cond_destroy(&queueEmptyCondition);
}

void OpenCLContext::WorkThread::addTask(OpenCLContext::WorkTask* task) {
    pthread_mutex_lock(&queueLock);
    tasks.push(task);
    waiting = false;
    pthread_cond_signal(&waitForTaskCondition);
    pthread_mutex_unlock(&queueLock);
}

bool OpenCLContext::WorkThread::isWaiting() {
    return waiting;
}

bool OpenCLContext::WorkThread::isFinished() {
    return finished;
}

void OpenCLContext::WorkThread::flush() {
    pthread_mutex_lock(&queueLock);
    while (!waiting)
       pthread_cond_wait(&queueEmptyCondition, &queueLock);
    pthread_mutex_unlock(&queueLock);
}