CudaContext.cpp 35.9 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2009-2023 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

#ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include <cmath>
#include "CudaContext.h"
#include "CudaArray.h"
33
#include "CudaBondedUtilities.h"
34
#include "CudaEvent.h"
35
#include "CudaIntegrationUtilities.h"
36
#include "CudaKernels.h"
37
#include "CudaKernelSources.h"
38
#include "CudaNonbondedUtilities.h"
39
40
#include "CudaProgram.h"
#include "openmm/common/ComputeArray.h"
41
#include "openmm/common/ContextSelector.h"
42
#include "SHA1.h"
43
44
45
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
46
#include "CudaExpressionUtilities.h"
47
#include "openmm/internal/ContextImpl.h"
48
49
50
#include <algorithm>
#include <cstdlib>
#include <fstream>
51
#include <iomanip>
52
#include <iostream>
53
#include <set>
54
55
#include <sstream>
#include <typeinfo>
56
#include <sys/stat.h>
57
#include <cudaProfiler.h>
58
#include <nvrtc.h>
59
60
61
#ifndef WIN32
  #include <unistd.h>
#endif
peastman's avatar
peastman committed
62

63
64
65
66
67

#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
#define CHECK_RESULT2(result, prefix) \
    if (result != CUDA_SUCCESS) { \
        std::stringstream m; \
68
        m<<prefix<<": "<<getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
69
70
        throw OpenMMException(m.str());\
    }
71
72
73
74
75
76
#define CHECK_NVRTC_RESULT(result, prefix) \
    if (result != NVRTC_SUCCESS) { \
        stringstream m; \
        m<<prefix<<": "<<nvrtcGetErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
        throw OpenMMException(m.str());\
    }
77
78
79
80
81

using namespace OpenMM;
using namespace std;

const int CudaContext::ThreadBlockSize = 64;
82
const int CudaContext::TileSize = sizeof(tileflags)*8;
83
84
bool CudaContext::hasInitializedCuda = false;

85
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& tempDir, CudaPlatform::PlatformData& platformData,
86
        CudaContext* originalContext) : ComputeContext(system), currentStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
87
        pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), useBlockingSync(useBlockingSync) {
88
89
    int cudaDriverVersion;
    cuDriverGetVersion(&cudaDriverVersion);
90
91
92
93
94
95
    if (!hasInitializedCuda) {
        CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
        hasInitializedCuda = true;
    }
    if (precision == "single") {
        useDoublePrecision = false;
96
        useMixedPrecision = false;
97
98
99
    }
    else if (precision == "mixed") {
        useDoublePrecision = false;
100
        useMixedPrecision = true;
101
102
103
    }
    else if (precision == "double") {
        useDoublePrecision = true;
104
        useMixedPrecision = false;
105
106
    }
    else
107
        throw OpenMMException("Illegal value for Precision: "+precision);
108
109
    char* cacheVariable = getenv("OPENMM_CACHE_DIR");
    cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
110
#ifdef WIN32
111
    this->tempDir = tempDir+"\\";
112
    cacheDir = cacheDir+"\\";
113
114
#else
    this->tempDir = tempDir+"/";
115
    cacheDir = cacheDir+"/";
116
117
118
#endif
    contextIndex = platformData.contexts.size();
    string errorMessage = "Error initializing Context";
119
120
121
122
123
124
    if (originalContext == NULL) {
        isLinkedContext = false;
        int numDevices;
        CHECK_RESULT(cuDeviceGetCount(&numDevices));
        if (deviceIndex < -1 || deviceIndex >= numDevices)
            throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
125

126
127
128
129
130
131
        vector<int> devicePrecedence;
        if (deviceIndex == -1) {
            devicePrecedence = getDevicePrecedence();
        } else {
            devicePrecedence.push_back(deviceIndex);
        }
132

133
134
135
136
137
138
139
140
141
142
        this->deviceIndex = -1;
        for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
            int trialDeviceIndex = devicePrecedence[i];
            CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
            defaultOptimizationOptions = "--use_fast_math";
            unsigned int flags = CU_CTX_MAP_HOST;
            if (useBlockingSync)
                flags += CU_CTX_SCHED_BLOCKING_SYNC;
            else
                flags += CU_CTX_SCHED_SPIN;
143

144
145
            if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
                this->deviceIndex = trialDeviceIndex;
146
147
                CUcontext popped;
                cuCtxPopCurrent(&popped);
148
149
                break;
            }
150
        }
151
        if (this->deviceIndex == -1) {
152
153
154
155
            if (deviceIndex != -1)
                throw OpenMMException("The requested CUDA device could not be loaded");
            else
                throw OpenMMException("No compatible CUDA device is available");
156
        }
157
158
159
160
161
162
    }
    else {
        isLinkedContext = true;
        context = originalContext->context;
        this->deviceIndex = originalContext->deviceIndex;
        this->device = originalContext->device;
163
    }
164

165
    int major, minor;
166
167
    CHECK_RESULT(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
    CHECK_RESULT(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
peastman's avatar
peastman committed
168
    int numThreadBlocksPerComputeUnit = (major == 6 ? 4 : 6);
169
    if (cudaDriverVersion < 7000) {
170
171
172
173
174
        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
        // its compute capability as 5.2, but the compiler doesn't support
        // anything beyond 5.0.
        if (major == 5)
            minor = 0;
175
176
    }
    if (cudaDriverVersion < 8000) {
177
178
179
180
181
182
183
        // This is a workaround to support Pascal with CUDA 7.5.  It reports
        // its compute capability as 6.x, but the compiler doesn't support
        // anything beyond 5.3.
        if (major == 6) {
            major = 5;
            minor = 3;
        }
184
    }
185
    gpuArchitecture = 10*major+minor;
186
    computeCapability = major+0.1*minor;
187

188
    contextIsValid = true;
189
    ContextSelector selector(*this);
190
    CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
root's avatar
root committed
191
192
193
194
    if (contextIndex > 0) {
        int canAccess;
        cuDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice());
        if (canAccess) {
195
196
197
198
            {
                ContextSelector selector2(*platformData.contexts[0]);
                CHECK_RESULT(cuCtxEnablePeerAccess(getContext(), 0));
            }
root's avatar
root committed
199
200
201
            CHECK_RESULT(cuCtxEnablePeerAccess(platformData.contexts[0]->getContext(), 0));
        }
    }
202
203
204
205
206
207
    numAtoms = system.getNumParticles();
    paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
208
    if (cudaDriverVersion >= 9000) {
Peter Eastman's avatar
Peter Eastman committed
209
210
        compilationDefines["SYNC_WARPS"] = "__syncwarp();";
        compilationDefines["SHFL(var, srcLane)"] = "__shfl_sync(0xffffffff, var, srcLane);";
211
        compilationDefines["BALLOT(var)"] = "__ballot_sync(0xffffffff, var);";
Peter Eastman's avatar
Peter Eastman committed
212
213
214
215
    }
    else {
        compilationDefines["SYNC_WARPS"] = "";
        compilationDefines["SHFL(var, srcLane)"] = "__shfl(var, srcLane);";
216
        compilationDefines["BALLOT(var)"] = "__ballot(var);";
Peter Eastman's avatar
Peter Eastman committed
217
    }
218
    if (useDoublePrecision) {
Peter Eastman's avatar
Peter Eastman committed
219
220
        posq.initialize<double4>(*this, paddedNumAtoms, "posq");
        velm.initialize<double4>(*this, paddedNumAtoms, "velm");
221
        compilationDefines["USE_DOUBLE_PRECISION"] = "1";
222
223
224
        compilationDefines["make_real2"] = "make_double2";
        compilationDefines["make_real3"] = "make_double3";
        compilationDefines["make_real4"] = "make_double4";
225
226
227
        compilationDefines["make_mixed2"] = "make_double2";
        compilationDefines["make_mixed3"] = "make_double3";
        compilationDefines["make_mixed4"] = "make_double4";
228
    }
229
    else if (useMixedPrecision) {
Peter Eastman's avatar
Peter Eastman committed
230
231
232
        posq.initialize<float4>(*this, paddedNumAtoms, "posq");
        posqCorrection.initialize<float4>(*this, paddedNumAtoms, "posqCorrection");
        velm.initialize<double4>(*this, paddedNumAtoms, "velm");
233
234
235
236
237
238
239
240
        compilationDefines["USE_MIXED_PRECISION"] = "1";
        compilationDefines["make_real2"] = "make_float2";
        compilationDefines["make_real3"] = "make_float3";
        compilationDefines["make_real4"] = "make_float4";
        compilationDefines["make_mixed2"] = "make_double2";
        compilationDefines["make_mixed3"] = "make_double3";
        compilationDefines["make_mixed4"] = "make_double4";
    }
241
    else {
Peter Eastman's avatar
Peter Eastman committed
242
243
        posq.initialize<float4>(*this, paddedNumAtoms, "posq");
        velm.initialize<float4>(*this, paddedNumAtoms, "velm");
244
245
246
        compilationDefines["make_real2"] = "make_float2";
        compilationDefines["make_real3"] = "make_float3";
        compilationDefines["make_real4"] = "make_float4";
247
248
249
        compilationDefines["make_mixed2"] = "make_float2";
        compilationDefines["make_mixed3"] = "make_float3";
        compilationDefines["make_mixed4"] = "make_float4";
250
    }
251
252
    force.initialize<long long>(*this, paddedNumAtoms*3, "force");
    posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
253
254
255
256
257
    atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
    atomIndex.resize(paddedNumAtoms);
    for (int i = 0; i < paddedNumAtoms; ++i)
        atomIndex[i] = i;
    atomIndexDevice.upload(atomIndex);
258
259
260
261

    // Create utility kernels that are used in multiple places.

    CUmodule utilities = createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
262
263
264
265
266
267
    clearBufferKernel = getKernel(utilities, "clearBuffer");
    clearTwoBuffersKernel = getKernel(utilities, "clearTwoBuffers");
    clearThreeBuffersKernel = getKernel(utilities, "clearThreeBuffers");
    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
Peter Eastman's avatar
Peter Eastman committed
268
    reduceEnergyKernel = getKernel(utilities, "reduceEnergy");
269
    setChargesKernel = getKernel(utilities, "setCharges");
270
271
272
273
274
275
276
277

    // Set defines based on the requested precision.

    compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "sqrtf";
    compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "rsqrtf";
    compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
    compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
    compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
278
    compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf";
279
280
281
282
283
284
    compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
    compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
    compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf";
    compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
    compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
    compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
285
286
    compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
    compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
287

288
    // Set defines for applying periodic boundary conditions.
289

290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
    Vec3 boxVectors[3];
    system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
    boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
                      boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 ||
                      boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0);
    if (boxIsTriclinic) {
        compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
            "{"
            "real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f); \\\n"
            "delta.x -= scale3*periodicBoxVecZ.x; \\\n"
            "delta.y -= scale3*periodicBoxVecZ.y; \\\n"
            "delta.z -= scale3*periodicBoxVecZ.z; \\\n"
            "real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f); \\\n"
            "delta.x -= scale2*periodicBoxVecY.x; \\\n"
            "delta.y -= scale2*periodicBoxVecY.y; \\\n"
            "real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f); \\\n"
            "delta.x -= scale1*periodicBoxVecX.x;}";
        compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
            "{"
            "real scale3 = floor(pos.z*invPeriodicBoxSize.z); \\\n"
            "pos.x -= scale3*periodicBoxVecZ.x; \\\n"
            "pos.y -= scale3*periodicBoxVecZ.y; \\\n"
            "pos.z -= scale3*periodicBoxVecZ.z; \\\n"
            "real scale2 = floor(pos.y*invPeriodicBoxSize.y); \\\n"
            "pos.x -= scale2*periodicBoxVecY.x; \\\n"
            "pos.y -= scale2*periodicBoxVecY.y; \\\n"
            "real scale1 = floor(pos.x*invPeriodicBoxSize.x); \\\n"
            "pos.x -= scale1*periodicBoxVecX.x;}";
        compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
            "{"
            "real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f); \\\n"
            "pos.x -= scale3*periodicBoxVecZ.x; \\\n"
            "pos.y -= scale3*periodicBoxVecZ.y; \\\n"
            "pos.z -= scale3*periodicBoxVecZ.z; \\\n"
            "real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f); \\\n"
            "pos.x -= scale2*periodicBoxVecY.x; \\\n"
            "pos.y -= scale2*periodicBoxVecY.y; \\\n"
            "real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f); \\\n"
            "pos.x -= scale1*periodicBoxVecX.x;}";
    }
    else {
        compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
            "{"
            "delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
            "delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
            "delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
        compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
            "{"
            "pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; \\\n"
            "pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; \\\n"
            "pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;}";
        compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
            "{"
            "pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
            "pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
            "pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
    }

348
    // Create utilities objects.
349

350
351
    bonded = new CudaBondedUtilities(*this);
    nonbonded = new CudaNonbondedUtilities(*this);
352
353
    integration = new CudaIntegrationUtilities(*this, system);
    expression = new CudaExpressionUtilities(*this);
354
355
356
}

CudaContext::~CudaContext() {
357
    pushAsCurrent();
peastman's avatar
peastman committed
358
359
360
361
362
363
364
365
    for (auto force : forces)
        delete force;
    for (auto listener : reorderListeners)
        delete listener;
    for (auto computation : preComputations)
        delete computation;
    for (auto computation : postComputations)
        delete computation;
366
367
    if (pinnedBuffer != NULL)
        cuMemFreeHost(pinnedBuffer);
368
369
370
371
    if (integration != NULL)
        delete integration;
    if (expression != NULL)
        delete expression;
372
373
    if (bonded != NULL)
        delete bonded;
374
375
    if (nonbonded != NULL)
        delete nonbonded;
376
377
    if (contextIsValid && !isLinkedContext)
        cuProfilerStop();
378
    popAsCurrent();
379
    string errorMessage = "Error deleting Context";
380
    if (contextIsValid && !isLinkedContext)
381
        cuCtxDestroy(context);
382
    contextIsValid = false;
383
384
}

385
void CudaContext::initialize() {
386
    ContextSelector selector(*this);
387
    string errorMessage = "Error initializing Context";
388
    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
389
390
    int multiprocessors;
    CHECK_RESULT2(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device), "Error checking GPU properties");
391
    if (useDoublePrecision) {
Peter Eastman's avatar
Peter Eastman committed
392
        energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
393
        energySum.initialize<double>(*this, multiprocessors, "energySum");
394
395
396
397
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else if (useMixedPrecision) {
Peter Eastman's avatar
Peter Eastman committed
398
        energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
399
        energySum.initialize<double>(*this, multiprocessors, "energySum");
400
401
402
403
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else {
Peter Eastman's avatar
Peter Eastman committed
404
        energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer");
405
        energySum.initialize<float>(*this, multiprocessors, "energySum");
406
407
408
        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
    }
409
410
    for (int i = 0; i < numAtoms; i++) {
        double mass = system.getParticleMass(i);
411
        if (useDoublePrecision || useMixedPrecision)
412
413
414
415
            ((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
        else
            ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
    }
Peter Eastman's avatar
Peter Eastman committed
416
    velm.upload(pinnedBuffer);
417
    bonded->initialize(system);
Peter Eastman's avatar
Peter Eastman committed
418
419
    addAutoclearBuffer(force.getDevicePointer(), force.getSize()*force.getElementSize());
    addAutoclearBuffer(energyBuffer.getDevicePointer(), energyBuffer.getSize()*energyBuffer.getElementSize());
420
421
422
    int numEnergyParamDerivs = energyParamDerivNames.size();
    if (numEnergyParamDerivs > 0) {
        if (useDoublePrecision || useMixedPrecision)
Peter Eastman's avatar
Peter Eastman committed
423
            energyParamDerivBuffer.initialize<double>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
424
        else
Peter Eastman's avatar
Peter Eastman committed
425
426
            energyParamDerivBuffer.initialize<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
        addAutoclearBuffer(energyParamDerivBuffer);
427
    }
428
    findMoleculeGroups();
429
    nonbonded->initialize(system);
430
}
431

432
433
void CudaContext::initializeContexts() {
    getPlatformData().initializeContexts(system);
434
435
}

436
437
438
439
440
void CudaContext::setAsCurrent() {
    if (contextIsValid)
        cuCtxSetCurrent(context);
}

441
442
443
444
445
446
447
448
449
450
451
void CudaContext::pushAsCurrent() {
    if (contextIsValid)
        cuCtxPushCurrent(context);
}

void CudaContext::popAsCurrent() {
    CUcontext popped;
    if (contextIsValid)
        cuCtxPopCurrent(&popped);
}

452
453
454
455
456
CUmodule CudaContext::createModule(const string source, const char* optimizationFlags) {
    return createModule(source, map<string, string>(), optimizationFlags);
}

CUmodule CudaContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
457
    string bits = intToString(8*sizeof(void*));
458
459
460
461
    string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
    stringstream src;
    if (!options.empty())
        src << "// Compilation Options: " << options << endl << endl;
peastman's avatar
peastman committed
462
    for (auto& pair : compilationDefines) {
463
464
465
466
467
468
469
        // Query defines to avoid duplicate variables
        if (defines.find(pair.first) == defines.end()) {
            src << "#define " << pair.first;
            if (!pair.second.empty())
                src << " " << pair.second;
            src << endl;
        }
470
471
472
    }
    if (!compilationDefines.empty())
        src << endl;
473
474
475
476
477
478
479
480
481
482
483
484
    if (useDoublePrecision) {
        src << "typedef double real;\n";
        src << "typedef double2 real2;\n";
        src << "typedef double3 real3;\n";
        src << "typedef double4 real4;\n";
    }
    else {
        src << "typedef float real;\n";
        src << "typedef float2 real2;\n";
        src << "typedef float3 real3;\n";
        src << "typedef float4 real4;\n";
    }
485
486
487
488
489
490
491
492
493
494
495
496
    if (useDoublePrecision || useMixedPrecision) {
        src << "typedef double mixed;\n";
        src << "typedef double2 mixed2;\n";
        src << "typedef double3 mixed3;\n";
        src << "typedef double4 mixed4;\n";
    }
    else {
        src << "typedef float mixed;\n";
        src << "typedef float2 mixed2;\n";
        src << "typedef float3 mixed3;\n";
        src << "typedef float4 mixed4;\n";
    }
497
    src << "typedef unsigned int tileflags;\n";
498
    src << CudaKernelSources::common << endl;
peastman's avatar
peastman committed
499
500
501
502
    for (auto& pair : defines) {
        src << "#define " << pair.first;
        if (!pair.second.empty())
            src << " " << pair.second;
503
504
505
506
507
        src << endl;
    }
    if (!defines.empty())
        src << endl;
    src << source << endl;
508
509
    
    // Determine what architecture to compile for.
510
511
512
513

    int maxCompilerArchitecture;
#if CUDA_VERSION < 11020
    // CUDA versions before 11.2 can't query the compiler to see what it supports.
514
    
515
516
517
518
519
520
521
522
523
    maxCompilerArchitecture = 75;
#else
    int numArchs;
    CHECK_NVRTC_RESULT(nvrtcGetNumSupportedArchs(&numArchs), "Error querying supported architectures");
    vector<int> archs(numArchs);
    CHECK_NVRTC_RESULT(nvrtcGetSupportedArchs(archs.data()), "Error querying supported architectures");
    maxCompilerArchitecture = archs.back();
#endif
    string compileArchitecture = intToString(min(gpuArchitecture, maxCompilerArchitecture));
524

525
    // See whether we already have PTX for this kernel cached.
526

527
528
529
530
531
532
    CSHA1 sha1;
    sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
    sha1.Final();
    UINT_8 hash[20];
    sha1.GetHash(hash);
    stringstream cacheFile;
533
    cacheFile << cacheDir;
534
535
536
    cacheFile.flags(ios::hex);
    for (int i = 0; i < 20; i++)
        cacheFile << setw(2) << setfill('0') << (int) hash[i];
537
    cacheFile << '_' << compileArchitecture << '_' << bits;
538
539
540
    CUmodule module;
    if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
        return module;
541

542
    // Select a name for the output file.
543

544
545
    stringstream tempFileName;
    tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
546
547
548
549
550
#ifdef WIN32
    tempFileName << "_" << GetCurrentProcessId();
#else
    tempFileName << "_" << getpid();
#endif
551
    string outputFile = (tempDir+tempFileName.str()+".ptx");
552

553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
    // Split the command line flags into an array of options.
    
    string flags = "-arch=compute_"+compileArchitecture+" "+options;
    stringstream flagsStream(flags);
    string flag;
    vector<string> splitFlags;
    while (flagsStream >> flag)
        splitFlags.push_back(flag);
    int numOptions = splitFlags.size();
    vector<const char*> optionsVec(numOptions);
    for (int i = 0; i < numOptions; i++)
        optionsVec[i] = &splitFlags[i][0];
    
    // Compile the program to PTX.
    
    nvrtcProgram program;
    CHECK_NVRTC_RESULT(nvrtcCreateProgram(&program, src.str().c_str(), NULL, 0, NULL, NULL), "Error creating program");
    try {
        nvrtcResult result = nvrtcCompileProgram(program, optionsVec.size(), &optionsVec[0]);
        if (result != NVRTC_SUCCESS) {
            size_t logSize;
            nvrtcGetProgramLogSize(program, &logSize);
            vector<char> log(logSize);
            nvrtcGetProgramLog(program, &log[0]);
            throw OpenMMException("Error compiling program: "+string(&log[0]));
        }
        size_t ptxSize;
        nvrtcGetPTXSize(program, &ptxSize);
        vector<char> ptx(ptxSize);
        nvrtcGetPTX(program, &ptx[0]);
        nvrtcDestroyProgram(&program);
584

585
        // If possible, write the PTX out to a temporary file so we can cache it for later use.
586

587
        bool wroteCache = false;
588
589
        try {
            ofstream out(outputFile.c_str());
590
            out << string(&ptx[0]);
591
            out.close();
592
593
            if (!out.fail())
                wroteCache = true;
594
595
        }
        catch (...) {
596
597
598
599
            // Ignore.
        }
        if (!wroteCache) {
            // An error occurred.  Possibly we don't have permission to write to the temp directory.  Just try to load the module directly.
600

601
602
603
604
            CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
            return module;
        }
    }
605
606
607
    catch (...) {
        nvrtcDestroyProgram(&program);
        throw;
608
    }
609
610
611
612
    try {
        CUresult result = cuModuleLoad(&module, outputFile.c_str());
        if (result != CUDA_SUCCESS) {
            std::stringstream m;
613
            m<<"Error loading CUDA module: "<<getErrorString(result)<<" ("<<result<<")";
614
615
            throw OpenMMException(m.str());
        }
616
617
        if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0)
            remove(outputFile.c_str());
618
619
620
621
622
623
624
        return module;
    }
    catch (...) {
        remove(outputFile.c_str());
        throw;
    }
}
625
626
627
628
629
630
631
632
633
634
635
636

CUfunction CudaContext::getKernel(CUmodule& module, const string& name) {
    CUfunction function;
    CUresult result = cuModuleGetFunction(&function, module, name.c_str());
    if (result != CUDA_SUCCESS) {
        std::stringstream m;
        m<<"Error creating kernel "<<name<<": "<<getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(m.str());
    }
    return function;
}

637
638
639
640
641
642
643
644
645
646
647
648
CUstream CudaContext::getCurrentStream() {
    return currentStream;
}

void CudaContext::setCurrentStream(CUstream stream) {
    currentStream = stream;
}

void CudaContext::restoreDefaultStream() {
    setCurrentStream(0);
}

649
650
CudaArray* CudaContext::createArray() {
    return new CudaArray();
651
652
}

653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
ComputeEvent CudaContext::createEvent() {
    return shared_ptr<ComputeEventImpl>(new CudaEvent(*this));
}

ComputeProgram CudaContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) {
    CUmodule module = createModule(CudaKernelSources::vectorOps+source, defines);
    return shared_ptr<ComputeProgramImpl>(new CudaProgram(*this, module));
}

CudaArray& CudaContext::unwrap(ArrayInterface& array) const {
    CudaArray* cuarray;
    ComputeArray* wrapper = dynamic_cast<ComputeArray*>(&array);
    if (wrapper != NULL)
        cuarray = dynamic_cast<CudaArray*>(&wrapper->getArray());
    else
        cuarray = dynamic_cast<CudaArray*>(&array);
    if (cuarray == NULL)
        throw OpenMMException("Array argument is not an CudaArray");
    return *cuarray;
672
673
674
}

std::string CudaContext::getErrorString(CUresult result) {
Peter Eastman's avatar
Peter Eastman committed
675
676
677
    const char* message;
    if (cuGetErrorName(result, &message) == CUDA_SUCCESS)
        return string(message);
peastman's avatar
peastman committed
678
    return "CUDA error";
679
680
681
682
683
684
}

void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
    if (blockSize == -1)
        blockSize = ThreadBlockSize;
    int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks);
685
    CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
686
687
688
689
690
691
692
    if (result != CUDA_SUCCESS) {
        stringstream str;
        str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(str.str());
    }
}

693
694
695
int CudaContext::computeThreadBlockSize(double memory) const {
    int maxShared;
    CHECK_RESULT2(cuDeviceGetAttribute(&maxShared, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, device), "Error querying device property");
696
697
698
699
700
701
702
703
704
    int max = (int) (maxShared/memory);
    if (max < 64)
        return 32;
    int threads = 64;
    while (threads+64 < max)
        threads += 64;
    return threads;
}

705
706
void CudaContext::clearBuffer(ArrayInterface& array) {
    clearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
707
708
709
}

void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
710
711
    int words = size/4;
    void* args[] = {&memory, &words};
Peter Eastman's avatar
Peter Eastman committed
712
    executeKernel(clearBufferKernel, args, words, 128);
713
714
}

715
716
void CudaContext::addAutoclearBuffer(ArrayInterface& array) {
    addAutoclearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
717
718
}

719
720
void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
    autoclearBuffers.push_back(memory);
721
    autoclearBufferSizes.push_back(size/4);
722
723
}

724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
void CudaContext::clearAutoclearBuffers() {
    int base = 0;
    int total = autoclearBufferSizes.size();
    while (total-base >= 6) {
        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
                        &autoclearBuffers[base+4], &autoclearBufferSizes[base+4],
                        &autoclearBuffers[base+5], &autoclearBufferSizes[base+5]};
        executeKernel(clearSixBuffersKernel, args, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
        base += 6;
    }
    if (total-base == 5) {
        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
                        &autoclearBuffers[base+4], &autoclearBufferSizes[base+4]};
        executeKernel(clearFiveBuffersKernel, args, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
    }
    else if (total-base == 4) {
        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3]};
        executeKernel(clearFourBuffersKernel, args, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
    }
    else if (total-base == 3) {
        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2]};
        executeKernel(clearThreeBuffersKernel, args, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
    }
    else if (total-base == 2) {
        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1]};
        executeKernel(clearTwoBuffersKernel, args, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
    }
    else if (total-base == 1) {
        clearBuffer(autoclearBuffers[base], autoclearBufferSizes[base]*4);
    }
}
767

Peter Eastman's avatar
Peter Eastman committed
768
double CudaContext::reduceEnergy() {
Peter Eastman's avatar
Peter Eastman committed
769
    int bufferSize = energyBuffer.getSize();
Peter Eastman's avatar
Peter Eastman committed
770
    int workGroupSize  = 512;
Peter Eastman's avatar
Peter Eastman committed
771
    void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
772
    executeKernel(reduceEnergyKernel, args, workGroupSize*energySum.getSize(), workGroupSize, workGroupSize*energyBuffer.getElementSize());
Peter Eastman's avatar
Peter Eastman committed
773
    energySum.download(pinnedBuffer);
774
775
776
777
778
779
780
781
782
783
    double result = 0;
    if (getUseDoublePrecision() || getUseMixedPrecision()) {
        for (int i = 0; i < energySum.getSize(); i++)
            result += ((double*) pinnedBuffer)[i];
    }
    else {
        for (int i = 0; i < energySum.getSize(); i++)
            result += ((float*) pinnedBuffer)[i];
    }
    return result;
Peter Eastman's avatar
Peter Eastman committed
784
785
}

786
void CudaContext::setCharges(const vector<double>& charges) {
Peter Eastman's avatar
Peter Eastman committed
787
788
    if (!chargeBuffer.isInitialized())
        chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
Peter Eastman's avatar
Peter Eastman committed
789
790
791
792
    vector<double> c(numAtoms);
    for (int i = 0; i < numAtoms; i++)
        c[i] = charges[i];
    chargeBuffer.upload(c, true);
Peter Eastman's avatar
Peter Eastman committed
793
    void* args[] = {&chargeBuffer.getDevicePointer(), &posq.getDevicePointer(), &atomIndexDevice.getDevicePointer(), &numAtoms};
794
795
796
    executeKernel(setChargesKernel, args, numAtoms);
}

797
798
799
800
801
802
bool CudaContext::requestPosqCharges() {
    bool allow = !hasAssignedPosqCharges;
    hasAssignedPosqCharges = true;
    return allow;
}

803
804
805
806
807
808
809
810
811
void CudaContext::addEnergyParameterDerivative(const string& param) {
    // See if this parameter has already been registered.
    
    for (int i = 0; i < energyParamDerivNames.size(); i++)
        if (param == energyParamDerivNames[i])
            return;
    energyParamDerivNames.push_back(param);
}

812
813
void CudaContext::flushQueue() {
    cuStreamSynchronize(getCurrentStream());
814
815
}

816
817
818
819
820
821
822
823
824
825
vector<int> CudaContext::getDevicePrecedence() {
    int numDevices;
    CUdevice thisDevice;
    string errorMessage = "Error initializing Context";
    vector<pair<pair<int, int>, int> > devices;

    CHECK_RESULT(cuDeviceGetCount(&numDevices));
    for (int i = 0; i < numDevices; i++) {
        CHECK_RESULT(cuDeviceGet(&thisDevice, i));
        int major, minor, clock, multiprocessors, speed;
826
827
        CHECK_RESULT(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, thisDevice));
        CHECK_RESULT(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, thisDevice));
828
829
830
        if (major == 1 && minor < 2)
            continue;

Robert T. McGibbon's avatar
Robert T. McGibbon committed
831
        if ((useDoublePrecision || useMixedPrecision) && (major+0.1*minor < 1.3))
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
            continue;

        CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
        CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
        speed = clock*multiprocessors;
        pair<int, int> deviceProperties = std::make_pair(major, speed);
        devices.push_back(std::make_pair(deviceProperties, -i));
    }

    // sort first by compute capability (higher is better), then speed
    // (higher is better), and finally device index (lower is better)
    std::sort(devices.begin(), devices.end());
    std::reverse(devices.begin(), devices.end());

    vector<int> precedence;
    for (int i = 0; i < static_cast<int>(devices.size()); i++) {
        precedence.push_back(-devices[i].second);
    }

    return precedence;
}
853
854
855
856
857
858
859

unsigned int CudaContext::getEventFlags() {
    unsigned int flags = CU_EVENT_DISABLE_TIMING;
    if (useBlockingSync)
        flags += CU_EVENT_BLOCKING_SYNC;
    return flags;
}