CudaContext.cpp 36.1 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2009-2023 Stanford University and the Authors.      *
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

#ifdef WIN32
  #define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include <cmath>
#include "CudaContext.h"
#include "CudaArray.h"
33
#include "CudaBondedUtilities.h"
34
#include "CudaEvent.h"
35
#include "CudaIntegrationUtilities.h"
36
#include "CudaKernels.h"
37
#include "CudaKernelSources.h"
38
#include "CudaNonbondedUtilities.h"
39
40
#include "CudaProgram.h"
#include "openmm/common/ComputeArray.h"
41
#include "openmm/common/ContextSelector.h"
42
#include "SHA1.h"
43
44
45
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
46
#include "CudaExpressionUtilities.h"
47
#include "openmm/internal/ContextImpl.h"
48
49
50
#include <algorithm>
#include <cstdlib>
#include <fstream>
51
#include <iomanip>
52
#include <iostream>
53
#include <set>
54
55
#include <sstream>
#include <typeinfo>
56
#include <sys/stat.h>
57
#include <cudaProfiler.h>
58
#include <nvrtc.h>
59
60
61
#ifndef WIN32
  #include <unistd.h>
#endif
peastman's avatar
peastman committed
62

63
64
65
66
67

#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
#define CHECK_RESULT2(result, prefix) \
    if (result != CUDA_SUCCESS) { \
        std::stringstream m; \
68
        m<<prefix<<": "<<getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
69
70
        throw OpenMMException(m.str());\
    }
71
72
73
74
75
76
#define CHECK_NVRTC_RESULT(result, prefix) \
    if (result != NVRTC_SUCCESS) { \
        stringstream m; \
        m<<prefix<<": "<<nvrtcGetErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
        throw OpenMMException(m.str());\
    }
77
78
79
80
81

using namespace OpenMM;
using namespace std;

const int CudaContext::ThreadBlockSize = 64;
82
const int CudaContext::TileSize = sizeof(tileflags)*8;
83
84
bool CudaContext::hasInitializedCuda = false;

85
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& tempDir, CudaPlatform::PlatformData& platformData,
86
        CudaContext* originalContext) : ComputeContext(system), currentStream(0), platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
87
        pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), useBlockingSync(useBlockingSync) {
88
89
    int cudaDriverVersion;
    cuDriverGetVersion(&cudaDriverVersion);
90
91
92
93
94
95
    if (!hasInitializedCuda) {
        CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
        hasInitializedCuda = true;
    }
    if (precision == "single") {
        useDoublePrecision = false;
96
        useMixedPrecision = false;
97
98
99
    }
    else if (precision == "mixed") {
        useDoublePrecision = false;
100
        useMixedPrecision = true;
101
102
103
    }
    else if (precision == "double") {
        useDoublePrecision = true;
104
        useMixedPrecision = false;
105
106
    }
    else
107
        throw OpenMMException("Illegal value for Precision: "+precision);
108
109
    char* cacheVariable = getenv("OPENMM_CACHE_DIR");
    cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
110
#ifdef WIN32
111
    this->tempDir = tempDir+"\\";
112
    cacheDir = cacheDir+"\\";
113
114
#else
    this->tempDir = tempDir+"/";
115
    cacheDir = cacheDir+"/";
116
117
118
#endif
    contextIndex = platformData.contexts.size();
    string errorMessage = "Error initializing Context";
119
120
121
122
123
124
    if (originalContext == NULL) {
        isLinkedContext = false;
        int numDevices;
        CHECK_RESULT(cuDeviceGetCount(&numDevices));
        if (deviceIndex < -1 || deviceIndex >= numDevices)
            throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
125

126
127
128
129
130
131
        vector<int> devicePrecedence;
        if (deviceIndex == -1) {
            devicePrecedence = getDevicePrecedence();
        } else {
            devicePrecedence.push_back(deviceIndex);
        }
132

133
134
135
136
137
138
139
140
141
142
        this->deviceIndex = -1;
        for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
            int trialDeviceIndex = devicePrecedence[i];
            CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
            defaultOptimizationOptions = "--use_fast_math";
            unsigned int flags = CU_CTX_MAP_HOST;
            if (useBlockingSync)
                flags += CU_CTX_SCHED_BLOCKING_SYNC;
            else
                flags += CU_CTX_SCHED_SPIN;
143

144
145
            if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
                this->deviceIndex = trialDeviceIndex;
146
147
                CUcontext popped;
                cuCtxPopCurrent(&popped);
148
149
                break;
            }
150
        }
151
        if (this->deviceIndex == -1) {
152
153
154
155
            if (deviceIndex != -1)
                throw OpenMMException("The requested CUDA device could not be loaded");
            else
                throw OpenMMException("No compatible CUDA device is available");
156
        }
157
158
159
160
161
162
    }
    else {
        isLinkedContext = true;
        context = originalContext->context;
        this->deviceIndex = originalContext->deviceIndex;
        this->device = originalContext->device;
163
    }
164

165
    int major, minor;
166
167
    CHECK_RESULT(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
    CHECK_RESULT(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
peastman's avatar
peastman committed
168
    int numThreadBlocksPerComputeUnit = (major == 6 ? 4 : 6);
169
    if (cudaDriverVersion < 7000) {
170
171
172
173
174
        // This is a workaround to support GTX 980 with CUDA 6.5.  It reports
        // its compute capability as 5.2, but the compiler doesn't support
        // anything beyond 5.0.
        if (major == 5)
            minor = 0;
175
176
    }
    if (cudaDriverVersion < 8000) {
177
178
179
180
181
182
183
        // This is a workaround to support Pascal with CUDA 7.5.  It reports
        // its compute capability as 6.x, but the compiler doesn't support
        // anything beyond 5.3.
        if (major == 6) {
            major = 5;
            minor = 3;
        }
184
    }
185
    gpuArchitecture = 10*major+minor;
186
    computeCapability = major+0.1*minor;
187

188
    contextIsValid = true;
189
    ContextSelector selector(*this);
190
    CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
root's avatar
root committed
191
192
193
194
    if (contextIndex > 0) {
        int canAccess;
        cuDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice());
        if (canAccess) {
195
196
197
198
            {
                ContextSelector selector2(*platformData.contexts[0]);
                CHECK_RESULT(cuCtxEnablePeerAccess(getContext(), 0));
            }
root's avatar
root committed
199
200
201
            CHECK_RESULT(cuCtxEnablePeerAccess(platformData.contexts[0]->getContext(), 0));
        }
    }
202
203
204
205
206
207
    numAtoms = system.getNumParticles();
    paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
    numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
    int multiprocessors;
    CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
    numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
208
    if (cudaDriverVersion >= 9000) {
Peter Eastman's avatar
Peter Eastman committed
209
210
        compilationDefines["SYNC_WARPS"] = "__syncwarp();";
        compilationDefines["SHFL(var, srcLane)"] = "__shfl_sync(0xffffffff, var, srcLane);";
211
        compilationDefines["BALLOT(var)"] = "__ballot_sync(0xffffffff, var);";
Peter Eastman's avatar
Peter Eastman committed
212
213
214
215
    }
    else {
        compilationDefines["SYNC_WARPS"] = "";
        compilationDefines["SHFL(var, srcLane)"] = "__shfl(var, srcLane);";
216
        compilationDefines["BALLOT(var)"] = "__ballot(var);";
Peter Eastman's avatar
Peter Eastman committed
217
    }
218
    if (useDoublePrecision) {
Peter Eastman's avatar
Peter Eastman committed
219
220
        posq.initialize<double4>(*this, paddedNumAtoms, "posq");
        velm.initialize<double4>(*this, paddedNumAtoms, "velm");
221
        compilationDefines["USE_DOUBLE_PRECISION"] = "1";
222
223
224
        compilationDefines["make_real2"] = "make_double2";
        compilationDefines["make_real3"] = "make_double3";
        compilationDefines["make_real4"] = "make_double4";
225
226
227
        compilationDefines["make_mixed2"] = "make_double2";
        compilationDefines["make_mixed3"] = "make_double3";
        compilationDefines["make_mixed4"] = "make_double4";
228
    }
229
    else if (useMixedPrecision) {
Peter Eastman's avatar
Peter Eastman committed
230
231
232
        posq.initialize<float4>(*this, paddedNumAtoms, "posq");
        posqCorrection.initialize<float4>(*this, paddedNumAtoms, "posqCorrection");
        velm.initialize<double4>(*this, paddedNumAtoms, "velm");
233
234
235
236
237
238
239
240
        compilationDefines["USE_MIXED_PRECISION"] = "1";
        compilationDefines["make_real2"] = "make_float2";
        compilationDefines["make_real3"] = "make_float3";
        compilationDefines["make_real4"] = "make_float4";
        compilationDefines["make_mixed2"] = "make_double2";
        compilationDefines["make_mixed3"] = "make_double3";
        compilationDefines["make_mixed4"] = "make_double4";
    }
241
    else {
Peter Eastman's avatar
Peter Eastman committed
242
243
        posq.initialize<float4>(*this, paddedNumAtoms, "posq");
        velm.initialize<float4>(*this, paddedNumAtoms, "velm");
244
245
246
        compilationDefines["make_real2"] = "make_float2";
        compilationDefines["make_real3"] = "make_float3";
        compilationDefines["make_real4"] = "make_float4";
247
248
249
        compilationDefines["make_mixed2"] = "make_float2";
        compilationDefines["make_mixed3"] = "make_float3";
        compilationDefines["make_mixed4"] = "make_float4";
250
    }
251
252
    force.initialize<long long>(*this, paddedNumAtoms*3, "force");
    posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
253
254
255
256
257
    atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
    atomIndex.resize(paddedNumAtoms);
    for (int i = 0; i < paddedNumAtoms; ++i)
        atomIndex[i] = i;
    atomIndexDevice.upload(atomIndex);
258
259
260
261

    // Create utility kernels that are used in multiple places.

    CUmodule utilities = createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
262
263
264
265
266
267
    clearBufferKernel = getKernel(utilities, "clearBuffer");
    clearTwoBuffersKernel = getKernel(utilities, "clearTwoBuffers");
    clearThreeBuffersKernel = getKernel(utilities, "clearThreeBuffers");
    clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
    clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
    clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
Peter Eastman's avatar
Peter Eastman committed
268
    reduceEnergyKernel = getKernel(utilities, "reduceEnergy");
269
    setChargesKernel = getKernel(utilities, "setCharges");
270
271
272
273
274
275
276
277

    // Set defines based on the requested precision.

    compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "sqrtf";
    compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "rsqrtf";
    compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
    compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
    compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
278
    compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf";
279
280
281
282
283
284
    compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
    compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
    compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf";
    compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
    compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
    compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
285
286
    compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
    compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
287

288
    // Set defines for applying periodic boundary conditions.
289

290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
    Vec3 boxVectors[3];
    system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
    boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
                      boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 ||
                      boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0);
    if (boxIsTriclinic) {
        compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
            "{"
            "real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f); \\\n"
            "delta.x -= scale3*periodicBoxVecZ.x; \\\n"
            "delta.y -= scale3*periodicBoxVecZ.y; \\\n"
            "delta.z -= scale3*periodicBoxVecZ.z; \\\n"
            "real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f); \\\n"
            "delta.x -= scale2*periodicBoxVecY.x; \\\n"
            "delta.y -= scale2*periodicBoxVecY.y; \\\n"
            "real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f); \\\n"
            "delta.x -= scale1*periodicBoxVecX.x;}";
        compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
            "{"
            "real scale3 = floor(pos.z*invPeriodicBoxSize.z); \\\n"
            "pos.x -= scale3*periodicBoxVecZ.x; \\\n"
            "pos.y -= scale3*periodicBoxVecZ.y; \\\n"
            "pos.z -= scale3*periodicBoxVecZ.z; \\\n"
            "real scale2 = floor(pos.y*invPeriodicBoxSize.y); \\\n"
            "pos.x -= scale2*periodicBoxVecY.x; \\\n"
            "pos.y -= scale2*periodicBoxVecY.y; \\\n"
            "real scale1 = floor(pos.x*invPeriodicBoxSize.x); \\\n"
            "pos.x -= scale1*periodicBoxVecX.x;}";
        compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
            "{"
            "real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f); \\\n"
            "pos.x -= scale3*periodicBoxVecZ.x; \\\n"
            "pos.y -= scale3*periodicBoxVecZ.y; \\\n"
            "pos.z -= scale3*periodicBoxVecZ.z; \\\n"
            "real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f); \\\n"
            "pos.x -= scale2*periodicBoxVecY.x; \\\n"
            "pos.y -= scale2*periodicBoxVecY.y; \\\n"
            "real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f); \\\n"
            "pos.x -= scale1*periodicBoxVecX.x;}";
    }
    else {
        compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
            "{"
            "delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
            "delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
            "delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
        compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
            "{"
            "pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; \\\n"
            "pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; \\\n"
            "pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;}";
        compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
            "{"
            "pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
            "pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
            "pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
    }

348
    // Create utilities objects.
349

350
351
    bonded = new CudaBondedUtilities(*this);
    nonbonded = new CudaNonbondedUtilities(*this);
352
353
    integration = new CudaIntegrationUtilities(*this, system);
    expression = new CudaExpressionUtilities(*this);
354
355
356
}

CudaContext::~CudaContext() {
357
    pushAsCurrent();
peastman's avatar
peastman committed
358
359
360
361
362
363
364
365
    for (auto force : forces)
        delete force;
    for (auto listener : reorderListeners)
        delete listener;
    for (auto computation : preComputations)
        delete computation;
    for (auto computation : postComputations)
        delete computation;
366
367
    if (pinnedBuffer != NULL)
        cuMemFreeHost(pinnedBuffer);
368
369
370
371
    if (integration != NULL)
        delete integration;
    if (expression != NULL)
        delete expression;
372
373
    if (bonded != NULL)
        delete bonded;
374
375
    if (nonbonded != NULL)
        delete nonbonded;
376
377
    if (contextIsValid && !isLinkedContext)
        cuProfilerStop();
378
    popAsCurrent();
379
    string errorMessage = "Error deleting Context";
380
    if (contextIsValid && !isLinkedContext)
381
        cuCtxDestroy(context);
382
    contextIsValid = false;
383
384
}

385
void CudaContext::initialize() {
386
    ContextSelector selector(*this);
387
    string errorMessage = "Error initializing Context";
388
    int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
389
390
    int multiprocessors;
    CHECK_RESULT2(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device), "Error checking GPU properties");
391
    if (useDoublePrecision) {
Peter Eastman's avatar
Peter Eastman committed
392
        energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
393
        energySum.initialize<double>(*this, multiprocessors, "energySum");
394
395
396
397
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else if (useMixedPrecision) {
Peter Eastman's avatar
Peter Eastman committed
398
        energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
399
        energySum.initialize<double>(*this, multiprocessors, "energySum");
400
401
402
403
        int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
    }
    else {
Peter Eastman's avatar
Peter Eastman committed
404
        energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer");
405
        energySum.initialize<float>(*this, multiprocessors, "energySum");
406
407
408
        int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
        CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
    }
409
410
    for (int i = 0; i < numAtoms; i++) {
        double mass = system.getParticleMass(i);
411
        if (useDoublePrecision || useMixedPrecision)
412
413
414
415
            ((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
        else
            ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
    }
Peter Eastman's avatar
Peter Eastman committed
416
    velm.upload(pinnedBuffer);
417
    bonded->initialize(system);
Peter Eastman's avatar
Peter Eastman committed
418
419
    addAutoclearBuffer(force.getDevicePointer(), force.getSize()*force.getElementSize());
    addAutoclearBuffer(energyBuffer.getDevicePointer(), energyBuffer.getSize()*energyBuffer.getElementSize());
420
421
422
    int numEnergyParamDerivs = energyParamDerivNames.size();
    if (numEnergyParamDerivs > 0) {
        if (useDoublePrecision || useMixedPrecision)
Peter Eastman's avatar
Peter Eastman committed
423
            energyParamDerivBuffer.initialize<double>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
424
        else
Peter Eastman's avatar
Peter Eastman committed
425
426
            energyParamDerivBuffer.initialize<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
        addAutoclearBuffer(energyParamDerivBuffer);
427
    }
428
    findMoleculeGroups();
429
    nonbonded->initialize(system);
430
}
431

432
433
void CudaContext::initializeContexts() {
    getPlatformData().initializeContexts(system);
434
435
}

436
437
438
439
440
void CudaContext::setAsCurrent() {
    if (contextIsValid)
        cuCtxSetCurrent(context);
}

441
442
443
444
445
446
447
448
449
450
451
void CudaContext::pushAsCurrent() {
    if (contextIsValid)
        cuCtxPushCurrent(context);
}

void CudaContext::popAsCurrent() {
    CUcontext popped;
    if (contextIsValid)
        cuCtxPopCurrent(&popped);
}

452
453
454
455
456
CUmodule CudaContext::createModule(const string source, const char* optimizationFlags) {
    return createModule(source, map<string, string>(), optimizationFlags);
}

CUmodule CudaContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
457
    string bits = intToString(8*sizeof(void*));
458
459
460
461
    string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
    stringstream src;
    if (!options.empty())
        src << "// Compilation Options: " << options << endl << endl;
peastman's avatar
peastman committed
462
    for (auto& pair : compilationDefines) {
463
464
465
466
467
468
469
        // Query defines to avoid duplicate variables
        if (defines.find(pair.first) == defines.end()) {
            src << "#define " << pair.first;
            if (!pair.second.empty())
                src << " " << pair.second;
            src << endl;
        }
470
471
472
    }
    if (!compilationDefines.empty())
        src << endl;
473
474
475
476
477
478
479
480
481
482
483
484
    if (useDoublePrecision) {
        src << "typedef double real;\n";
        src << "typedef double2 real2;\n";
        src << "typedef double3 real3;\n";
        src << "typedef double4 real4;\n";
    }
    else {
        src << "typedef float real;\n";
        src << "typedef float2 real2;\n";
        src << "typedef float3 real3;\n";
        src << "typedef float4 real4;\n";
    }
485
486
487
488
489
490
491
492
493
494
495
496
    if (useDoublePrecision || useMixedPrecision) {
        src << "typedef double mixed;\n";
        src << "typedef double2 mixed2;\n";
        src << "typedef double3 mixed3;\n";
        src << "typedef double4 mixed4;\n";
    }
    else {
        src << "typedef float mixed;\n";
        src << "typedef float2 mixed2;\n";
        src << "typedef float3 mixed3;\n";
        src << "typedef float4 mixed4;\n";
    }
497
    src << "typedef unsigned int tileflags;\n";
498
    src << CudaKernelSources::common << endl;
peastman's avatar
peastman committed
499
500
501
502
    for (auto& pair : defines) {
        src << "#define " << pair.first;
        if (!pair.second.empty())
            src << " " << pair.second;
503
504
505
506
507
        src << endl;
    }
    if (!defines.empty())
        src << endl;
    src << source << endl;
508
509
    
    // Determine what architecture to compile for.
510
511
512
513

    int maxCompilerArchitecture;
#if CUDA_VERSION < 11020
    // CUDA versions before 11.2 can't query the compiler to see what it supports.
514
    
515
516
517
518
519
520
521
522
523
    maxCompilerArchitecture = 75;
#else
    int numArchs;
    CHECK_NVRTC_RESULT(nvrtcGetNumSupportedArchs(&numArchs), "Error querying supported architectures");
    vector<int> archs(numArchs);
    CHECK_NVRTC_RESULT(nvrtcGetSupportedArchs(archs.data()), "Error querying supported architectures");
    maxCompilerArchitecture = archs.back();
#endif
    string compileArchitecture = intToString(min(gpuArchitecture, maxCompilerArchitecture));
524

525
    // See whether we already have PTX for this kernel cached.
526

527
528
529
530
531
532
    CSHA1 sha1;
    sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
    sha1.Final();
    UINT_8 hash[20];
    sha1.GetHash(hash);
    stringstream cacheFile;
533
    cacheFile << cacheDir;
534
535
536
    cacheFile.flags(ios::hex);
    for (int i = 0; i < 20; i++)
        cacheFile << setw(2) << setfill('0') << (int) hash[i];
537
    cacheFile << '_' << compileArchitecture << '_' << bits;
538
539
540
    CUmodule module;
    if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
        return module;
541

542
    // Select a name for the output file.
543

544
545
    stringstream tempFileName;
    tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
546
547
548
549
550
#ifdef WIN32
    tempFileName << "_" << GetCurrentProcessId();
#else
    tempFileName << "_" << getpid();
#endif
551
    string outputFile = (tempDir+tempFileName.str()+".ptx");
552

553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
    // Split the command line flags into an array of options.
    
    string flags = "-arch=compute_"+compileArchitecture+" "+options;
    stringstream flagsStream(flags);
    string flag;
    vector<string> splitFlags;
    while (flagsStream >> flag)
        splitFlags.push_back(flag);
    int numOptions = splitFlags.size();
    vector<const char*> optionsVec(numOptions);
    for (int i = 0; i < numOptions; i++)
        optionsVec[i] = &splitFlags[i][0];
    
    // Compile the program to PTX.
    
    nvrtcProgram program;
    CHECK_NVRTC_RESULT(nvrtcCreateProgram(&program, src.str().c_str(), NULL, 0, NULL, NULL), "Error creating program");
    try {
        nvrtcResult result = nvrtcCompileProgram(program, optionsVec.size(), &optionsVec[0]);
        if (result != NVRTC_SUCCESS) {
            size_t logSize;
            nvrtcGetProgramLogSize(program, &logSize);
            vector<char> log(logSize);
            nvrtcGetProgramLog(program, &log[0]);
            throw OpenMMException("Error compiling program: "+string(&log[0]));
        }
        size_t ptxSize;
        nvrtcGetPTXSize(program, &ptxSize);
        vector<char> ptx(ptxSize);
        nvrtcGetPTX(program, &ptx[0]);
        nvrtcDestroyProgram(&program);
584

585
        // If possible, write the PTX out to a temporary file so we can cache it for later use.
586

587
        bool wroteCache = false;
588
589
        try {
            ofstream out(outputFile.c_str());
590
            out << string(&ptx[0]);
591
            out.close();
592
593
            if (!out.fail())
                wroteCache = true;
594
595
        }
        catch (...) {
596
597
598
599
            // Ignore.
        }
        if (!wroteCache) {
            // An error occurred.  Possibly we don't have permission to write to the temp directory.  Just try to load the module directly.
600

601
602
603
604
            CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
            return module;
        }
    }
605
606
607
    catch (...) {
        nvrtcDestroyProgram(&program);
        throw;
608
    }
609
610
611
612
    try {
        CUresult result = cuModuleLoad(&module, outputFile.c_str());
        if (result != CUDA_SUCCESS) {
            std::stringstream m;
613
            m<<"Error loading CUDA module: "<<getErrorString(result)<<" ("<<result<<")";
614
615
            throw OpenMMException(m.str());
        }
616
617
        if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0)
            remove(outputFile.c_str());
618
619
620
621
622
623
624
        return module;
    }
    catch (...) {
        remove(outputFile.c_str());
        throw;
    }
}
625
626
627
628
629
630
631
632
633
634
635
636

CUfunction CudaContext::getKernel(CUmodule& module, const string& name) {
    CUfunction function;
    CUresult result = cuModuleGetFunction(&function, module, name.c_str());
    if (result != CUDA_SUCCESS) {
        std::stringstream m;
        m<<"Error creating kernel "<<name<<": "<<getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(m.str());
    }
    return function;
}

637
638
639
640
641
642
643
vector<ComputeContext*> CudaContext::getAllContexts() {
    vector<ComputeContext*> result;
    for (CudaContext* c : platformData.contexts)
        result.push_back(c);
    return result;
}

644
645
646
647
648
649
650
651
652
653
654
655
CUstream CudaContext::getCurrentStream() {
    return currentStream;
}

void CudaContext::setCurrentStream(CUstream stream) {
    currentStream = stream;
}

void CudaContext::restoreDefaultStream() {
    setCurrentStream(0);
}

656
657
CudaArray* CudaContext::createArray() {
    return new CudaArray();
658
659
}

660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
ComputeEvent CudaContext::createEvent() {
    return shared_ptr<ComputeEventImpl>(new CudaEvent(*this));
}

ComputeProgram CudaContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) {
    CUmodule module = createModule(CudaKernelSources::vectorOps+source, defines);
    return shared_ptr<ComputeProgramImpl>(new CudaProgram(*this, module));
}

CudaArray& CudaContext::unwrap(ArrayInterface& array) const {
    CudaArray* cuarray;
    ComputeArray* wrapper = dynamic_cast<ComputeArray*>(&array);
    if (wrapper != NULL)
        cuarray = dynamic_cast<CudaArray*>(&wrapper->getArray());
    else
        cuarray = dynamic_cast<CudaArray*>(&array);
    if (cuarray == NULL)
        throw OpenMMException("Array argument is not an CudaArray");
    return *cuarray;
679
680
681
}

std::string CudaContext::getErrorString(CUresult result) {
Peter Eastman's avatar
Peter Eastman committed
682
683
684
    const char* message;
    if (cuGetErrorName(result, &message) == CUDA_SUCCESS)
        return string(message);
peastman's avatar
peastman committed
685
    return "CUDA error";
686
687
688
689
690
691
}

void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
    if (blockSize == -1)
        blockSize = ThreadBlockSize;
    int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks);
692
    CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
693
694
695
696
697
698
699
    if (result != CUDA_SUCCESS) {
        stringstream str;
        str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(str.str());
    }
}

700
701
702
int CudaContext::computeThreadBlockSize(double memory) const {
    int maxShared;
    CHECK_RESULT2(cuDeviceGetAttribute(&maxShared, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, device), "Error querying device property");
703
704
705
706
707
708
709
710
711
    int max = (int) (maxShared/memory);
    if (max < 64)
        return 32;
    int threads = 64;
    while (threads+64 < max)
        threads += 64;
    return threads;
}

712
713
void CudaContext::clearBuffer(ArrayInterface& array) {
    clearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
714
715
716
}

void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
717
718
    int words = size/4;
    void* args[] = {&memory, &words};
Peter Eastman's avatar
Peter Eastman committed
719
    executeKernel(clearBufferKernel, args, words, 128);
720
721
}

722
723
void CudaContext::addAutoclearBuffer(ArrayInterface& array) {
    addAutoclearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
724
725
}

726
727
void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
    autoclearBuffers.push_back(memory);
728
    autoclearBufferSizes.push_back(size/4);
729
730
}

731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
void CudaContext::clearAutoclearBuffers() {
    int base = 0;
    int total = autoclearBufferSizes.size();
    while (total-base >= 6) {
        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
                        &autoclearBuffers[base+4], &autoclearBufferSizes[base+4],
                        &autoclearBuffers[base+5], &autoclearBufferSizes[base+5]};
        executeKernel(clearSixBuffersKernel, args, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
        base += 6;
    }
    if (total-base == 5) {
        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
                        &autoclearBuffers[base+4], &autoclearBufferSizes[base+4]};
        executeKernel(clearFiveBuffersKernel, args, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
    }
    else if (total-base == 4) {
        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
                        &autoclearBuffers[base+3], &autoclearBufferSizes[base+3]};
        executeKernel(clearFourBuffersKernel, args, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
    }
    else if (total-base == 3) {
        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
                        &autoclearBuffers[base+2], &autoclearBufferSizes[base+2]};
        executeKernel(clearThreeBuffersKernel, args, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
    }
    else if (total-base == 2) {
        void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
                        &autoclearBuffers[base+1], &autoclearBufferSizes[base+1]};
        executeKernel(clearTwoBuffersKernel, args, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
    }
    else if (total-base == 1) {
        clearBuffer(autoclearBuffers[base], autoclearBufferSizes[base]*4);
    }
}
774

Peter Eastman's avatar
Peter Eastman committed
775
double CudaContext::reduceEnergy() {
Peter Eastman's avatar
Peter Eastman committed
776
    int bufferSize = energyBuffer.getSize();
Peter Eastman's avatar
Peter Eastman committed
777
    int workGroupSize  = 512;
Peter Eastman's avatar
Peter Eastman committed
778
    void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
779
    executeKernel(reduceEnergyKernel, args, workGroupSize*energySum.getSize(), workGroupSize, workGroupSize*energyBuffer.getElementSize());
Peter Eastman's avatar
Peter Eastman committed
780
    energySum.download(pinnedBuffer);
781
782
783
784
785
786
787
788
789
790
    double result = 0;
    if (getUseDoublePrecision() || getUseMixedPrecision()) {
        for (int i = 0; i < energySum.getSize(); i++)
            result += ((double*) pinnedBuffer)[i];
    }
    else {
        for (int i = 0; i < energySum.getSize(); i++)
            result += ((float*) pinnedBuffer)[i];
    }
    return result;
Peter Eastman's avatar
Peter Eastman committed
791
792
}

793
void CudaContext::setCharges(const vector<double>& charges) {
Peter Eastman's avatar
Peter Eastman committed
794
795
    if (!chargeBuffer.isInitialized())
        chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
Peter Eastman's avatar
Peter Eastman committed
796
797
798
799
    vector<double> c(numAtoms);
    for (int i = 0; i < numAtoms; i++)
        c[i] = charges[i];
    chargeBuffer.upload(c, true);
Peter Eastman's avatar
Peter Eastman committed
800
    void* args[] = {&chargeBuffer.getDevicePointer(), &posq.getDevicePointer(), &atomIndexDevice.getDevicePointer(), &numAtoms};
801
802
803
    executeKernel(setChargesKernel, args, numAtoms);
}

804
805
806
807
808
809
bool CudaContext::requestPosqCharges() {
    bool allow = !hasAssignedPosqCharges;
    hasAssignedPosqCharges = true;
    return allow;
}

810
811
812
813
814
815
816
817
818
void CudaContext::addEnergyParameterDerivative(const string& param) {
    // See if this parameter has already been registered.
    
    for (int i = 0; i < energyParamDerivNames.size(); i++)
        if (param == energyParamDerivNames[i])
            return;
    energyParamDerivNames.push_back(param);
}

819
820
void CudaContext::flushQueue() {
    cuStreamSynchronize(getCurrentStream());
821
822
}

823
824
825
826
827
828
829
830
831
832
vector<int> CudaContext::getDevicePrecedence() {
    int numDevices;
    CUdevice thisDevice;
    string errorMessage = "Error initializing Context";
    vector<pair<pair<int, int>, int> > devices;

    CHECK_RESULT(cuDeviceGetCount(&numDevices));
    for (int i = 0; i < numDevices; i++) {
        CHECK_RESULT(cuDeviceGet(&thisDevice, i));
        int major, minor, clock, multiprocessors, speed;
833
834
        CHECK_RESULT(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, thisDevice));
        CHECK_RESULT(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, thisDevice));
835
836
837
        if (major == 1 && minor < 2)
            continue;

Robert T. McGibbon's avatar
Robert T. McGibbon committed
838
        if ((useDoublePrecision || useMixedPrecision) && (major+0.1*minor < 1.3))
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
            continue;

        CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
        CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
        speed = clock*multiprocessors;
        pair<int, int> deviceProperties = std::make_pair(major, speed);
        devices.push_back(std::make_pair(deviceProperties, -i));
    }

    // sort first by compute capability (higher is better), then speed
    // (higher is better), and finally device index (lower is better)
    std::sort(devices.begin(), devices.end());
    std::reverse(devices.begin(), devices.end());

    vector<int> precedence;
    for (int i = 0; i < static_cast<int>(devices.size()); i++) {
        precedence.push_back(-devices[i].second);
    }

    return precedence;
}
860
861
862
863
864
865
866

unsigned int CudaContext::getEventFlags() {
    unsigned int flags = CU_EVENT_DISABLE_TIMING;
    if (useBlockingSync)
        flags += CU_EVENT_BLOCKING_SYNC;
    return flags;
}