CudaPlatform.cpp 13.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2008-2012 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

#include "CudaContext.h"
#include "CudaExpressionUtilities.h"
#include "CudaPlatform.h"
#include "CudaKernelFactory.h"
#include "CudaKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/Context.h"
#include "openmm/System.h"
#include <algorithm>
#include <cctype>
#include <sstream>
38
#include <cstdio>
39
40
41
#ifdef _MSC_VER
    #include <Windows.h>
#endif
42
43
44
using namespace OpenMM;
using namespace std;

45
46
47
48
49
50
51
#define CHECK_RESULT(result, prefix) \
    if (result != CUDA_SUCCESS) { \
        std::stringstream m; \
        m<<prefix<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
        throw OpenMMException(m.str());\
    }

52
53
54
55
56
57

#ifdef OPENMM_CUDA_BUILDING_STATIC_LIBRARY
extern "C" void registerCudaPlatform() {
    Platform::registerPlatform(new CudaPlatform());
}
#else
58
extern "C" OPENMM_EXPORT_CUDA void registerPlatforms() {
59
60
    Platform::registerPlatform(new CudaPlatform());
}
61
#endif
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93

CudaPlatform::CudaPlatform() {
    CudaKernelFactory* factory = new CudaKernelFactory();
    registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
    registerKernelFactory(UpdateStateDataKernel::Name(), factory);
    registerKernelFactory(ApplyConstraintsKernel::Name(), factory);
    registerKernelFactory(VirtualSitesKernel::Name(), factory);
    registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
    registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomAngleForceKernel::Name(), factory);
    registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
    registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
    registerKernelFactory(CalcCMAPTorsionForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomTorsionForceKernel::Name(), factory);
    registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
    registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomGBForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomExternalForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
    registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
    registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
    registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
    registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
    registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
    registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
    registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
    platformProperties.push_back(CudaDeviceIndex());
94
    platformProperties.push_back(CudaDeviceName());
95
96
    platformProperties.push_back(CudaUseBlockingSync());
    platformProperties.push_back(CudaPrecision());
97
    platformProperties.push_back(CudaUseCpuPme());
98
99
    platformProperties.push_back(CudaCompiler());
    platformProperties.push_back(CudaTempDirectory());
100
    platformProperties.push_back(CudaHostCompiler());
101
    setPropertyDefaultValue(CudaDeviceIndex(), "");
102
    setPropertyDefaultValue(CudaDeviceName(), "");
103
104
    setPropertyDefaultValue(CudaUseBlockingSync(), "true");
    setPropertyDefaultValue(CudaPrecision(), "single");
105
    setPropertyDefaultValue(CudaUseCpuPme(), "false");
106
#ifdef _MSC_VER
107
    char* bindir = getenv("CUDA_BIN_PATH");
108
    string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe");
109
    int length = GetShortPathName(nvcc.c_str(), NULL, 0);
110
111
112
113
114
    if (length > 0) {
        vector<char> shortName(length);
        GetShortPathName(nvcc.c_str(), &shortName[0], length);
        nvcc = string(&shortName[0]);
    }
115
    setPropertyDefaultValue(CudaCompiler(), nvcc);
116
117
    setPropertyDefaultValue(CudaTempDirectory(), string(getenv("TEMP")));
#else
118
    char* compiler = getenv("OPENMM_CUDA_COMPILER");
119
    string nvcc = (compiler == NULL ? "/usr/local/cuda/bin/nvcc" : string(compiler));
120
    setPropertyDefaultValue(CudaCompiler(), nvcc);
121
    char* tmpdir = getenv("TMPDIR");
Peter Eastman's avatar
Peter Eastman committed
122
123
    string tmp = (tmpdir == NULL ? string(P_tmpdir) : string(tmpdir));
    setPropertyDefaultValue(CudaTempDirectory(), tmp);
124
#endif
125
126
    char* hostCompiler = getenv("CUDA_HOST_COMPILER");
    setPropertyDefaultValue(CudaHostCompiler(), (hostCompiler == NULL ? "" : string(hostCompiler)));
127
128
}

129
130
131
132
double CudaPlatform::getSpeed() const {
    return 100;
}

133
bool CudaPlatform::supportsDoublePrecision() const {
Peter Eastman's avatar
Peter Eastman committed
134
    return true;
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
}

const string& CudaPlatform::getPropertyValue(const Context& context, const string& property) const {
    const ContextImpl& impl = getContextImpl(context);
    const PlatformData* data = reinterpret_cast<const PlatformData*>(impl.getPlatformData());
    map<string, string>::const_iterator value = data->propertyValues.find(property);
    if (value != data->propertyValues.end())
        return value->second;
    return Platform::getPropertyValue(context, property);
}

void CudaPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
}

void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
    const string& devicePropValue = (properties.find(CudaDeviceIndex()) == properties.end() ?
            getPropertyDefaultValue(CudaDeviceIndex()) : properties.find(CudaDeviceIndex())->second);
    string blockingPropValue = (properties.find(CudaUseBlockingSync()) == properties.end() ?
            getPropertyDefaultValue(CudaUseBlockingSync()) : properties.find(CudaUseBlockingSync())->second);
    string precisionPropValue = (properties.find(CudaPrecision()) == properties.end() ?
            getPropertyDefaultValue(CudaPrecision()) : properties.find(CudaPrecision())->second);
156
157
    string cpuPmePropValue = (properties.find(CudaUseCpuPme()) == properties.end() ?
            getPropertyDefaultValue(CudaUseCpuPme()) : properties.find(CudaUseCpuPme())->second);
158
159
160
161
    const string& compilerPropValue = (properties.find(CudaCompiler()) == properties.end() ?
            getPropertyDefaultValue(CudaCompiler()) : properties.find(CudaCompiler())->second);
    const string& tempPropValue = (properties.find(CudaTempDirectory()) == properties.end() ?
            getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second);
162
163
    const string& hostCompilerPropValue = (properties.find(CudaHostCompiler()) == properties.end() ?
            getPropertyDefaultValue(CudaHostCompiler()) : properties.find(CudaHostCompiler())->second);
164
165
    transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
    transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
166
167
168
169
170
    transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
    vector<string> pmeKernelName;
    pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
    if (!supportsKernels(pmeKernelName))
        cpuPmePropValue = "false";
171
    context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue, hostCompilerPropValue));
172
173
174
175
176
177
178
}

void CudaPlatform::contextDestroyed(ContextImpl& context) const {
    PlatformData* data = reinterpret_cast<PlatformData*>(context.getPlatformData());
    delete data;
}

179
CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
180
            const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
181
182
183
184
185
186
187
188
    bool blocking = (blockingProperty == "true");
    vector<string> devices;
    size_t searchPos = 0, nextPos;
    while ((nextPos = deviceIndexProperty.find_first_of(", ", searchPos)) != string::npos) {
        devices.push_back(deviceIndexProperty.substr(searchPos, nextPos-searchPos));
        searchPos = nextPos+1;
    }
    devices.push_back(deviceIndexProperty.substr(searchPos));
189
190
191
192
193
    try {
        for (int i = 0; i < (int) devices.size(); i++) {
            if (devices[i].length() > 0) {
                unsigned int deviceIndex;
                stringstream(devices[i]) >> deviceIndex;
194
                contexts.push_back(new CudaContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
195
            }
196
        }
197
        if (contexts.size() == 0)
198
            contexts.push_back(new CudaContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this));
199
200
201
202
203
204
205
    }
    catch (...) {
        // If an exception was thrown, do our best to clean up memory.
        
        for (int i = 0; i < (int) contexts.size(); i++)
            delete contexts[i];
        throw;
206
    }
207
    stringstream deviceIndex, deviceName;
208
    for (int i = 0; i < (int) contexts.size(); i++) {
209
210
211
212
213
214
215
216
        if (i > 0) {
            deviceIndex << ',';
            deviceName << ',';
        }
        deviceIndex << contexts[i]->getDeviceIndex();
        char name[1000];
        CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
        deviceName << name;
217
    }
218
    useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
219
220
    propertyValues[CudaPlatform::CudaDeviceIndex()] = deviceIndex.str();
    propertyValues[CudaPlatform::CudaDeviceName()] = deviceName.str();
221
    propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
222
    propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
223
    propertyValues[CudaPlatform::CudaUseCpuPme()] = useCpuPme ? "true" : "false";
224
225
    propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
    propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
226
    propertyValues[CudaPlatform::CudaHostCompiler()] = hostCompilerProperty;
227
    contextEnergy.resize(contexts.size());
228
229
230
    
    // Determine whether peer-to-peer copying is supported, and enable it if so.
    
231
232
233
234
235
236
237
238
239
240
    peerAccessSupported = false; // Disable until I figure out why it usually makes things slower
//    peerAccessSupported = true;
//    for (int i = 1; i < contexts.size(); i++) {
//        int canAccess;
//        cuDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
//        if (!canAccess) {
//            peerAccessSupported = false;
//            break;
//        }
//    }
241
242
243
244
245
246
247
248
    if (peerAccessSupported) {
        for (int i = 1; i < contexts.size(); i++) {
            contexts[0]->setAsCurrent();
            CHECK_RESULT(cuCtxEnablePeerAccess(contexts[i]->getContext(), 0), "Error enabling peer access");
            contexts[i]->setAsCurrent();
            CHECK_RESULT(cuCtxEnablePeerAccess(contexts[0]->getContext(), 0), "Error enabling peer access");
        }
    }
249
250
251
252
253
254
255
256
}

CudaPlatform::PlatformData::~PlatformData() {
    for (int i = 0; i < (int) contexts.size(); i++)
        delete contexts[i];
}

void CudaPlatform::PlatformData::initializeContexts(const System& system) {
257
258
    for (int i = 0; i < (int) contexts.size(); i++)
        contexts[i]->initialize();
259
260
261
}

void CudaPlatform::PlatformData::syncContexts() {
262
263
    for (int i = 0; i < (int) contexts.size(); i++)
        contexts[i]->getWorkThread().flush();
264
}