HipParallelKernels.cpp 16 KB
Newer Older
1
2
3
4
5
6
7
8
/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
9
 * Portions copyright (c) 2011-2024 Stanford University and the Authors.      *
10
 * Portions copyright (c) 2020-2021 Advanced Micro Devices, Inc.              *
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
 * Authors: Peter Eastman, Nicholas Curtis                                    *
 * Contributors:                                                              *
 *                                                                            *
 * This program is free software: you can redistribute it and/or modify       *
 * it under the terms of the GNU Lesser General Public License as published   *
 * by the Free Software Foundation, either version 3 of the License, or       *
 * (at your option) any later version.                                        *
 *                                                                            *
 * This program is distributed in the hope that it will be useful,            *
 * but WITHOUT ANY WARRANTY; without even the implied warranty of             *
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the              *
 * GNU Lesser General Public License for more details.                        *
 *                                                                            *
 * You should have received a copy of the GNU Lesser General Public License   *
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.      *
 * -------------------------------------------------------------------------- */

#include "HipParallelKernels.h"
#include "HipKernelSources.h"
30
#include "openmm/common/ContextSelector.h"
Peter Eastman's avatar
Peter Eastman committed
31
#include "openmm/internal/timer.h"
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

using namespace OpenMM;
using namespace std;


#define CHECK_RESULT(result, prefix) \
if (result != hipSuccess) { \
    std::stringstream m; \
    m<<prefix<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
    throw OpenMMException(m.str());\
}

class HipParallelCalcForcesAndEnergyKernel::BeginComputationTask : public HipContext::WorkTask {
public:
    BeginComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
47
48
            bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, hipEvent_t event) : context(context), cu(cu), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event) {
49
50
51
52
    }
    void execute() {
        // Copy coordinates over to this device and execute the kernel.

53
        ContextSelector selector(cu);
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
        if (cu.getContextIndex() > 0) {
            hipStreamWaitEvent(cu.getCurrentStream(), event, 0);
            if (!cu.getPlatformData().peerAccessSupported)
                cu.getPosq().upload(pinnedMemory, false);
        }
        kernel.beginComputation(context, includeForce, includeEnergy, groups);
    }
private:
    ContextImpl& context;
    HipContext& cu;
    HipCalcForcesAndEnergyKernel& kernel;
    bool includeForce, includeEnergy;
    int groups;
    void* pinnedMemory;
    hipEvent_t event;
};

class HipParallelCalcForcesAndEnergyKernel::FinishComputationTask : public HipContext::WorkTask {
public:
    FinishComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
Peter Eastman's avatar
Peter Eastman committed
74
            bool includeForce, bool includeEnergy, int groups, double& energy, double& completionTime, long long* pinnedMemory, HipArray& contextForces,
75
            bool& valid, hipStream_t stream, hipEvent_t event, hipEvent_t localEvent) :
76
            context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
77
            completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid),
78
            stream(stream), event(event), localEvent(localEvent) {
79
80
81
82
    }
    void execute() {
        // Execute the kernel, then download forces.

83
        ContextSelector selector(cu);
84
85
86
87
88
        energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
        if (cu.getComputeForceCount() < 200) {
            // Record timing information for load balancing.  Since this takes time, only do it at the start of the simulation.

            CHECK_RESULT(hipStreamSynchronize(cu.getCurrentStream()), "Error synchronizing HIP context");
Peter Eastman's avatar
Peter Eastman committed
89
            completionTime = getCurrentTime();
90
91
92
        }
        if (includeForce) {
            if (cu.getContextIndex() > 0) {
93
94
                hipEventRecord(localEvent, cu.getCurrentStream());
                hipStreamWaitEvent(stream, localEvent, 0);
95
96
97
98
                int numAtoms = cu.getPaddedNumAtoms();
                if (cu.getPlatformData().peerAccessSupported) {
                    int numBytes = numAtoms*3*sizeof(long long);
                    int offset = (cu.getContextIndex()-1)*numBytes;
99
100
101
                    CHECK_RESULT(hipMemcpyAsync(static_cast<char*>(contextForces.getDevicePointer())+offset,
                                           cu.getForce().getDevicePointer(), numBytes, hipMemcpyDeviceToDevice, stream), "Error copying forces");
                    hipEventRecord(event, stream);
102
103
104
105
106
107
108
109
110
111
112
113
114
                }
                else
                    cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
            }
        }
    }
private:
    ContextImpl& context;
    HipContext& cu;
    HipCalcForcesAndEnergyKernel& kernel;
    bool includeForce, includeEnergy;
    int groups;
    double& energy;
Peter Eastman's avatar
Peter Eastman committed
115
    double& completionTime;
116
117
118
    long long* pinnedMemory;
    HipArray& contextForces;
    bool& valid;
119
120
121
    hipStream_t stream;
    hipEvent_t event;
    hipEvent_t localEvent;
122
123
124
125
};

HipParallelCalcForcesAndEnergyKernel::HipParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, HipPlatform::PlatformData& data) :
        CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
126
        pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
127
128
129
130
131
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new HipCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
}

HipParallelCalcForcesAndEnergyKernel::~HipParallelCalcForcesAndEnergyKernel() {
132
    ContextSelector selector(*data.contexts[0]);
133
134
135
136
137
    if (pinnedPositionBuffer != NULL)
        hipHostFree(pinnedPositionBuffer);
    if (pinnedForceBuffer != NULL)
        hipHostFree(pinnedForceBuffer);
    hipEventDestroy(event);
138
139
140
141
142
143
    for (int i = 0; i < peerCopyEvent.size(); i++)
        hipEventDestroy(peerCopyEvent[i]);
    for (int i = 0; i < peerCopyEventLocal.size(); i++)
        hipEventDestroy(peerCopyEventLocal[i]);
    for (int i = 0; i < peerCopyStream.size(); i++)
        hipStreamDestroy(peerCopyStream[i]);
144
145
146
147
}

void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
    HipContext& cu = *data.contexts[0];
148
    ContextSelector selector(cu);
149
150
151
152
153
154
155
    hipModule_t module = cu.createModule(HipKernelSources::parallel);
    sumKernel = cu.getKernel(module, "sumForces");
    int numContexts = data.contexts.size();
    for (int i = 0; i < numContexts; i++)
        getKernel(i).initialize(system);
    for (int i = 0; i < numContexts; i++)
        contextNonbondedFractions[i] = 1/(double) numContexts;
156
157
158
159
160
161
162
    CHECK_RESULT(hipEventCreateWithFlags(&event, cu.getEventFlags()), "Error creating event");
    peerCopyEvent.resize(numContexts);
    peerCopyEventLocal.resize(numContexts);
    peerCopyStream.resize(numContexts);
    for (int i = 0; i < numContexts; i++) {
        HipContext& cuLocal = *data.contexts[i];
        ContextSelector selectorLocal(cuLocal);
163
164
        CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEvent[i], cu.getEventFlags()), "Error creating event");
        CHECK_RESULT(hipStreamCreateWithFlags(&peerCopyStream[i], hipStreamNonBlocking), "Error creating stream");
165
166
        CHECK_RESULT(hipEventCreateWithFlags(&peerCopyEventLocal[i], cu.getEventFlags()), "Error creating event");
    }
167
168
169
170
}

void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
    HipContext& cu = *data.contexts[0];
171
    ContextSelector selector(cu);
172
173
    if (!contextForces.isInitialized()) {
        contextForces.initialize<long long>(cu, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms(), "contextForces");
174
175
176
177
        if (!cu.getPlatformData().peerAccessSupported) {
            CHECK_RESULT(hipHostMalloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), hipHostMallocPortable), "Error allocating pinned memory");
            CHECK_RESULT(hipHostMalloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), hipHostMallocPortable), "Error allocating pinned memory");
        }
178
179
180
181
182
183
184
185
186
187
188
    }

    // Copy coordinates over to each device and execute the kernel.

    if (!cu.getPlatformData().peerAccessSupported) {
        cu.getPosq().download(pinnedPositionBuffer, false);
        hipEventRecord(event, cu.getCurrentStream());
    }
    else {
        int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
        hipEventRecord(event, cu.getCurrentStream());
189
190
        for (int i = 1; i < (int) data.contexts.size(); i++) {
            hipStreamWaitEvent(peerCopyStream[i], event, 0);
191
192
193
            CHECK_RESULT(hipMemcpyAsync(
                data.contexts[i]->getPosq().getDevicePointer(),
                cu.getPosq().getDevicePointer(), numBytes,
194
195
196
                hipMemcpyDeviceToDevice, peerCopyStream[i]), "Error copying positions");
            hipEventRecord(peerCopyEvent[i], peerCopyStream[i]);
        }
197
198
199
200
201
    }
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        data.contextEnergy[i] = 0.0;
        HipContext& cu = *data.contexts[i];
        ComputeContext::WorkThread& thread = cu.getWorkThread();
202
        hipEvent_t waitEvent = (cu.getPlatformData().peerAccessSupported ? peerCopyEvent[i] : event);
203
        thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, waitEvent));
204
    }
205
    data.syncContexts();
206
207
208
209
210
211
}

double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        HipContext& cu = *data.contexts[i];
        ComputeContext::WorkThread& thread = cu.getWorkThread();
212
        thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i],
213
                pinnedForceBuffer, contextForces, valid, peerCopyStream[i], peerCopyEvent[i], peerCopyEventLocal[i]));
214
215
    }
    data.syncContexts();
216
217
218
219
220
    HipContext& cu = *data.contexts[0];
    ContextSelector selector(cu);
    if (cu.getPlatformData().peerAccessSupported)
        for (int i = 1; i < data.contexts.size(); i++)
            hipStreamWaitEvent(cu.getCurrentStream(), peerCopyEvent[i], 0);
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
    double energy = 0.0;
    for (int i = 0; i < (int) data.contextEnergy.size(); i++)
        energy += data.contextEnergy[i];
    if (includeForce && valid) {
        // Sum the forces from all devices.

        if (!cu.getPlatformData().peerAccessSupported)
            contextForces.upload(pinnedForceBuffer, false);
        int bufferSize = 3*cu.getPaddedNumAtoms();
        int numBuffers = data.contexts.size()-1;
        void* args[] = {&cu.getForce().getDevicePointer(), &contextForces.getDevicePointer(), &bufferSize, &numBuffers};
        cu.executeKernel(sumKernel, args, bufferSize);

        // Balance work between the contexts by transferring a little nonbonded work from the context that
        // finished last to the one that finished first.

        if (cu.getComputeForceCount() < 200) {
            int firstIndex = 0, lastIndex = 0;
239
            const double eps = 0.001;
240
241
242
            for (int i = 0; i < (int) completionTimes.size(); i++) {
                if (completionTimes[i] < completionTimes[firstIndex])
                    firstIndex = i;
243
                if (contextNonbondedFractions[lastIndex] < eps || completionTimes[i] > completionTimes[lastIndex])
244
245
                    lastIndex = i;
            }
246
            double fractionToTransfer = min(cu.getComputeForceCount() < 100 ? 0.01 : 0.001, contextNonbondedFractions[lastIndex]);
247
248
249
250
251
252
253
254
255
256
            contextNonbondedFractions[firstIndex] += fractionToTransfer;
            contextNonbondedFractions[lastIndex] -= fractionToTransfer;
            double startFraction = 0.0;
            for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
                double endFraction = startFraction+contextNonbondedFractions[i];
                if (i == contextNonbondedFractions.size()-1)
                    endFraction = 1.0; // Avoid roundoff error
                data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
                startFraction = endFraction;
            }
257
        }
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
    }
    return energy;
}

class HipParallelCalcNonbondedForceKernel::Task : public HipContext::WorkTask {
public:
    Task(ContextImpl& context, HipCalcNonbondedForceKernel& kernel, bool includeForce,
            bool includeEnergy, bool includeDirect, bool includeReciprocal, double& energy) : context(context), kernel(kernel),
            includeForce(includeForce), includeEnergy(includeEnergy), includeDirect(includeDirect), includeReciprocal(includeReciprocal), energy(energy) {
    }
    void execute() {
        energy += kernel.execute(context, includeForce, includeEnergy, includeDirect, includeReciprocal);
    }
private:
    ContextImpl& context;
    HipCalcNonbondedForceKernel& kernel;
    bool includeForce, includeEnergy, includeDirect, includeReciprocal;
    double& energy;
};

HipParallelCalcNonbondedForceKernel::HipParallelCalcNonbondedForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
        CalcNonbondedForceKernel(name, platform), data(data) {
    for (int i = 0; i < (int) data.contexts.size(); i++)
        kernels.push_back(Kernel(new HipCalcNonbondedForceKernel(name, platform, *data.contexts[i], system)));
}

void HipParallelCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
    for (int i = 0; i < (int) kernels.size(); i++)
        getKernel(i).initialize(system, force);
}

double HipParallelCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
    for (int i = 0; i < (int) data.contexts.size(); i++) {
        HipContext& cu = *data.contexts[i];
        ComputeContext::WorkThread& thread = cu.getWorkThread();
        thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, includeDirect, includeReciprocal, data.contextEnergy[i]));
    }
    return 0.0;
}

298
void HipParallelCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force, int firstParticle, int lastParticle, int firstException, int lastException) {
299
    for (int i = 0; i < (int) kernels.size(); i++)
300
        getKernel(i).copyParametersToContext(context, force, firstParticle, lastParticle, firstException, lastException);
301
302
303
304
305
306
307
308
309
}

void HipParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    dynamic_cast<const HipCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
}

void HipParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
    dynamic_cast<const HipCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
}