/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2011-2013 Stanford University and the Authors. * * Authors: Peter Eastman * * Contributors: * * * * Permission is hereby granted, free of charge, to any person obtaining a * * copy of this software and associated documentation files (the "Software"), * * to deal in the Software without restriction, including without limitation * * the rights to use, copy, modify, merge, publish, distribute, sublicense, * * and/or sell copies of the Software, and to permit persons to whom the * * Software is furnished to do so, subject to the following conditions: * * * * The above copyright notice and this permission notice shall be included in * * all copies or substantial portions of the Software. * * * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * * USE OR OTHER DEALINGS IN THE SOFTWARE. * * -------------------------------------------------------------------------- */ #include "OpenCLRpmdKernels.h" #include "OpenCLRpmdKernelSources.h" #include "openmm/internal/ContextImpl.h" #include "OpenCLIntegrationUtilities.h" #include "OpenCLExpressionUtilities.h" #include "OpenCLFFT3D.h" #include "OpenCLNonbondedUtilities.h" #include "../src/SimTKUtilities/SimTKOpenMMRealType.h" using namespace OpenMM; using namespace std; OpenCLIntegrateRPMDStepKernel::~OpenCLIntegrateRPMDStepKernel() { if (forces != NULL) delete forces; if (positions != NULL) delete positions; if (velocities != NULL) delete velocities; if (contractedForces != NULL) delete contractedForces; if (contractedPositions != NULL) delete contractedPositions; } void OpenCLIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) { cl.getPlatformData().initializeContexts(system); numCopies = integrator.getNumCopies(); numParticles = system.getNumParticles(); workgroupSize = numCopies; if (numCopies != OpenCLFFT3D::findLegalDimension(numCopies)) throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5."); int paddedParticles = cl.getPaddedNumAtoms(); int forceElementSize = (cl.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4)); forces = new OpenCLArray(cl, numCopies*paddedParticles, forceElementSize, "rpmdForces"); bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()); int elementSize = (useDoublePrecision ? sizeof(mm_double4) : sizeof(mm_float4)); positions = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdPositions"); velocities = new OpenCLArray(cl, numCopies*paddedParticles, elementSize, "rpmdVelocities"); cl.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed()); // Fill in the posq and velm arrays with safe values to avoid a risk of nans. if (useDoublePrecision) { vector temp(positions->getSize()); for (int i = 0; i < positions->getSize(); i++) temp[i] = mm_double4(0, 0, 0, 0); positions->upload(temp); for (int i = 0; i < velocities->getSize(); i++) temp[i] = mm_double4(0, 0, 0, 1); velocities->upload(temp); } else { vector temp(positions->getSize()); for (int i = 0; i < positions->getSize(); i++) temp[i] = mm_float4(0, 0, 0, 0); positions->upload(temp); for (int i = 0; i < velocities->getSize(); i++) temp[i] = mm_float4(0, 0, 0, 1); velocities->upload(temp); } // Build a list of contractions. groupsNotContracted = -1; const map& contractions = integrator.getContractions(); int maxContractedCopies = 0; for (map::const_iterator iter = contractions.begin(); iter != contractions.end(); ++iter) { int group = iter->first; int copies = iter->second; if (group < 0 || group > 31) throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31"); if (copies < 0 || copies > numCopies) throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated"); if (copies != numCopies) { if (groupsByCopies.find(copies) == groupsByCopies.end()) { groupsByCopies[copies] = 1< maxContractedCopies) maxContractedCopies = copies; } else groupsByCopies[copies] |= 1< 0) { contractedForces = new OpenCLArray(cl, maxContractedCopies*paddedParticles, forceElementSize, "rpmdContractedForces"); contractedPositions = new OpenCLArray(cl, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions"); } // Create kernels. map defines; defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms()); defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms()); defines["NUM_COPIES"] = cl.intToString(numCopies); defines["THREAD_BLOCK_SIZE"] = cl.intToString(workgroupSize); defines["HBAR"] = cl.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12)); defines["SCALE"] = cl.doubleToString(1.0/sqrt((double) numCopies)); defines["M_PI"] = cl.doubleToString(M_PI); map replacements; replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true); replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false); replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true); replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false); cl::Program program = cl.createProgram(cl.replaceStrings(OpenCLRpmdKernelSources::rpmd, replacements), defines, ""); pileKernel = cl::Kernel(program, "applyPileThermostat"); stepKernel = cl::Kernel(program, "integrateStep"); velocitiesKernel = cl::Kernel(program, "advanceVelocities"); copyToContextKernel = cl::Kernel(program, "copyDataToContext"); copyFromContextKernel = cl::Kernel(program, "copyDataFromContext"); translateKernel = cl::Kernel(program, "applyCellTranslations"); // Create kernels for doing contractions. for (map::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) { int copies = iter->first; replacements.clear(); replacements["NUM_CONTRACTED_COPIES"] = cl.intToString(copies); replacements["POS_SCALE"] = cl.doubleToString(1.0/numCopies); replacements["FORCE_SCALE"] = cl.doubleToString(1.0/copies); replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true); replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false); replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true); replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false); program = cl.createProgram(cl.replaceStrings(OpenCLRpmdKernelSources::rpmdContraction, replacements), defines, ""); positionContractionKernels[copies] = cl::Kernel(program, "contractPositions"); forceContractionKernels[copies] = cl::Kernel(program, "contractForces"); } } void OpenCLIntegrateRPMDStepKernel::initializeKernels(ContextImpl& context) { hasInitializedKernel = true; pileKernel.setArg(0, velocities->getDeviceBuffer()); stepKernel.setArg(0, positions->getDeviceBuffer()); stepKernel.setArg(1, velocities->getDeviceBuffer()); stepKernel.setArg(2, forces->getDeviceBuffer()); velocitiesKernel.setArg(0, velocities->getDeviceBuffer()); velocitiesKernel.setArg(1, forces->getDeviceBuffer()); translateKernel.setArg(0, positions->getDeviceBuffer()); translateKernel.setArg(1, cl.getPosq().getDeviceBuffer()); translateKernel.setArg(2, cl.getAtomIndexArray().getDeviceBuffer()); copyToContextKernel.setArg(0, velocities->getDeviceBuffer()); copyToContextKernel.setArg(1, cl.getVelm().getDeviceBuffer()); copyToContextKernel.setArg(3, cl.getPosq().getDeviceBuffer()); copyToContextKernel.setArg(4, cl.getAtomIndexArray().getDeviceBuffer()); copyFromContextKernel.setArg(0, cl.getForce().getDeviceBuffer()); copyFromContextKernel.setArg(2, cl.getVelm().getDeviceBuffer()); copyFromContextKernel.setArg(3, velocities->getDeviceBuffer()); copyFromContextKernel.setArg(4, cl.getPosq().getDeviceBuffer()); copyFromContextKernel.setArg(6, cl.getAtomIndexArray().getDeviceBuffer()); for (map::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) { int copies = iter->first; positionContractionKernels[copies].setArg(0, positions->getDeviceBuffer()); positionContractionKernels[copies].setArg(1, contractedPositions->getDeviceBuffer()); forceContractionKernels[copies].setArg(0, forces->getDeviceBuffer()); forceContractionKernels[copies].setArg(1, contractedForces->getDeviceBuffer()); } } void OpenCLIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) { OpenCLIntegrationUtilities& integration = cl.getIntegrationUtilities(); if (!hasInitializedKernel) initializeKernels(context); // Loop over copies and compute the force on each one. if (!forcesAreValid) computeForces(context); // Apply the PILE-L thermostat. bool useDoublePrecision = (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()); const double dt = integrator.getStepSize(); pileKernel.setArg(2, integration.prepareRandomNumbers(numParticles*numCopies)); pileKernel.setArg(1, integration.getRandom().getDeviceBuffer()); // Do this *after* prepareRandomNumbers(), which might rebuild the array. if (useDoublePrecision) { pileKernel.setArg(3, dt); pileKernel.setArg(4, integrator.getTemperature()*BOLTZ); pileKernel.setArg(5, integrator.getFriction()); stepKernel.setArg(3, dt); stepKernel.setArg(4, integrator.getTemperature()*BOLTZ); velocitiesKernel.setArg(2, dt); } else { pileKernel.setArg(3, (cl_float) dt); pileKernel.setArg(4, (cl_float) (integrator.getTemperature()*BOLTZ)); pileKernel.setArg(5, (cl_float) integrator.getFriction()); stepKernel.setArg(3, (cl_float) dt); stepKernel.setArg(4, (cl_float) (integrator.getTemperature()*BOLTZ)); velocitiesKernel.setArg(2, (cl_float) dt); } cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize); // Update positions and velocities. cl.executeKernel(stepKernel, numParticles*numCopies, workgroupSize); // Calculate forces based on the updated positions. computeForces(context); // Update velocities. cl.executeKernel(velocitiesKernel, numParticles*numCopies, workgroupSize); // Apply the PILE-L thermostat again. pileKernel.setArg(2, integration.prepareRandomNumbers(numParticles*numCopies)); cl.executeKernel(pileKernel, numParticles*numCopies, workgroupSize); // Update the time and step count. cl.setTime(cl.getTime()+dt); cl.setStepCount(cl.getStepCount()+1); cl.reorderAtoms(); if (cl.getAtomsWereReordered() && cl.getNonbondedUtilities().getUsePeriodic()) { // Atoms may have been translated into a different periodic box, so apply // the same translation to all the beads. translateKernel.setArg(3, numCopies-1); cl.executeKernel(translateKernel, cl.getNumAtoms()); } } void OpenCLIntegrateRPMDStepKernel::computeForces(ContextImpl& context) { // Compute forces from all groups that didn't have a specified contraction. copyToContextKernel.setArg(2, positions->getDeviceBuffer()); copyFromContextKernel.setArg(1, forces->getDeviceBuffer()); copyFromContextKernel.setArg(5, positions->getDeviceBuffer()); for (int i = 0; i < numCopies; i++) { copyToContextKernel.setArg(5, i); cl.executeKernel(copyToContextKernel, cl.getNumAtoms()); context.computeVirtualSites(); context.updateContextState(); context.calcForcesAndEnergy(true, false, groupsNotContracted); copyFromContextKernel.setArg(7, i); cl.executeKernel(copyFromContextKernel, cl.getNumAtoms()); } // Now loop over contractions and compute forces from them. if (groupsByCopies.size() > 0) { copyToContextKernel.setArg(2, contractedPositions->getDeviceBuffer()); copyFromContextKernel.setArg(1, contractedForces->getDeviceBuffer()); copyFromContextKernel.setArg(5, contractedPositions->getDeviceBuffer()); for (map::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) { int copies = iter->first; int groupFlags = iter->second; // Find the contracted positions. cl.executeKernel(positionContractionKernels[copies], numParticles*numCopies, workgroupSize); // Compute forces. for (int i = 0; i < copies; i++) { copyToContextKernel.setArg(5, i); cl.executeKernel(copyToContextKernel, cl.getNumAtoms()); context.computeVirtualSites(); context.calcForcesAndEnergy(true, false, groupFlags); copyFromContextKernel.setArg(7, i); cl.executeKernel(copyFromContextKernel, cl.getNumAtoms()); } // Apply the forces to the original copies. cl.executeKernel(forceContractionKernels[copies], numParticles*numCopies, workgroupSize); } } } double OpenCLIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) { return cl.getIntegrationUtilities().computeKineticEnergy(0); } void OpenCLIntegrateRPMDStepKernel::setPositions(int copy, const vector& pos) { if (positions == NULL) throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context"); if (pos.size() != numParticles) throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()"); if (cl.getUseDoublePrecision()) { vector posq(cl.getPaddedNumAtoms()); cl.getPosq().download(posq); for (int i = 0; i < numParticles; i++) posq[i] = mm_double4(pos[i][0], pos[i][1], pos[i][2], posq[i].w); cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]); } else if (cl.getUseMixedPrecision()) { vector posqf(cl.getPaddedNumAtoms()); cl.getPosq().download(posqf); vector posq(cl.getPaddedNumAtoms()); for (int i = 0; i < numParticles; i++) posq[i] = mm_double4(pos[i][0], pos[i][1], pos[i][2], posqf[i].w); cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &posq[0]); } else { vector posq(cl.getPaddedNumAtoms()); cl.getPosq().download(posq); for (int i = 0; i < numParticles; i++) posq[i] = mm_float4((cl_float) pos[i][0], (cl_float) pos[i][1], (cl_float) pos[i][2], posq[i].w); cl.getQueue().enqueueWriteBuffer(positions->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &posq[0]); } } void OpenCLIntegrateRPMDStepKernel::setVelocities(int copy, const vector& vel) { if (velocities == NULL) throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context"); if (vel.size() != numParticles) throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()"); if (cl.getUseDoublePrecision() || cl.getUseMixedPrecision()) { vector velm(cl.getPaddedNumAtoms()); cl.getVelm().download(velm); for (int i = 0; i < numParticles; i++) velm[i] = mm_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w); cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_double4), numParticles*sizeof(mm_double4), &velm[0]); } else { vector velm(cl.getPaddedNumAtoms()); cl.getVelm().download(velm); for (int i = 0; i < numParticles; i++) velm[i] = mm_float4((cl_float) vel[i][0], (cl_float) vel[i][1], (cl_float) vel[i][2], velm[i].w); cl.getQueue().enqueueWriteBuffer(velocities->getDeviceBuffer(), CL_TRUE, copy*cl.getPaddedNumAtoms()*sizeof(mm_float4), numParticles*sizeof(mm_float4), &velm[0]); } } void OpenCLIntegrateRPMDStepKernel::copyToContext(int copy, ContextImpl& context) { if (!hasInitializedKernel) initializeKernels(context); copyToContextKernel.setArg(2, positions->getDeviceBuffer()); copyToContextKernel.setArg(5, copy); cl.executeKernel(copyToContextKernel, cl.getNumAtoms()); } string OpenCLIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) { stringstream source; int unfactored = size; int stage = 0; int L = size; int m = 1; string sign = (forward ? "1.0f" : "-1.0f"); string multReal = (forward ? "multiplyComplexRealPart" : "multiplyComplexRealPartConj"); string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj"); source<<"{\n"; source<<"__local mixed4* real0 = "< 1) { int input = stage%2; int output = 1-input; source<<"{\n"; if (unfactored%5 == 0) { L = L/5; source<<"// Pass "<<(stage+1)<<" (radix 5)\n"; source<<"if (indexInBlock < "<<(L*m)<<") {\n"; source<<"int i = indexInBlock;\n"; source<<"int j = i/"<