/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2011-2013 Stanford University and the Authors. * * Authors: Peter Eastman * * Contributors: * * * * Permission is hereby granted, free of charge, to any person obtaining a * * copy of this software and associated documentation files (the "Software"), * * to deal in the Software without restriction, including without limitation * * the rights to use, copy, modify, merge, publish, distribute, sublicense, * * and/or sell copies of the Software, and to permit persons to whom the * * Software is furnished to do so, subject to the following conditions: * * * * The above copyright notice and this permission notice shall be included in * * all copies or substantial portions of the Software. * * * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * * USE OR OTHER DEALINGS IN THE SOFTWARE. * * -------------------------------------------------------------------------- */ #include "CudaRpmdKernels.h" #include "CudaRpmdKernelSources.h" #include "openmm/internal/ContextImpl.h" #include "CudaIntegrationUtilities.h" #include "CudaExpressionUtilities.h" #include "CudaKernelSources.h" #include "CudaNonbondedUtilities.h" #include "SimTKOpenMMRealType.h" using namespace OpenMM; using namespace std; /** * Select a size for an FFT that is a multiple of 2, 3, 5, and 7. */ static int findFFTDimension(int minimum) { if (minimum < 1) return 1; while (true) { // Attempt to factor the current value. int unfactored = minimum; for (int factor = 2; factor < 8; factor++) { while (unfactored > 1 && unfactored%factor == 0) unfactored /= factor; } if (unfactored == 1) return minimum; minimum++; } } CudaIntegrateRPMDStepKernel::~CudaIntegrateRPMDStepKernel() { if (forces != NULL) delete forces; if (positions != NULL) delete positions; if (velocities != NULL) delete velocities; if (contractedForces != NULL) delete contractedForces; if (contractedPositions != NULL) delete contractedPositions; } void CudaIntegrateRPMDStepKernel::initialize(const System& system, const RPMDIntegrator& integrator) { cu.getPlatformData().initializeContexts(system); numCopies = integrator.getNumCopies(); numParticles = system.getNumParticles(); workgroupSize = numCopies; if (numCopies != findFFTDimension(numCopies)) throw OpenMMException("RPMDIntegrator: the number of copies must be a multiple of powers of 2, 3, and 5."); int paddedParticles = cu.getPaddedNumAtoms(); bool useDoublePrecision = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()); int elementSize = (useDoublePrecision ? sizeof(double4) : sizeof(float4)); forces = CudaArray::create(cu, numCopies*paddedParticles*3, "rpmdForces"); positions = new CudaArray(cu, numCopies*paddedParticles, elementSize, "rpmdPositions"); velocities = new CudaArray(cu, numCopies*paddedParticles, elementSize, "rpmdVelocities"); cu.getIntegrationUtilities().initRandomNumberGenerator((unsigned int) integrator.getRandomNumberSeed()); // Fill in the posq and velm arrays with safe values to avoid a risk of nans. if (useDoublePrecision) { vector temp(positions->getSize()); for (int i = 0; i < positions->getSize(); i++) temp[i] = make_double4(0, 0, 0, 0); positions->upload(temp); for (int i = 0; i < velocities->getSize(); i++) temp[i] = make_double4(0, 0, 0, 1); velocities->upload(temp); } else { vector temp(positions->getSize()); for (int i = 0; i < positions->getSize(); i++) temp[i] = make_float4(0, 0, 0, 0); positions->upload(temp); for (int i = 0; i < velocities->getSize(); i++) temp[i] = make_float4(0, 0, 0, 1); velocities->upload(temp); } // Build a list of contractions. groupsNotContracted = -1; const map& contractions = integrator.getContractions(); int maxContractedCopies = 0; for (map::const_iterator iter = contractions.begin(); iter != contractions.end(); ++iter) { int group = iter->first; int copies = iter->second; if (group < 0 || group > 31) throw OpenMMException("RPMDIntegrator: Force group must be between 0 and 31"); if (copies < 0 || copies > numCopies) throw OpenMMException("RPMDIntegrator: Number of copies for contraction cannot be greater than the total number of copies being simulated"); if (copies != findFFTDimension(copies)) throw OpenMMException("RPMDIntegrator: Number of copies for contraction must be a multiple of powers of 2, 3, and 5."); if (copies != numCopies) { if (groupsByCopies.find(copies) == groupsByCopies.end()) { groupsByCopies[copies] = 1< maxContractedCopies) maxContractedCopies = copies; } else groupsByCopies[copies] |= 1< 0) { contractedForces = CudaArray::create(cu, maxContractedCopies*paddedParticles*3, "rpmdContractedForces"); contractedPositions = new CudaArray(cu, maxContractedCopies*paddedParticles, elementSize, "rpmdContractedPositions"); } // Create kernels. map defines; defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); defines["NUM_COPIES"] = cu.intToString(numCopies); defines["THREAD_BLOCK_SIZE"] = cu.intToString(workgroupSize); defines["HBAR"] = cu.doubleToString(1.054571628e-34*AVOGADRO/(1000*1e-12)); defines["SCALE"] = cu.doubleToString(1.0/sqrt((double) numCopies)); defines["M_PI"] = cu.doubleToString(M_PI); map replacements; replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true); replacements["FFT_Q_BACKWARD"] = createFFT(numCopies, "q", false); replacements["FFT_V_FORWARD"] = createFFT(numCopies, "v", true); replacements["FFT_V_BACKWARD"] = createFFT(numCopies, "v", false); CUmodule module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaRpmdKernelSources::rpmd, replacements), defines, ""); pileKernel = cu.getKernel(module, "applyPileThermostat"); stepKernel = cu.getKernel(module, "integrateStep"); velocitiesKernel = cu.getKernel(module, "advanceVelocities"); copyToContextKernel = cu.getKernel(module, "copyDataToContext"); copyFromContextKernel = cu.getKernel(module, "copyDataFromContext"); translateKernel = cu.getKernel(module, "applyCellTranslations"); // Create kernels for doing contractions. for (map::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) { int copies = iter->first; replacements.clear(); replacements["NUM_CONTRACTED_COPIES"] = cu.intToString(copies); replacements["POS_SCALE"] = cu.doubleToString(1.0/numCopies); replacements["FORCE_SCALE"] = cu.doubleToString(0x100000000/(double) copies); replacements["FFT_Q_FORWARD"] = createFFT(numCopies, "q", true); replacements["FFT_Q_BACKWARD"] = createFFT(copies, "q", false); replacements["FFT_F_FORWARD"] = createFFT(copies, "f", true); replacements["FFT_F_BACKWARD"] = createFFT(numCopies, "f", false); module = cu.createModule(cu.replaceStrings(CudaKernelSources::vectorOps+CudaRpmdKernelSources::rpmdContraction, replacements), defines, ""); positionContractionKernels[copies] = cu.getKernel(module, "contractPositions"); forceContractionKernels[copies] = cu.getKernel(module, "contractForces"); } } void CudaIntegrateRPMDStepKernel::execute(ContextImpl& context, const RPMDIntegrator& integrator, bool forcesAreValid) { cu.setAsCurrent(); CudaIntegrationUtilities& integration = cu.getIntegrationUtilities(); // Loop over copies and compute the force on each one. if (!forcesAreValid) computeForces(context); // Apply the PILE-L thermostat. bool useDoublePrecision = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()); double dt = integrator.getStepSize(); float dtFloat = (float) dt; void* dtPtr = (useDoublePrecision ? (void*) &dt : (void*) &dtFloat); double kT = integrator.getTemperature()*BOLTZ; float kTFloat = (float) kT; void* kTPtr = (useDoublePrecision ? (void*) &kT : (void*) &kTFloat); double friction = integrator.getFriction(); float frictionFloat = (float) friction; void* frictionPtr = (useDoublePrecision ? (void*) &friction : (void*) &frictionFloat); int randomIndex = integration.prepareRandomNumbers(numParticles*numCopies); void* pileArgs[] = {&velocities->getDevicePointer(), &integration.getRandom().getDevicePointer(), &randomIndex, dtPtr, kTPtr, frictionPtr}; if (integrator.getApplyThermostat()) cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize); // Update positions and velocities. void* stepArgs[] = {&positions->getDevicePointer(), &velocities->getDevicePointer(), &forces->getDevicePointer(), dtPtr, kTPtr}; cu.executeKernel(stepKernel, stepArgs, numParticles*numCopies, workgroupSize); // Calculate forces based on the updated positions. computeForces(context); // Update velocities. void* velocitiesArgs[] = {&velocities->getDevicePointer(), &forces->getDevicePointer(), dtPtr}; cu.executeKernel(velocitiesKernel, velocitiesArgs, numParticles*numCopies, workgroupSize); // Apply the PILE-L thermostat again. if (integrator.getApplyThermostat()) { randomIndex = integration.prepareRandomNumbers(numParticles*numCopies); cu.executeKernel(pileKernel, pileArgs, numParticles*numCopies, workgroupSize); } // Update the time and step count. cu.setTime(cu.getTime()+dt); cu.setStepCount(cu.getStepCount()+1); cu.reorderAtoms(); if (cu.getAtomsWereReordered() && cu.getNonbondedUtilities().getUsePeriodic()) { // Atoms may have been translated into a different periodic box, so apply // the same translation to all the beads. int i = numCopies-1; void* args[] = {&positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; cu.executeKernel(translateKernel, args, cu.getNumAtoms()); } } void CudaIntegrateRPMDStepKernel::computeForces(ContextImpl& context) { // Compute forces from all groups that didn't have a specified contraction. for (int i = 0; i < numCopies; i++) { void* copyToContextArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms()); context.computeVirtualSites(); Vec3 initialBox[3]; context.getPeriodicBoxVectors(initialBox[0], initialBox[1], initialBox[2]); context.updateContextState(); Vec3 finalBox[3]; context.getPeriodicBoxVectors(finalBox[0], finalBox[1], finalBox[2]); if (initialBox[0] != finalBox[0] || initialBox[1] != finalBox[1] || initialBox[2] != finalBox[2]) throw OpenMMException("Standard barostats cannot be used with RPMDIntegrator. Use RPMDMonteCarloBarostat instead."); context.calcForcesAndEnergy(true, false, groupsNotContracted); void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &forces->getDevicePointer(), &cu.getVelm().getDevicePointer(), &velocities->getDevicePointer(), &cu.getPosq().getDevicePointer(), &positions->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms()); } // Now loop over contractions and compute forces from them. for (map::const_iterator iter = groupsByCopies.begin(); iter != groupsByCopies.end(); ++iter) { int copies = iter->first; int groupFlags = iter->second; // Find the contracted positions. void* contractPosArgs[] = {&positions->getDevicePointer(), &contractedPositions->getDevicePointer()}; cu.executeKernel(positionContractionKernels[copies], contractPosArgs, numParticles*numCopies, workgroupSize); // Compute forces. for (int i = 0; i < copies; i++) { void* copyToContextArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &contractedPositions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms()); context.computeVirtualSites(); context.calcForcesAndEnergy(true, false, groupFlags); void* copyFromContextArgs[] = {&cu.getForce().getDevicePointer(), &contractedForces->getDevicePointer(), &cu.getVelm().getDevicePointer(), &velocities->getDevicePointer(), &cu.getPosq().getDevicePointer(), &contractedPositions->getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; cu.executeKernel(copyFromContextKernel, copyFromContextArgs, cu.getNumAtoms()); } // Apply the forces to the original copies. void* contractForceArgs[] = {&forces->getDevicePointer(), &contractedForces->getDevicePointer()}; cu.executeKernel(forceContractionKernels[copies], contractForceArgs, numParticles*numCopies, workgroupSize); } if (groupsByCopies.size() > 0) { // Ensure the Context contains the positions from the last copy, since we'll assume that later. int i = numCopies-1; void* copyToContextArgs[] = {&velocities->getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), &i}; cu.executeKernel(copyToContextKernel, copyToContextArgs, cu.getNumAtoms()); } } double CudaIntegrateRPMDStepKernel::computeKineticEnergy(ContextImpl& context, const RPMDIntegrator& integrator) { return cu.getIntegrationUtilities().computeKineticEnergy(0); } void CudaIntegrateRPMDStepKernel::setPositions(int copy, const vector& pos) { if (positions == NULL) throw OpenMMException("RPMDIntegrator: Cannot set positions before the integrator is added to a Context"); if (pos.size() != numParticles) throw OpenMMException("RPMDIntegrator: wrong number of values passed to setPositions()"); // Adjust the positions based on the current cell offsets. const vector& order = cu.getAtomIndex(); double4 periodicBoxSize = cu.getPeriodicBoxSize(); vector offsetPos(numParticles); for (int i = 0; i < numParticles; ++i) { int4 offset = cu.getPosCellOffsets()[i]; offsetPos[order[i]] = pos[order[i]] + Vec3(offset.x*periodicBoxSize.x, offset.y*periodicBoxSize.y, offset.z*periodicBoxSize.z); } // Record the positions. CUresult result; if (cu.getUseDoublePrecision()) { vector posq(cu.getPaddedNumAtoms()); cu.getPosq().download(posq); for (int i = 0; i < numParticles; i++) posq[i] = make_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posq[i].w); result = cuMemcpyHtoD(positions->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &posq[0], numParticles*sizeof(double4)); } else if (cu.getUseMixedPrecision()) { vector posqf(cu.getPaddedNumAtoms()); cu.getPosq().download(posqf); vector posq(cu.getPaddedNumAtoms()); for (int i = 0; i < numParticles; i++) posq[i] = make_double4(offsetPos[i][0], offsetPos[i][1], offsetPos[i][2], posqf[i].w); result = cuMemcpyHtoD(positions->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &posq[0], numParticles*sizeof(double4)); } else { vector posq(cu.getPaddedNumAtoms()); cu.getPosq().download(posq); for (int i = 0; i < numParticles; i++) posq[i] = make_float4((float) offsetPos[i][0], (float) offsetPos[i][1], (float) offsetPos[i][2], posq[i].w); result = cuMemcpyHtoD(positions->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &posq[0], numParticles*sizeof(float4)); } if (result != CUDA_SUCCESS) { std::stringstream str; str<<"Error uploading array "<getName()<<": "<& vel) { if (velocities == NULL) throw OpenMMException("RPMDIntegrator: Cannot set velocities before the integrator is added to a Context"); if (vel.size() != numParticles) throw OpenMMException("RPMDIntegrator: wrong number of values passed to setVelocities()"); CUresult result; if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) { vector velm(cu.getPaddedNumAtoms()); cu.getVelm().download(velm); for (int i = 0; i < numParticles; i++) velm[i] = make_double4(vel[i][0], vel[i][1], vel[i][2], velm[i].w); result = cuMemcpyHtoD(velocities->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(double4), &velm[0], numParticles*sizeof(double4)); } else { vector velm(cu.getPaddedNumAtoms()); cu.getVelm().download(velm); for (int i = 0; i < numParticles; i++) velm[i] = make_float4((float) vel[i][0], (float) vel[i][1], (float) vel[i][2], velm[i].w); result = cuMemcpyHtoD(velocities->getDevicePointer()+copy*cu.getPaddedNumAtoms()*sizeof(float4), &velm[0], numParticles*sizeof(float4)); } if (result != CUDA_SUCCESS) { std::stringstream str; str<<"Error uploading array "<getName()<<": "<getDevicePointer(), &cu.getVelm().getDevicePointer(), &positions->getDevicePointer(), &cu.getPosq().getDevicePointer(), &cu.getAtomIndexArray().getDevicePointer(), ©}; cu.executeKernel(copyToContextKernel, copyArgs, cu.getNumAtoms()); } string CudaIntegrateRPMDStepKernel::createFFT(int size, const string& variable, bool forward) { stringstream source; int stage = 0; int L = size; int m = 1; string sign = (forward ? "1.0f" : "-1.0f"); string multReal = (forward ? "multiplyComplexRealPart" : "multiplyComplexRealPartConj"); string multImag = (forward ? "multiplyComplexImagPart" : "multiplyComplexImagPartConj"); source<<"{\n"; source<<"mixed3* real0 = "< 1) { int input = stage%2; int output = 1-input; int radix; if (L%5 == 0) radix = 5; else if (L%4 == 0) radix = 4; else if (L%3 == 0) radix = 3; else if (L%2 == 0) radix = 2; else throw OpenMMException("Illegal size for FFT: "+cu.intToString(size)); source<<"{\n"; L = L/radix; source<<"// Pass "<<(stage+1)<<" (radix "<