/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009-2011 Stanford University and the Authors. * * Authors: Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see . * * -------------------------------------------------------------------------- */ #include "openmm/OpenMMException.h" #include "OpenCLNonbondedUtilities.h" #include "OpenCLArray.h" #include "OpenCLKernelSources.h" #include "OpenCLExpressionUtilities.h" #include #include #include using namespace OpenMM; using namespace std; OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL), interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), nonbondedForceGroup(0) { // Decide how many thread blocks and force buffers to use. deviceIsCpu = (context.getDevice().getInfo() == CL_DEVICE_TYPE_CPU); forceBufferPerAtomBlock = false; if (deviceIsCpu) { numForceThreadBlocks = context.getNumThreadBlocks(); forceThreadBlockSize = 1; numForceBuffers = numForceThreadBlocks; } else if (context.getSIMDWidth() == 32) { if (context.getSupports64BitGlobalAtomics()) { numForceThreadBlocks = 2*context.getDevice().getInfo(); forceThreadBlockSize = 256; // Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels. numForceBuffers = 1; } else { numForceThreadBlocks = 4*context.getDevice().getInfo(); forceThreadBlockSize = 128; numForceBuffers = numForceThreadBlocks; } } else { numForceThreadBlocks = context.getNumThreadBlocks(); forceThreadBlockSize = OpenCLContext::ThreadBlockSize; if (context.getSupports64BitGlobalAtomics()) { // Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels. numForceBuffers = 1; } else { numForceBuffers = numForceThreadBlocks; if (numForceBuffers >= context.getNumAtomBlocks()) { // For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp. forceBufferPerAtomBlock = true; numForceBuffers = context.getNumAtomBlocks(); } } } } OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() { if (exclusionIndices != NULL) delete exclusionIndices; if (exclusionRowIndices != NULL) delete exclusionRowIndices; if (exclusions != NULL) delete exclusions; if (interactingTiles != NULL) delete interactingTiles; if (interactionFlags != NULL) delete interactionFlags; if (interactionCount != NULL) delete interactionCount; if (blockCenter != NULL) delete blockCenter; if (blockBoundingBox != NULL) delete blockBoundingBox; } void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector >& exclusionList, const string& kernel, int forceGroup) { if (cutoff != -1.0) { if (usesCutoff != useCutoff) throw OpenMMException("All Forces must agree on whether to use a cutoff"); if (usesPeriodic != usePeriodic) throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions"); if (cutoffDistance != cutoff) throw OpenMMException("All Forces must use the same cutoff distance"); if (forceGroup != nonbondedForceGroup) throw OpenMMException("All nonbonded forces must be in the same force group"); } if (usesExclusions) requestExclusions(exclusionList); useCutoff = usesCutoff; usePeriodic = usesPeriodic; cutoff = cutoffDistance; kernelSource += kernel+"\n"; nonbondedForceGroup = forceGroup; } void OpenCLNonbondedUtilities::addParameter(const ParameterInfo& parameter) { parameters.push_back(parameter); } void OpenCLNonbondedUtilities::addArgument(const ParameterInfo& parameter) { arguments.push_back(parameter); } void OpenCLNonbondedUtilities::requestExclusions(const vector >& exclusionList) { if (anyExclusions) { bool sameExclusions = (exclusionList.size() == atomExclusions.size()); for (int i = 0; i < (int) exclusionList.size() && sameExclusions; i++) { if (exclusionList[i].size() != atomExclusions[i].size()) sameExclusions = false; for (int j = 0; j < (int) exclusionList[i].size(); j++) if (exclusionList[i][j] != atomExclusions[i][j]) sameExclusions = false; } if (!sameExclusions) throw OpenMMException("All Forces must have identical exceptions"); } else { atomExclusions = exclusionList; anyExclusions = true; } } void OpenCLNonbondedUtilities::initialize(const System& system) { if (cutoff == -1.0) return; // There are no nonbonded interactions in the System. if (atomExclusions.size() == 0) { // No exclusions were specifically requested, so just mark every atom as not interacting with itself. atomExclusions.resize(context.getNumAtoms()); for (int i = 0; i < (int) atomExclusions.size(); i++) atomExclusions[i].push_back(i); } // Create the list of tiles. int numAtomBlocks = context.getNumAtomBlocks(); int totalTiles = numAtomBlocks*(numAtomBlocks+1)/2; int numContexts = context.getPlatformData().contexts.size(); startTileIndex = context.getContextIndex()*totalTiles/numContexts; int endTileIndex = (context.getContextIndex()+1)*totalTiles/numContexts; numTiles = endTileIndex-startTileIndex; // Build a list of indices for the tiles with exclusions. set > tilesWithExclusions; for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) { int x = atom1/OpenCLContext::TileSize; for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) { int atom2 = atomExclusions[atom1][j]; int y = atom2/OpenCLContext::TileSize; tilesWithExclusions.insert(make_pair(max(x, y), min(x, y))); } } if (context.getPaddedNumAtoms() > context.getNumAtoms()) { for (int i = 0; i < numAtomBlocks; ++i) tilesWithExclusions.insert(make_pair(numAtomBlocks-1, i)); } vector exclusionRowIndicesVec(numAtomBlocks+1, 0); vector exclusionIndicesVec; int currentRow = 0; for (set >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) { while (iter->first != currentRow) exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size(); exclusionIndicesVec.push_back(iter->second); } exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size(); exclusionIndices = new OpenCLArray(context, exclusionIndicesVec.size(), "exclusionIndices"); exclusionRowIndices = new OpenCLArray(context, exclusionRowIndicesVec.size(), "exclusionRowIndices"); exclusionIndices->upload(exclusionIndicesVec); exclusionRowIndices->upload(exclusionRowIndicesVec); // Record the exclusion data. exclusions = new OpenCLArray(context, tilesWithExclusions.size()*OpenCLContext::TileSize, "exclusions"); vector exclusionVec(exclusions->getSize()); for (int i = 0; i < exclusions->getSize(); ++i) exclusionVec[i] = 0xFFFFFFFF; for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) { int x = atom1/OpenCLContext::TileSize; int offset1 = atom1-x*OpenCLContext::TileSize; for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) { int atom2 = atomExclusions[atom1][j]; int y = atom2/OpenCLContext::TileSize; int offset2 = atom2-y*OpenCLContext::TileSize; if (x > y) { int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec); exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<= y) { int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec); exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<= x) { int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec); exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<upload(exclusionVec); // Create data structures for the neighbor list. if (useCutoff) { // Select a size for the arrays that hold the neighbor list. This estimate is intentionally very // high, because if it ever is too small, we have to fall back to the N^2 algorithm. mm_float4 boxSize = context.getPeriodicBoxSize(); int maxInteractingTiles = (int) (numTiles*(cutoff/boxSize.x+cutoff/boxSize.y+cutoff/boxSize.z)); if (maxInteractingTiles > numTiles) maxInteractingTiles = numTiles; if (maxInteractingTiles < 1) maxInteractingTiles = 1; interactingTiles = new OpenCLArray(context, maxInteractingTiles, "interactingTiles"); interactionFlags = new OpenCLArray(context, context.getSIMDWidth() == 32 ? maxInteractingTiles : (deviceIsCpu ? 2*maxInteractingTiles : 1), "interactionFlags"); interactionCount = new OpenCLArray(context, 1, "interactionCount", true); blockCenter = new OpenCLArray(context, numAtomBlocks, "blockCenter"); blockBoundingBox = new OpenCLArray(context, numAtomBlocks, "blockBoundingBox"); interactionCount->set(0, 0); interactionCount->upload(); } // Create kernels. forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true); if (useCutoff) { map defines; defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks()); if (forceBufferPerAtomBlock) defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1"; if (usePeriodic) defines["USE_PERIODIC"] = "1"; string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks); cl::Program interactingBlocksProgram = context.createProgram(file, defines); findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds"); findBlockBoundsKernel.setArg(0, context.getNumAtoms()); findBlockBoundsKernel.setArg(3, context.getPosq().getDeviceBuffer()); findBlockBoundsKernel.setArg(4, blockCenter->getDeviceBuffer()); findBlockBoundsKernel.setArg(5, blockBoundingBox->getDeviceBuffer()); findBlockBoundsKernel.setArg(6, interactionCount->getDeviceBuffer()); findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions"); findInteractingBlocksKernel.setArg(0, (cl_float) (cutoff*cutoff)); findInteractingBlocksKernel.setArg(3, blockCenter->getDeviceBuffer()); findInteractingBlocksKernel.setArg(4, blockBoundingBox->getDeviceBuffer()); findInteractingBlocksKernel.setArg(5, interactionCount->getDeviceBuffer()); findInteractingBlocksKernel.setArg(6, interactingTiles->getDeviceBuffer()); findInteractingBlocksKernel.setArg(7, interactionFlags->getDeviceBuffer()); findInteractingBlocksKernel.setArg(8, context.getPosq().getDeviceBuffer()); findInteractingBlocksKernel.setArg(9, interactingTiles->getSize()); findInteractingBlocksKernel.setArg(10, startTileIndex); findInteractingBlocksKernel.setArg(11, startTileIndex+numTiles); if (context.getSIMDWidth() == 32 && !deviceIsCpu) { findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks"); findInteractionsWithinBlocksKernel.setArg(0, (cl_float) (cutoff*cutoff)); findInteractionsWithinBlocksKernel.setArg(3, context.getPosq().getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(4, interactingTiles->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(5, blockCenter->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(6, blockBoundingBox->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(7, interactionFlags->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(8, interactionCount->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(9, 128*sizeof(cl_uint), NULL); findInteractionsWithinBlocksKernel.setArg(10, interactingTiles->getSize()); } } } int OpenCLNonbondedUtilities::findExclusionIndex(int x, int y, const vector& exclusionIndices, const vector& exclusionRowIndices) { int start = exclusionRowIndices[x]; int end = exclusionRowIndices[x+1]; for (int i = start; i < end; i++) if (exclusionIndices[i] == y) return i*OpenCLContext::TileSize; throw OpenMMException("Internal error: exclusion in unexpected tile"); } void OpenCLNonbondedUtilities::prepareInteractions() { if (!useCutoff) return; if (usePeriodic) { mm_float4 box = context.getPeriodicBoxSize(); double minAllowedSize = 2*cutoff; if (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize) throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff."); } // Compute the neighbor list. findBlockBoundsKernel.setArg(1, context.getPeriodicBoxSize()); findBlockBoundsKernel.setArg(2, context.getInvPeriodicBoxSize()); context.executeKernel(findBlockBoundsKernel, context.getNumAtoms()); findInteractingBlocksKernel.setArg(1, context.getPeriodicBoxSize()); findInteractingBlocksKernel.setArg(2, context.getInvPeriodicBoxSize()); context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : -1); if (context.getSIMDWidth() == 32 && !deviceIsCpu) { findInteractionsWithinBlocksKernel.setArg(1, context.getPeriodicBoxSize()); findInteractionsWithinBlocksKernel.setArg(2, context.getInvPeriodicBoxSize()); context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms(), 128); } } void OpenCLNonbondedUtilities::computeInteractions() { if (cutoff != -1.0) { if (useCutoff) { forceKernel.setArg(10, context.getPeriodicBoxSize()); forceKernel.setArg(11, context.getInvPeriodicBoxSize()); } context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); } } void OpenCLNonbondedUtilities::updateNeighborListSize() { if (!useCutoff) return; interactionCount->download(); if (interactionCount->get(0) <= (unsigned int) interactingTiles->getSize()) return; // The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent // this from happening in the future. int newSize = (int) (1.2*interactionCount->get(0)); int numTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2; if (newSize > numTiles) newSize = numTiles; delete interactingTiles; interactingTiles = new OpenCLArray(context, newSize, "interactingTiles"); forceKernel.setArg(8, interactingTiles->getDeviceBuffer()); forceKernel.setArg(12, newSize); findInteractingBlocksKernel.setArg(6, interactingTiles->getDeviceBuffer()); findInteractingBlocksKernel.setArg(9, newSize); if (context.getSIMDWidth() == 32 || deviceIsCpu) { delete interactionFlags; interactionFlags = new OpenCLArray(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags"); forceKernel.setArg(13, interactionFlags->getDeviceBuffer()); findInteractingBlocksKernel.setArg(7, interactionFlags->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(4, interactingTiles->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(7, interactionFlags->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(10, newSize); } } void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) { this->startTileIndex = startTileIndex; this->numTiles = numTiles; if (cutoff == -1.0) return; // There are no nonbonded interactions in the System. forceKernel.setArg(6, startTileIndex); forceKernel.setArg(7, startTileIndex+numTiles); if (useCutoff) { findInteractingBlocksKernel.setArg(10, startTileIndex); findInteractingBlocksKernel.setArg(11, startTileIndex+numTiles); } else forceKernel.setArg(8, numTiles); } cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector& params, const vector& arguments, bool useExclusions, bool isSymmetric) const { map replacements; replacements["COMPUTE_INTERACTION"] = source; const string suffixes[] = {"x", "y", "z", "w"}; stringstream localData; int localDataSize = 0; for (int i = 0; i < (int) params.size(); i++) { if (params[i].getNumComponents() == 1) localData<() == CL_MEM_OBJECT_IMAGE2D) { args << ", __read_only image2d_t "; args << arguments[i].getName(); } else { if ((arguments[i].getMemory().getInfo() & CL_MEM_READ_ONLY) == 0) args << ", __global const "; else args << ", __constant "; args << arguments[i].getType(); args << "* restrict "; args << arguments[i].getName(); } } replacements["PARAMETER_ARGUMENTS"] = args.str(); stringstream loadLocal1; for (int i = 0; i < (int) params.size(); i++) { if (params[i].getNumComponents() == 1) { loadLocal1<<"localData[localAtomIndex]."< 0) load2j<<", "; load2j<<"localData[atom2]."< defines; if (forceBufferPerAtomBlock) defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1"; if (useCutoff) defines["USE_CUTOFF"] = "1"; if (usePeriodic) defines["USE_PERIODIC"] = "1"; if (useExclusions) defines["USE_EXCLUSIONS"] = "1"; if (isSymmetric) defines["USE_SYMMETRIC"] = "1"; defines["FORCE_WORK_GROUP_SIZE"] = OpenCLExpressionUtilities::intToString(forceThreadBlockSize); defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff); defines["NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getNumAtoms()); defines["PADDED_NUM_ATOMS"] = OpenCLExpressionUtilities::intToString(context.getPaddedNumAtoms()); defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(context.getNumAtomBlocks()); if ((localDataSize/4)%2 == 0) defines["PARAMETER_SIZE_IS_EVEN"] = "1"; string file; if (deviceIsCpu) file = OpenCLKernelSources::nonbonded_cpu; else if (context.getSIMDWidth() == 32) file = OpenCLKernelSources::nonbonded_nvidia; else file = OpenCLKernelSources::nonbonded_default; cl::Program program = context.createProgram(context.replaceStrings(file, replacements), defines); cl::Kernel kernel(program, "computeNonbonded"); // Set arguments to the Kernel. int index = 0; if (context.getSupports64BitGlobalAtomics()) kernel.setArg(index++, context.getLongForceBuffer().getDeviceBuffer()); else kernel.setArg(index++, context.getForceBuffers().getDeviceBuffer()); kernel.setArg(index++, context.getEnergyBuffer().getDeviceBuffer()); kernel.setArg(index++, context.getPosq().getDeviceBuffer()); kernel.setArg(index++, exclusions->getDeviceBuffer()); kernel.setArg(index++, exclusionIndices->getDeviceBuffer()); kernel.setArg(index++, exclusionRowIndices->getDeviceBuffer()); kernel.setArg(index++, startTileIndex); kernel.setArg(index++, startTileIndex+numTiles); if (useCutoff) { kernel.setArg(index++, interactingTiles->getDeviceBuffer()); kernel.setArg(index++, interactionCount->getDeviceBuffer()); index += 2; // The periodic box size arguments are set when the kernel is executed. kernel.setArg(index++, interactingTiles->getSize()); kernel.setArg(index++, interactionFlags->getDeviceBuffer()); } else { kernel.setArg(index++, numTiles); } for (int i = 0; i < (int) params.size(); i++) { kernel.setArg(index++, params[i].getMemory()); } for (int i = 0; i < (int) arguments.size(); i++) { kernel.setArg(index++, arguments[i].getMemory()); } return kernel; }