/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009-2010 Stanford University and the Authors. * * Authors: Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see . * * -------------------------------------------------------------------------- */ #include "OpenCLNonbondedUtilities.h" #include "OpenCLArray.h" #include "OpenCLCompact.h" #include "OpenCLKernelSources.h" #include "OpenCLExpressionUtilities.h" #include using namespace OpenMM; using namespace std; OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), numForceBuffers(0), tiles(NULL), exclusionIndex(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL), interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), compact(NULL) { // Decide how many force buffers to use. forceBufferPerAtomBlock = false; numForceBuffers = context.getNumThreadBlocks()*OpenCLContext::ThreadBlockSize/OpenCLContext::TileSize; if (numForceBuffers >= context.getNumAtomBlocks()) { // For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp. forceBufferPerAtomBlock = true; numForceBuffers = context.getNumAtomBlocks(); } } OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() { if (tiles != NULL) delete tiles; if (exclusionIndex != NULL) delete exclusionIndex; if (exclusions != NULL) delete exclusions; if (interactingTiles != NULL) delete interactingTiles; if (interactionFlags != NULL) delete interactionFlags; if (interactionCount != NULL) delete interactionCount; if (blockCenter != NULL) delete blockCenter; if (blockBoundingBox != NULL) delete blockBoundingBox; if (compact != NULL) delete compact; } void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector >& exclusionList, const string& kernel) { if (cutoff != -1.0) { if (usesCutoff != useCutoff) throw OpenMMException("All Forces must agree on whether to use a cutoff"); if (usesPeriodic != usePeriodic) throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions"); if (cutoffDistance != cutoff) throw OpenMMException("All Forces must use the same cutoff distance"); } if (usesExclusions && atomExclusions.size() != 0) { bool sameExclusions = (exclusionList.size() == atomExclusions.size()); for (int i = 0; i < (int) exclusionList.size() && sameExclusions; i++) { if (exclusionList[i].size() != atomExclusions[i].size()) sameExclusions = false; for (int j = 0; j < (int) exclusionList[i].size(); j++) if (exclusionList[i][j] != atomExclusions[i][j]) sameExclusions = false; } if (!sameExclusions) throw OpenMMException("All Forces must have identical exceptions"); } useCutoff = usesCutoff; usePeriodic = usesPeriodic; cutoff = cutoffDistance; kernelSource += kernel+"\n"; if (usesExclusions) atomExclusions = exclusionList; } void OpenCLNonbondedUtilities::addParameter(const ParameterInfo& parameter) { parameters.push_back(parameter); } void OpenCLNonbondedUtilities::addArgument(const ParameterInfo& parameter) { arguments.push_back(parameter); } void OpenCLNonbondedUtilities::initialize(const System& system) { if (cutoff == -1.0) return; // There are no nonbonded interactions in the System. if (atomExclusions.size() == 0) { // No exclusions were specifically requested, so just mark every atom as not interacting with itself. atomExclusions.resize(context.getNumAtoms()); for (int i = 0; i < (int) atomExclusions.size(); i++) atomExclusions[i].push_back(i); } // Create the list of tiles. int numAtomBlocks = context.getNumAtomBlocks(); int numTiles = numAtomBlocks*(numAtomBlocks+1)/2; tiles = new OpenCLArray(context, numTiles, "tiles"); vector tileVec(tiles->getSize()); unsigned int count = 0; for (unsigned int y = 0; y < (unsigned int) numAtomBlocks; y++) for (unsigned int x = y; x < (unsigned int) numAtomBlocks; x++) tileVec[count++] = (x << 17) | (y << 2); // Mark which tiles have exclusions. for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) { int x = atom1/OpenCLContext::TileSize; for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) { int atom2 = atomExclusions[atom1][j]; int y = atom2/OpenCLContext::TileSize; int index = (x > y ? x+y*numAtomBlocks-y*(y+1)/2 : y+x*numAtomBlocks-x*(x+1)/2); tileVec[index] |= 1; } } if (context.getPaddedNumAtoms() > context.getNumAtoms()) { int lastTile = context.getNumAtoms()/OpenCLContext::TileSize; for (int i = 0; i < numTiles; ++i) { int x = tileVec[i]>>17; int y = (tileVec[i]>>2)&0x7FFF; if (x == lastTile || y == lastTile) tileVec[i] |= 1; } } // Build a list of indices for the tiles with exclusions. exclusionIndex = new OpenCLArray(context, numTiles, "exclusionIndex"); vector exclusionIndexVec(exclusionIndex->getSize()); int numWithExclusions = 0; for (int i = 0; i < numTiles; ++i) if ((tileVec[i]&1) == 1) exclusionIndexVec[i] = (numWithExclusions++)*OpenCLContext::TileSize; // Record the exclusion data. exclusions = new OpenCLArray(context, numWithExclusions*OpenCLContext::TileSize, "exclusions"); vector exclusionVec(exclusions->getSize()); for (int i = 0; i < exclusions->getSize(); ++i) exclusionVec[i] = 0xFFFFFFFF; for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) { int x = atom1/OpenCLContext::TileSize; int offset1 = atom1-x*OpenCLContext::TileSize; for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) { int atom2 = atomExclusions[atom1][j]; int y = atom2/OpenCLContext::TileSize; int offset2 = atom2-y*OpenCLContext::TileSize; if (x > y) { int tile = x+y*numAtomBlocks-y*(y+1)/2; exclusionVec[exclusionIndexVec[tile]+offset1] &= 0xFFFFFFFF-(1<= y) { int tile = x+y*numAtomBlocks-y*(y+1)/2; exclusionVec[exclusionIndexVec[tile]+offset1] &= 0xFFFFFFFF-(1<= x) { int tile = y+x*numAtomBlocks-x*(x+1)/2; exclusionVec[exclusionIndexVec[tile]+offset2] &= 0xFFFFFFFF-(1<upload(tileVec); exclusions->upload(exclusionVec); exclusionIndex->upload(exclusionIndexVec); // Record the periodic box size. Vec3 boxVectors[3]; system.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]); periodicBoxSize = mm_float4((float) boxVectors[0][0], (float) boxVectors[1][1], (float) boxVectors[2][2], 0.0f); // Create data structures for the neighbor list. if (useCutoff) { interactingTiles = new OpenCLArray(context, numTiles, "interactingTiles"); interactionFlags = new OpenCLArray(context, numTiles, "interactionFlags"); interactionCount = new OpenCLArray(context, 1, "interactionCount"); blockCenter = new OpenCLArray(context, numAtomBlocks, "blockCenter"); blockBoundingBox = new OpenCLArray(context, numAtomBlocks, "blockBoundingBox"); compact = new OpenCLCompact(context); } // Create kernels. forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true); if (useCutoff) { map defines; if (forceBufferPerAtomBlock) defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1"; if (usePeriodic) defines["USE_PERIODIC"] = "1"; cl::Program interactingBlocksProgram = context.createProgram(OpenCLKernelSources::findInteractingBlocks, defines); findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds"); findBlockBoundsKernel.setArg(0, context.getNumAtoms()); findBlockBoundsKernel.setArg(1, periodicBoxSize); findBlockBoundsKernel.setArg(2, context.getPosq().getDeviceBuffer()); findBlockBoundsKernel.setArg(3, blockCenter->getDeviceBuffer()); findBlockBoundsKernel.setArg(4, blockBoundingBox->getDeviceBuffer()); findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions"); findInteractingBlocksKernel.setArg(0, tiles->getSize()); findInteractingBlocksKernel.setArg(1, (cl_float) (cutoff*cutoff)); findInteractingBlocksKernel.setArg(2, periodicBoxSize); findInteractingBlocksKernel.setArg(3, tiles->getDeviceBuffer()); findInteractingBlocksKernel.setArg(4, blockCenter->getDeviceBuffer()); findInteractingBlocksKernel.setArg(5, blockBoundingBox->getDeviceBuffer()); findInteractingBlocksKernel.setArg(6, interactionFlags->getDeviceBuffer()); findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks"); findInteractionsWithinBlocksKernel.setArg(0, (cl_float) (cutoff*cutoff)); findInteractionsWithinBlocksKernel.setArg(1, periodicBoxSize); findInteractionsWithinBlocksKernel.setArg(2, context.getPosq().getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(3, interactingTiles->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(4, blockCenter->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(5, blockBoundingBox->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(6, interactionFlags->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(7, interactionCount->getDeviceBuffer()); findInteractionsWithinBlocksKernel.setArg(8, OpenCLContext::ThreadBlockSize*sizeof(cl_uint), NULL); } } void OpenCLNonbondedUtilities::prepareInteractions() { if (!useCutoff) return; // Compute the neighbor list. context.executeKernel(findBlockBoundsKernel, context.getNumAtoms()); context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms()); compact->compactStream(*interactingTiles, *tiles, *interactionFlags, *interactionCount); if (context.getSIMDWidth() == 32) context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms()); } void OpenCLNonbondedUtilities::computeInteractions() { if (tiles != NULL) context.executeKernel(forceKernel, tiles->getSize()*OpenCLContext::TileSize); } cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector& params, const vector& arguments, bool useExclusions) const { map replacements; replacements["COMPUTE_INTERACTION"] = source; int localDataSize = 7*sizeof(cl_float); const string suffixes[] = {"x", "y", "z", "w"}; stringstream localData; for (int i = 0; i < (int) params.size(); i++) { if (params[i].getNumComponents() == 1) localData<() == CL_MEM_OBJECT_IMAGE2D) { args << ", __read_only image2d_t "; args << arguments[i].getName(); } else { if ((arguments[i].getMemory().getInfo() & CL_MEM_READ_ONLY) == 0) args << ", __global "; else args << ", __constant "; args << arguments[i].getType(); args << "* "; args << arguments[i].getName(); } } replacements["PARAMETER_ARGUMENTS"] = args.str(); stringstream loadLocal1; for (int i = 0; i < (int) params.size(); i++) { if (params[i].getNumComponents() == 1) { loadLocal1<<"localData[get_local_id(0)]."< 0) load2j<<", "; load2j<<"localData[atom2]."< defines; if (forceBufferPerAtomBlock) defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1"; if (useCutoff) defines["USE_CUTOFF"] = "1"; if (usePeriodic) defines["USE_PERIODIC"] = "1"; if (useExclusions) defines["USE_EXCLUSIONS"] = "1"; defines["PERIODIC_BOX_SIZE_X"] = OpenCLExpressionUtilities::doubleToString(periodicBoxSize.x); defines["PERIODIC_BOX_SIZE_Y"] = OpenCLExpressionUtilities::doubleToString(periodicBoxSize.y); defines["PERIODIC_BOX_SIZE_Z"] = OpenCLExpressionUtilities::doubleToString(periodicBoxSize.z); defines["INV_PERIODIC_BOX_SIZE_X"] = OpenCLExpressionUtilities::doubleToString(1.0/periodicBoxSize.x); defines["INV_PERIODIC_BOX_SIZE_Y"] = OpenCLExpressionUtilities::doubleToString(1.0/periodicBoxSize.y); defines["INV_PERIODIC_BOX_SIZE_Z"] = OpenCLExpressionUtilities::doubleToString(1.0/periodicBoxSize.z); defines["CUTOFF_SQUARED"] = OpenCLExpressionUtilities::doubleToString(cutoff*cutoff); stringstream natom, padded; natom << context.getNumAtoms(); padded << context.getPaddedNumAtoms(); defines["NUM_ATOMS"] = natom.str(); defines["PADDED_NUM_ATOMS"] = padded.str(); string file = (context.getSIMDWidth() == 32 ? OpenCLKernelSources::nonbonded_nvidia : OpenCLKernelSources::nonbonded_default); cl::Program program = context.createProgram(context.replaceStrings(file, replacements), defines); cl::Kernel kernel(program, "computeNonbonded"); // Set arguments to the Kernel. int index = 0; kernel.setArg(index++, context.getForceBuffers().getDeviceBuffer()); kernel.setArg(index++, context.getEnergyBuffer().getDeviceBuffer()); kernel.setArg(index++, context.getPosq().getDeviceBuffer()); kernel.setArg(index++, exclusions->getDeviceBuffer()); kernel.setArg(index++, exclusionIndex->getDeviceBuffer()); kernel.setArg(index++, OpenCLContext::ThreadBlockSize*(2*sizeof(cl_float4)+localDataSize), NULL); kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); if (useCutoff) { kernel.setArg(index++, interactingTiles->getDeviceBuffer()); kernel.setArg(index++, interactionFlags->getDeviceBuffer()); kernel.setArg(index++, interactionCount->getDeviceBuffer()); } else { kernel.setArg(index++, tiles->getDeviceBuffer()); kernel.setArg(index++, tiles->getSize()); } for (int i = 0; i < (int) params.size(); i++) { kernel.setArg(index++, params[i].getMemory()); } for (int i = 0; i < (int) arguments.size(); i++) { kernel.setArg(index++, arguments[i].getMemory()); } return kernel; }