/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2009 Stanford University and the Authors. * * Authors: Peter Eastman * * Contributors: * * * * This program is free software: you can redistribute it and/or modify * * it under the terms of the GNU Lesser General Public License as published * * by the Free Software Foundation, either version 3 of the License, or * * (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU Lesser General Public License for more details. * * * * You should have received a copy of the GNU Lesser General Public License * * along with this program. If not, see . * * -------------------------------------------------------------------------- */ #include "OpenCLContext.h" #include "OpenCLArray.h" #include "OpenCLForceInfo.h" #include "OpenCLIntegrationUtilities.h" #include "OpenCLNonbondedUtilities.h" #include "openmm/Platform.h" #include "openmm/System.h" #include #include #include using namespace OpenMM; using namespace std; OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), stepCount(0) { context = cl::Context(CL_DEVICE_TYPE_ALL); vector devices = context.getInfo(); const int minThreadBlockSize = 32; if (deviceIndex < 0 || deviceIndex >= devices.size()) { // Try to figure out which device is the fastest. int bestSpeed = 0; for (int i = 0; i < devices.size(); i++) { int maxSize = devices[i].getInfo()[0]; int speed = devices[i].getInfo()*devices[i].getInfo(); if (maxSize >= minThreadBlockSize && speed > bestSpeed) deviceIndex = i; } } if (deviceIndex == -1) throw OpenMMException("No compatible OpenCL device is available"); device = devices[deviceIndex]; if (device.getInfo()[0] < minThreadBlockSize) throw OpenMMException("The specified OpenCL device is not compatible with OpenMM"); compilationOptions = "-cl-fast-relaxed-math"; if (device.getInfo() == "NVIDIA") compilationOptions += " -DWARPS_ARE_ATOMIC"; queue = cl::CommandQueue(context, device); numAtoms = numParticles; paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize); numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize; numThreadBlocks = device.getInfo()[0]/ThreadBlockSize; nonbonded = new OpenCLNonbondedUtilities(*this); posq = new OpenCLArray(*this, paddedNumAtoms, "posq", true); velm = new OpenCLArray(*this, paddedNumAtoms, "velm", true); // Create utility kernels that are used in multiple places. utilities = createProgram(loadSourceFromFile("utilities.cl")); clearBufferKernel = cl::Kernel(utilities, "clearBuffer"); reduceFloat4Kernel = cl::Kernel(utilities, "reduceFloat4Buffer"); } OpenCLContext::~OpenCLContext() { for (int i = 0; i < (int) forces.size(); i++) delete forces[i]; if (posq != NULL) delete posq; if (velm != NULL) delete velm; if (force != NULL) delete force; if (forceBuffers != NULL) delete forceBuffers; if (energyBuffer != NULL) delete energyBuffer; if (atomIndex != NULL) delete atomIndex; if (integration != NULL) delete integration; if (nonbonded != NULL) delete nonbonded; } void OpenCLContext::initialize(const System& system) { for (int i = 0; i < numAtoms; i++) (*velm)[i].w = (float) (1.0/system.getParticleMass(i)); velm->upload(); numForceBuffers = 1; for (int i = 0; i < (int) forces.size(); i++) numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers()); forceBuffers = new OpenCLArray(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false); force = new OpenCLArray(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true); energyBuffer = new OpenCLArray(*this, numThreadBlocks*ThreadBlockSize, "energyBuffer", true); atomIndex = new OpenCLArray(*this, paddedNumAtoms, "atomIndex", true); for (int i = 0; i < paddedNumAtoms; ++i) (*atomIndex)[i] = i; atomIndex->upload(); integration = new OpenCLIntegrationUtilities(*this, system); nonbonded->initialize(system); } void OpenCLContext::addForce(OpenCLForceInfo* force) { forces.push_back(force); } string OpenCLContext::loadSourceFromFile(const string& filename) const { return loadSourceFromFile(filename, map()); } string OpenCLContext::loadSourceFromFile(const string& filename, const std::map& replacements) const { ifstream file((Platform::getDefaultPluginsDirectory()+"/opencl/"+filename).c_str()); if (!file.is_open()) throw OpenMMException("Unable to load kernel: "+filename); string kernel; string line; while (!file.eof()) { getline(file, line); kernel += line; kernel += '\n'; } file.close(); for (map::const_iterator iter = replacements.begin(); iter != replacements.end(); iter++) { int index = -1; do { index = kernel.find(iter->first); if (index != kernel.npos) kernel.replace(index, iter->first.size(), iter->second); } while (index != kernel.npos); } return kernel; } cl::Program OpenCLContext::createProgram(const string source) { return createProgram(source, map()); } cl::Program OpenCLContext::createProgram(const string source, const map& defines) { cl::Program::Sources sources(1, make_pair(source.c_str(), source.size())); cl::Program program(context, sources); string options = compilationOptions; for (map::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) options += " -D"+iter->first+"="+iter->second; try { program.build(vector(1, device), options.c_str()); } catch (cl::Error err) { throw OpenMMException("Error compiling kernel: "+program.getBuildInfo(device)); } return program; } void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int workUnitSize) { if (workUnitSize == -1) workUnitSize = ThreadBlockSize; int size = std::min((workUnits+workUnitSize-1)/workUnitSize, numThreadBlocks)*workUnitSize; try { queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(workUnitSize)); } catch (cl::Error err) { stringstream str; str<<"Error invoking kernel "<()<<": "<& array) { clearBufferKernel.setArg(0, array.getDeviceBuffer()); clearBufferKernel.setArg(1, array.getSize()); executeKernel(clearBufferKernel, array.getSize()); } void OpenCLContext::clearBuffer(OpenCLArray& array) { clearBufferKernel.setArg(0, array.getDeviceBuffer()); clearBufferKernel.setArg(1, array.getSize()*4); executeKernel(clearBufferKernel, array.getSize()*4); } void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) { int bufferSize = array.getSize()/numBuffers; reduceFloat4Kernel.setArg(0, array.getDeviceBuffer()); reduceFloat4Kernel.setArg(1, bufferSize); reduceFloat4Kernel.setArg(2, numBuffers); executeKernel(reduceFloat4Kernel, bufferSize); }