/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see . *
* -------------------------------------------------------------------------- */
#ifdef WIN32
#define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include
#include "OpenCLContext.h"
#include "OpenCLArray.h"
#include "OpenCLBondedUtilities.h"
#include "OpenCLForceInfo.h"
#include "OpenCLIntegrationUtilities.h"
#include "OpenCLKernelSources.h"
#include "OpenCLNonbondedUtilities.h"
#include "hilbert.h"
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
#include
#include
#include
#include
#include
using namespace OpenMM;
using namespace std;
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#endif
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#endif
const int OpenCLContext::ThreadBlockSize = 64;
const int OpenCLContext::TileSize = 32;
static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_info, size_t cb, void* user_data) {
string skip = "OpenCL Build Warning : Compiler build log:";
if (strncmp(errinfo, skip.c_str(), skip.length()) == 0)
return; // OS X Lion insists on calling this for every build warning, even though they aren't errors.
std::cerr << "OpenCL internal error: " << errinfo << std::endl;
}
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), atomsWereReordered(false), posq(NULL),
posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
if (precision == "single") {
useDoublePrecision = false;
useMixedPrecision = false;
}
else if (precision == "mixed") {
useDoublePrecision = false;
useMixedPrecision = true;
}
else if (precision == "double") {
useDoublePrecision = true;
useMixedPrecision = false;
}
else
throw OpenMMException("Illegal value for OpenCLPrecision: "+precision);
try {
contextIndex = platformData.contexts.size();
std::vector platforms;
cl::Platform::get(&platforms);
if (platformIndex < 0 || platformIndex >= (int) platforms.size())
throw OpenMMException("Illegal value for OpenCL platform index");
string platformVendor = platforms[platformIndex].getInfo();
vector devices;
platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
const int minThreadBlockSize = 32;
if (deviceIndex < 0 || deviceIndex >= (int) devices.size()) {
// Try to figure out which device is the fastest.
int bestSpeed = -1;
for (int i = 0; i < (int) devices.size(); i++) {
if (platformVendor == "Apple" && devices[i].getInfo() == "AMD")
continue; // Don't use AMD GPUs on OS X due to serious bugs.
int maxSize = devices[i].getInfo()[0];
int processingElementsPerComputeUnit = 8;
if (devices[i].getInfo() != CL_DEVICE_TYPE_GPU) {
processingElementsPerComputeUnit = 1;
}
else if (devices[i].getInfo().find("cl_nv_device_attribute_query") != string::npos) {
cl_uint computeCapabilityMajor;
clGetDeviceInfo(devices[i](), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
processingElementsPerComputeUnit = (computeCapabilityMajor < 2 ? 8 : 32);
}
else if (devices[i].getInfo().find("cl_amd_device_attribute_query") != string::npos) {
// This attribute does not ensure that all queries are supported by the runtime (it may be an older runtime,
// or the CPU device) so still have to check for errors.
try {
#ifdef CL_DEVICE_SIMD_WIDTH_AMD
processingElementsPerComputeUnit =
// AMD GPUs either have a single VLIW SIMD or multiple scalar SIMDs.
// The SIMD width is the number of threads the SIMD executes per cycle.
// This will be less than the wavefront width since it takes several
// cycles to execute the full wavefront.
// The SIMD instruction width is the VLIW instruction width (or 1 for scalar),
// this is the number of ALUs that can be executing per instruction per thread.
devices[i].getInfo() *
devices[i].getInfo() *
devices[i].getInfo();
// Just in case any of the queries return 0.
if (processingElementsPerComputeUnit <= 0)
processingElementsPerComputeUnit = 1;
#endif
}
catch (cl::Error err) {
// Runtime does not support the queries so use default.
}
}
int speed = devices[i].getInfo()*processingElementsPerComputeUnit*devices[i].getInfo();
if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
deviceIndex = i;
bestSpeed = speed;
}
}
}
if (deviceIndex == -1)
throw OpenMMException("No compatible OpenCL device is available");
device = devices[deviceIndex];
this->deviceIndex = deviceIndex;
if (device.getInfo() < minThreadBlockSize)
throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
defaultOptimizationOptions = "";
else
defaultOptimizationOptions = "-cl-fast-relaxed-math";
supports64BitGlobalAtomics = (device.getInfo().find("cl_khr_int64_base_atomics") != string::npos);
supportsDoublePrecision = (device.getInfo().find("cl_khr_fp64") != string::npos);
if ((useDoublePrecision || useMixedPrecision) && !supportsDoublePrecision)
throw OpenMMException("This device does not support double precision");
string vendor = device.getInfo();
int numThreadBlocksPerComputeUnit = 6;
if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
compilationDefines["WARPS_ARE_ATOMIC"] = "";
simdWidth = 32;
if (device.getInfo().find("cl_nv_device_attribute_query") != string::npos) {
// Compute level 1.2 and later Nvidia GPUs support 64 bit atomics, even though they don't list the
// proper extension as supported. We only use them on compute level 2.0 or later, since they're very
// slow on earlier GPUs.
cl_uint computeCapabilityMajor;
clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
if (computeCapabilityMajor > 1)
supports64BitGlobalAtomics = true;
}
}
else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
// Disable 64 bit atomics. A future version of the driver will support them, but until we can test that,
// it's safest not to use them.
supports64BitGlobalAtomics = false;
if (device.getInfo() != CL_DEVICE_TYPE_GPU) {
/// \todo Is 6 a good value for the OpenCL CPU device?
// numThreadBlocksPerComputeUnit = ?;
simdWidth = 1;
}
else {
bool amdPostSdk2_4 = false;
// Default to 1 which will use the default kernels.
simdWidth = 1;
if (device.getInfo().find("cl_amd_device_attribute_query") != string::npos) {
// This attribute does not ensure that all queries are supported by the runtime so still have to
// check for errors.
try {
#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
// AMD has both 32 and 64 width SIMDs. Can determine by using:
// simdWidth = device.getInfo();
// Must catch cl:Error as will fail if runtime does not support queries.
// However, the 32 width NVIDIA kernels do not have all the necessary
// barriers and so will not work for AMD.
// So for now leave default of 1 which will use the default kernels.
cl_uint simdPerComputeUnit = device.getInfo();
// If the GPU has multiple SIMDs per compute unit then it is uses the scalar instruction
// set instead of the VLIW instruction set. It therefore needs more thread blocks per
// compute unit to hide memory latency.
if (simdPerComputeUnit > 1)
numThreadBlocksPerComputeUnit = 4 * simdPerComputeUnit;
// If the queries are supported then must be newer than SDK 2.4.
amdPostSdk2_4 = true;
#endif
}
catch (cl::Error err) {
// Runtime does not support the query so is unlikely to be the newer scalar GPU.
// Stay with the default simdWidth and numThreadBlocksPerComputeUnit.
}
}
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around. This is fixed after SDK 2.4.
if (!amdPostSdk2_4)
compilationDefines["AMD_ATOMIC_WORK_AROUND"] = "";
}
}
else
simdWidth = 1;
if (platformVendor == "Apple" && vendor == "AMD")
compilationDefines["MAC_AMD_WORKAROUND"] = "";
if (supports64BitGlobalAtomics)
compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
if (supportsDoublePrecision)
compilationDefines["SUPPORTS_DOUBLE_PRECISION"] = "";
vector contextDevices;
contextDevices.push_back(device);
cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[platformIndex](), 0};
context = cl::Context(contextDevices, cprops, errorCallback);
queue = cl::CommandQueue(context, device);
numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
numThreadBlocks = numThreadBlocksPerComputeUnit*device.getInfo();
bonded = new OpenCLBondedUtilities(*this);
nonbonded = new OpenCLNonbondedUtilities(*this);
if (useDoublePrecision) {
posq = OpenCLArray::create(*this, paddedNumAtoms, "posq");
velm = OpenCLArray::create(*this, paddedNumAtoms, "velm");
compilationDefines["USE_DOUBLE_PRECISION"] = "1";
compilationDefines["convert_real4"] = "convert_double4";
compilationDefines["convert_mixed4"] = "convert_double4";
}
else if (useMixedPrecision) {
posq = OpenCLArray::create(*this, paddedNumAtoms, "posq");
posqCorrection = OpenCLArray::create(*this, paddedNumAtoms, "posq");
velm = OpenCLArray::create(*this, paddedNumAtoms, "velm");
compilationDefines["USE_MIXED_PRECISION"] = "1";
compilationDefines["convert_real4"] = "convert_float4";
compilationDefines["convert_mixed4"] = "convert_double4";
}
else {
posq = OpenCLArray::create(*this, paddedNumAtoms, "posq");
velm = OpenCLArray::create(*this, paddedNumAtoms, "velm");
compilationDefines["convert_real4"] = "convert_float4";
compilationDefines["convert_mixed4"] = "convert_float4";
}
posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
}
catch (cl::Error err) {
std::stringstream str;
str<<"Error initializing context: "< values(valuesArray.getSize());
float nextValue = 1e-4f;
for (int i = 0; i < (int) values.size(); ++i) {
values[i].s0 = nextValue;
nextValue *= (float) M_PI;
}
valuesArray.upload(values);
accuracyKernel.setArg(0, valuesArray.getDeviceBuffer());
accuracyKernel.setArg(1, values.size());
executeKernel(accuracyKernel, values.size());
valuesArray.download(values);
double maxSqrtError = 0.0, maxRsqrtError = 0.0, maxRecipError = 0.0, maxExpError = 0.0, maxLogError = 0.0;
for (int i = 0; i < (int) values.size(); ++i) {
double v = values[i].s0;
double correctSqrt = sqrt(v);
maxSqrtError = max(maxSqrtError, fabs(correctSqrt-values[i].s1)/correctSqrt);
maxRsqrtError = max(maxRsqrtError, fabs(1.0/correctSqrt-values[i].s2)*correctSqrt);
maxRecipError = max(maxRecipError, fabs(1.0/v-values[i].s3)/values[i].s3);
maxExpError = max(maxExpError, fabs(exp(v)-values[i].s4)/values[i].s4);
maxLogError = max(maxLogError, fabs(log(v)-values[i].s5)/values[i].s5);
}
compilationDefines["SQRT"] = (maxSqrtError < 1e-6) ? "native_sqrt" : "sqrt";
compilationDefines["RSQRT"] = (maxRsqrtError < 1e-6) ? "native_rsqrt" : "rsqrt";
compilationDefines["RECIP"] = (maxRecipError < 1e-6) ? "native_recip" : "1.0f/";
compilationDefines["EXP"] = (maxExpError < 1e-6) ? "native_exp" : "exp";
compilationDefines["LOG"] = (maxLogError < 1e-6) ? "native_log" : "log";
}
else {
compilationDefines["SQRT"] = "sqrt";
compilationDefines["RSQRT"] = "rsqrt";
compilationDefines["RECIP"] = "1.0/";
compilationDefines["EXP"] = "exp";
compilationDefines["LOG"] = "log";
}
// Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread();
// Create utilities objects.
integration = new OpenCLIntegrationUtilities(*this, system);
expression = new OpenCLExpressionUtilities(*this);
}
OpenCLContext::~OpenCLContext() {
for (int i = 0; i < (int) forces.size(); i++)
delete forces[i];
for (int i = 0; i < (int) reorderListeners.size(); i++)
delete reorderListeners[i];
if (pinnedBuffer != NULL)
delete pinnedBuffer;
if (posq != NULL)
delete posq;
if (posqCorrection != NULL)
delete posqCorrection;
if (velm != NULL)
delete velm;
if (force != NULL)
delete force;
if (forceBuffers != NULL)
delete forceBuffers;
if (longForceBuffer != NULL)
delete longForceBuffer;
if (energyBuffer != NULL)
delete energyBuffer;
if (atomIndexDevice != NULL)
delete atomIndexDevice;
if (integration != NULL)
delete integration;
if (expression != NULL)
delete expression;
if (bonded != NULL)
delete bonded;
if (nonbonded != NULL)
delete nonbonded;
if (thread != NULL)
delete thread;
}
void OpenCLContext::initialize() {
bonded->initialize(system);
numForceBuffers = platformData.contexts.size();
numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
for (int i = 0; i < (int) forces.size(); i++)
numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
if (useDoublePrecision) {
forceBuffers = OpenCLArray::create(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
}
else {
forceBuffers = OpenCLArray::create(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer");
}
if (supports64BitGlobalAtomics) {
longForceBuffer = OpenCLArray::create(*this, 3*paddedNumAtoms, "longForceBuffer");
reduceForcesKernel.setArg(0, longForceBuffer->getDeviceBuffer());
reduceForcesKernel.setArg(1, forceBuffers->getDeviceBuffer());
reduceForcesKernel.setArg(2, paddedNumAtoms);
reduceForcesKernel.setArg(3, numForceBuffers);
addAutoclearBuffer(*longForceBuffer);
}
addAutoclearBuffer(*forceBuffers);
addAutoclearBuffer(*energyBuffer);
int bufferBytes = max(velm->getSize()*velm->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize());
pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedMemory = queue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i);
if (useDoublePrecision || useMixedPrecision)
((mm_double4*) pinnedMemory)[i] = mm_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
else
((mm_float4*) pinnedMemory)[i] = mm_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (cl_float) (1.0/mass));
}
velm->upload(pinnedMemory);
atomIndexDevice = OpenCLArray::create(*this, paddedNumAtoms, "atomIndexDevice");
atomIndex.resize(paddedNumAtoms);
for (int i = 0; i < paddedNumAtoms; ++i)
atomIndex[i] = i;
atomIndexDevice->upload(atomIndex);
findMoleculeGroups();
moleculesInvalid = false;
nonbonded->initialize(system);
}
void OpenCLContext::addForce(OpenCLForceInfo* force) {
forces.push_back(force);
}
string OpenCLContext::replaceStrings(const string& input, const std::map& replacements) const {
string result = input;
for (map::const_iterator iter = replacements.begin(); iter != replacements.end(); iter++) {
int index = -1;
do {
index = result.find(iter->first);
if (index != result.npos)
result.replace(index, iter->first.size(), iter->second);
} while (index != result.npos);
}
return result;
}
cl::Program OpenCLContext::createProgram(const string source, const char* optimizationFlags) {
return createProgram(source, map(), optimizationFlags);
}
cl::Program OpenCLContext::createProgram(const string source, const map& defines, const char* optimizationFlags) {
string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
stringstream src;
if (!options.empty())
src << "// Compilation Options: " << options << endl << endl;
for (map::const_iterator iter = compilationDefines.begin(); iter != compilationDefines.end(); ++iter) {
src << "#define " << iter->first;
if (!iter->second.empty())
src << " " << iter->second;
src << endl;
}
if (!compilationDefines.empty())
src << endl;
if (supportsDoublePrecision)
src << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
if (useDoublePrecision) {
src << "typedef double real;\n";
src << "typedef double2 real2;\n";
src << "typedef double4 real4;\n";
}
else {
src << "typedef float real;\n";
src << "typedef float2 real2;\n";
src << "typedef float4 real4;\n";
}
if (useDoublePrecision || useMixedPrecision) {
src << "typedef double mixed;\n";
src << "typedef double2 mixed2;\n";
src << "typedef double4 mixed4;\n";
}
else {
src << "typedef float mixed;\n";
src << "typedef float2 mixed2;\n";
src << "typedef float4 mixed4;\n";
}
for (map::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
src << "#define " << iter->first;
if (!iter->second.empty())
src << " " << iter->second;
src << endl;
}
if (!defines.empty())
src << endl;
src << source << endl;
// Get length before using c_str() to avoid length() call invalidating the c_str() value.
string src_string = src.str();
::size_t src_length = src_string.length();
cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
cl::Program program(context, sources);
try {
program.build(vector(1, device), options.c_str());
} catch (cl::Error err) {
throw OpenMMException("Error compiling kernel: "+program.getBuildInfo(device));
}
return program;
}
string OpenCLContext::doubleToString(double value) {
stringstream s;
s.precision(useDoublePrecision ? 16 : 8);
s << scientific << value;
if (!useDoublePrecision)
s << "f";
return s.str();
}
string OpenCLContext::intToString(int value) {
stringstream s;
s << value;
return s.str();
}
void OpenCLContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
if (blockSize == -1)
blockSize = ThreadBlockSize;
int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
try {
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
}
catch (cl::Error err) {
stringstream str;
str<<"Error invoking kernel "<()<<": "<(0, memory);
clearBufferKernel.setArg(1, words);
executeKernel(clearBufferKernel, words, 128);
}
void OpenCLContext::addAutoclearBuffer(OpenCLArray& array) {
addAutoclearBuffer(array.getDeviceBuffer(), array.getSize()*array.getElementSize());
}
void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) {
autoclearBuffers.push_back(&memory);
autoclearBufferSizes.push_back(size/4);
}
void OpenCLContext::clearAutoclearBuffers() {
int base = 0;
int total = autoclearBufferSizes.size();
while (total-base >= 6) {
clearSixBuffersKernel.setArg(0, *autoclearBuffers[base]);
clearSixBuffersKernel.setArg(1, autoclearBufferSizes[base]);
clearSixBuffersKernel.setArg(2, *autoclearBuffers[base+1]);
clearSixBuffersKernel.setArg(3, autoclearBufferSizes[base+1]);
clearSixBuffersKernel.setArg(4, *autoclearBuffers[base+2]);
clearSixBuffersKernel.setArg(5, autoclearBufferSizes[base+2]);
clearSixBuffersKernel.setArg(6, *autoclearBuffers[base+3]);
clearSixBuffersKernel.setArg(7, autoclearBufferSizes[base+3]);
clearSixBuffersKernel.setArg(8, *autoclearBuffers[base+4]);
clearSixBuffersKernel.setArg(9, autoclearBufferSizes[base+4]);
clearSixBuffersKernel.setArg(10, *autoclearBuffers[base+5]);
clearSixBuffersKernel.setArg(11, autoclearBufferSizes[base+5]);
executeKernel(clearSixBuffersKernel, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
base += 6;
}
if (total-base == 5) {
clearFiveBuffersKernel.setArg(0, *autoclearBuffers[base]);
clearFiveBuffersKernel.setArg(1, autoclearBufferSizes[base]);
clearFiveBuffersKernel.setArg(2, *autoclearBuffers[base+1]);
clearFiveBuffersKernel.setArg(3, autoclearBufferSizes[base+1]);
clearFiveBuffersKernel.setArg(4, *autoclearBuffers[base+2]);
clearFiveBuffersKernel.setArg(5, autoclearBufferSizes[base+2]);
clearFiveBuffersKernel.setArg(6, *autoclearBuffers[base+3]);
clearFiveBuffersKernel.setArg(7, autoclearBufferSizes[base+3]);
clearFiveBuffersKernel.setArg(8, *autoclearBuffers[base+4]);
clearFiveBuffersKernel.setArg(9, autoclearBufferSizes[base+4]);
executeKernel(clearFiveBuffersKernel, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
}
else if (total-base == 4) {
clearFourBuffersKernel.setArg(0, *autoclearBuffers[base]);
clearFourBuffersKernel.setArg(1, autoclearBufferSizes[base]);
clearFourBuffersKernel.setArg(2, *autoclearBuffers[base+1]);
clearFourBuffersKernel.setArg(3, autoclearBufferSizes[base+1]);
clearFourBuffersKernel.setArg(4, *autoclearBuffers[base+2]);
clearFourBuffersKernel.setArg(5, autoclearBufferSizes[base+2]);
clearFourBuffersKernel.setArg(6, *autoclearBuffers[base+3]);
clearFourBuffersKernel.setArg(7, autoclearBufferSizes[base+3]);
executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
}
else if (total-base == 3) {
clearThreeBuffersKernel.setArg(0, *autoclearBuffers[base]);
clearThreeBuffersKernel.setArg(1, autoclearBufferSizes[base]);
clearThreeBuffersKernel.setArg(2, *autoclearBuffers[base+1]);
clearThreeBuffersKernel.setArg(3, autoclearBufferSizes[base+1]);
clearThreeBuffersKernel.setArg(4, *autoclearBuffers[base+2]);
clearThreeBuffersKernel.setArg(5, autoclearBufferSizes[base+2]);
executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
}
else if (total-base == 2) {
clearTwoBuffersKernel.setArg(0, *autoclearBuffers[base]);
clearTwoBuffersKernel.setArg(1, autoclearBufferSizes[base]);
clearTwoBuffersKernel.setArg(2, *autoclearBuffers[base+1]);
clearTwoBuffersKernel.setArg(3, autoclearBufferSizes[base+1]);
executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
}
else if (total-base == 1) {
clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
}
}
void OpenCLContext::reduceForces() {
if (supports64BitGlobalAtomics)
executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
else
reduceBuffer(*forceBuffers, numForceBuffers);
}
void OpenCLContext::reduceBuffer(OpenCLArray& array, int numBuffers) {
int bufferSize = array.getSize()/numBuffers;
reduceReal4Kernel.setArg(0, array.getDeviceBuffer());
reduceReal4Kernel.setArg(1, bufferSize);
reduceReal4Kernel.setArg(2, numBuffers);
executeKernel(reduceReal4Kernel, bufferSize, 128);
}
void OpenCLContext::tagAtomsInMolecule(int atom, int molecule, vector& atomMolecule, vector >& atomBonds) {
// Recursively tag atoms as belonging to a particular molecule.
atomMolecule[atom] = molecule;
for (int i = 0; i < (int) atomBonds[atom].size(); i++)
if (atomMolecule[atomBonds[atom][i]] == -1)
tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
}
/**
* This class ensures that atom reordering doesn't break virtual sites.
*/
class OpenCLContext::VirtualSiteInfo : public OpenCLForceInfo {
public:
VirtualSiteInfo(const System& system) : OpenCLForceInfo(0) {
for (int i = 0; i < system.getNumParticles(); i++) {
if (system.isVirtualSite(i)) {
siteTypes.push_back(&typeid(system.getVirtualSite(i)));
vector particles;
particles.push_back(i);
for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
particles.push_back(system.getVirtualSite(i).getParticle(j));
siteParticles.push_back(particles);
vector weights;
if (dynamic_cast(&system.getVirtualSite(i)) != NULL) {
// A two particle average.
const TwoParticleAverageSite& site = dynamic_cast(system.getVirtualSite(i));
weights.push_back(site.getWeight(0));
weights.push_back(site.getWeight(1));
}
else if (dynamic_cast(&system.getVirtualSite(i)) != NULL) {
// A three particle average.
const ThreeParticleAverageSite& site = dynamic_cast(system.getVirtualSite(i));
weights.push_back(site.getWeight(0));
weights.push_back(site.getWeight(1));
weights.push_back(site.getWeight(2));
}
else if (dynamic_cast(&system.getVirtualSite(i)) != NULL) {
// An out of plane site.
const OutOfPlaneSite& site = dynamic_cast(system.getVirtualSite(i));
weights.push_back(site.getWeight12());
weights.push_back(site.getWeight13());
weights.push_back(site.getWeightCross());
}
siteWeights.push_back(weights);
}
}
}
int getNumParticleGroups() {
return siteTypes.size();
}
void getParticlesInGroup(int index, std::vector& particles) {
particles = siteParticles[index];
}
bool areGroupsIdentical(int group1, int group2) {
if (siteTypes[group1] != siteTypes[group2])
return false;
int numParticles = siteWeights[group1].size();
if (siteWeights[group2].size() != numParticles)
return false;
for (int i = 0; i < numParticles; i++)
if (siteWeights[group1][i] != siteWeights[group2][i])
return false;
return true;
}
private:
vector siteTypes;
vector > siteParticles;
vector > siteWeights;
};
void OpenCLContext::findMoleculeGroups() {
// The first time this is called, we need to identify all the molecules in the system.
if (moleculeGroups.size() == 0) {
// Add a ForceInfo that makes sure reordering doesn't break virtual sites.
addForce(new VirtualSiteInfo(system));
// First make a list of every other atom to which each atom is connect by a constraint or force group.
vector > atomBonds(system.getNumParticles());
for (int i = 0; i < system.getNumConstraints(); i++) {
int particle1, particle2;
double distance;
system.getConstraintParameters(i, particle1, particle2, distance);
atomBonds[particle1].push_back(particle2);
atomBonds[particle2].push_back(particle1);
}
for (int i = 0; i < (int) forces.size(); i++) {
for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
vector particles;
forces[i]->getParticlesInGroup(j, particles);
for (int k = 0; k < (int) particles.size(); k++)
for (int m = 0; m < (int) particles.size(); m++)
if (k != m)
atomBonds[particles[k]].push_back(particles[m]);
}
}
// Now tag atoms by which molecule they belong to.
vector atomMolecule(numAtoms, -1);
int numMolecules = 0;
for (int i = 0; i < numAtoms; i++)
if (atomMolecule[i] == -1)
tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
vector > atomIndices(numMolecules);
for (int i = 0; i < numAtoms; i++)
atomIndices[atomMolecule[i]].push_back(i);
// Construct a description of each molecule.
molecules.resize(numMolecules);
for (int i = 0; i < numMolecules; i++) {
molecules[i].atoms = atomIndices[i];
molecules[i].groups.resize(forces.size());
}
for (int i = 0; i < system.getNumConstraints(); i++) {
int particle1, particle2;
double distance;
system.getConstraintParameters(i, particle1, particle2, distance);
molecules[atomMolecule[particle1]].constraints.push_back(i);
}
for (int i = 0; i < (int) forces.size(); i++)
for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
vector particles;
forces[i]->getParticlesInGroup(j, particles);
molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
}
}
// Sort them into groups of identical molecules.
vector uniqueMolecules;
vector > moleculeInstances;
vector > moleculeOffsets;
for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
Molecule& mol = molecules[molIndex];
// See if it is identical to another molecule.
bool isNew = true;
for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
Molecule& mol2 = uniqueMolecules[j];
bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
// See if the atoms are identical.
int atomOffset = mol2.atoms[0]-mol.atoms[0];
for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
identical = false;
for (int k = 0; k < (int) forces.size(); k++)
if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
identical = false;
}
// See if the constraints are identical.
for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
int c1particle1, c1particle2, c2particle1, c2particle2;
double distance1, distance2;
system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
identical = false;
}
// See if the force groups are identical.
for (int i = 0; i < (int) forces.size() && identical; i++) {
if (mol.groups[i].size() != mol2.groups[i].size())
identical = false;
for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
identical = false;
}
if (identical) {
moleculeInstances[j].push_back(molIndex);
moleculeOffsets[j].push_back(mol.atoms[0]);
isNew = false;
}
}
if (isNew) {
uniqueMolecules.push_back(mol);
moleculeInstances.push_back(vector());
moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
moleculeOffsets.push_back(vector());
moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
}
}
moleculeGroups.resize(moleculeInstances.size());
for (int i = 0; i < (int) moleculeInstances.size(); i++)
{
moleculeGroups[i].instances = moleculeInstances[i];
moleculeGroups[i].offsets = moleculeOffsets[i];
vector& atoms = uniqueMolecules[i].atoms;
moleculeGroups[i].atoms.resize(atoms.size());
for (int j = 0; j < (int) atoms.size(); j++)
moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
}
}
void OpenCLContext::invalidateMolecules() {
moleculesInvalid = true;
}
void OpenCLContext::validateMolecules() {
moleculesInvalid = false;
if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
return;
bool valid = true;
for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
MoleculeGroup& mol = moleculeGroups[group];
vector& instances = mol.instances;
vector& offsets = mol.offsets;
vector& atoms = mol.atoms;
int numMolecules = instances.size();
Molecule& m1 = molecules[instances[0]];
int offset1 = offsets[0];
for (int j = 1; valid && j < numMolecules; j++) {
// See if the atoms are identical.
Molecule& m2 = molecules[instances[j]];
int offset2 = offsets[j];
for (int i = 0; i < (int) atoms.size() && valid; i++) {
for (int k = 0; k < (int) forces.size(); k++)
if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
valid = false;
}
// See if the force groups are identical.
for (int i = 0; i < (int) forces.size() && valid; i++) {
for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
valid = false;
}
}
}
if (valid)
return;
// The list of which molecules are identical is no longer valid. We need to restore the
// atoms to their original order, rebuild the list of identical molecules, and sort them
// again.
vector newCellOffsets(numAtoms);
if (useDoublePrecision) {
vector oldPosq(paddedNumAtoms);
vector newPosq(paddedNumAtoms, mm_double4(0,0,0,0));
vector oldVelm(paddedNumAtoms);
vector newVelm(paddedNumAtoms, mm_double4(0,0,0,0));
posq->download(oldPosq);
velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i];
newPosq[index] = oldPosq[i];
newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i];
}
posq->upload(newPosq);
velm->upload(newVelm);
}
else if (useMixedPrecision) {
vector oldPosq(paddedNumAtoms);
vector newPosq(paddedNumAtoms, mm_float4(0,0,0,0));
vector oldPosqCorrection(paddedNumAtoms);
vector newPosqCorrection(paddedNumAtoms, mm_float4(0,0,0,0));
vector oldVelm(paddedNumAtoms);
vector newVelm(paddedNumAtoms, mm_double4(0,0,0,0));
posq->download(oldPosq);
velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i];
newPosq[index] = oldPosq[i];
newPosqCorrection[index] = oldPosqCorrection[i];
newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i];
}
posq->upload(newPosq);
posqCorrection->upload(newPosqCorrection);
velm->upload(newVelm);
}
else {
vector oldPosq(paddedNumAtoms);
vector newPosq(paddedNumAtoms, mm_float4(0,0,0,0));
vector oldVelm(paddedNumAtoms);
vector newVelm(paddedNumAtoms, mm_float4(0,0,0,0));
posq->download(oldPosq);
velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i];
newPosq[index] = oldPosq[i];
newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i];
}
posq->upload(newPosq);
velm->upload(newVelm);
}
for (int i = 0; i < numAtoms; i++) {
atomIndex[i] = i;
posCellOffsets[i] = newCellOffsets[i];
}
atomIndexDevice->upload(atomIndex);
findMoleculeGroups();
for (int i = 0; i < (int) reorderListeners.size(); i++)
reorderListeners[i]->execute();
}
void OpenCLContext::reorderAtoms(bool enforcePeriodic) {
if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
return;
if (moleculesInvalid)
validateMolecules();
atomsWereReordered = true;
if (useDoublePrecision)
reorderAtomsImpl(enforcePeriodic);
else if (useMixedPrecision)
reorderAtomsImpl(enforcePeriodic);
else
reorderAtomsImpl(enforcePeriodic);
}
template
void OpenCLContext::reorderAtomsImpl(bool enforcePeriodic) {
// Find the range of positions and the number of bins along each axis.
vector oldPosq(paddedNumAtoms);
vector oldPosqCorrection(paddedNumAtoms);
vector oldVelm(paddedNumAtoms);
posq->download(oldPosq);
velm->download(oldVelm);
if (useMixedPrecision)
posqCorrection->download(oldPosqCorrection);
Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
if (nonbonded->getUsePeriodic()) {
minx = miny = minz = 0.0;
maxx = periodicBoxSize.x;
maxy = periodicBoxSize.y;
maxz = periodicBoxSize.z;
}
else {
for (int i = 1; i < numAtoms; i++) {
const Real4& pos = oldPosq[i];
minx = min(minx, pos.x);
maxx = max(maxx, pos.x);
miny = min(miny, pos.y);
maxy = max(maxy, pos.y);
minz = min(minz, pos.z);
maxz = max(maxz, pos.z);
}
}
// Loop over each group of identical molecules and reorder them.
vector originalIndex(numAtoms);
vector newPosq(paddedNumAtoms, Real4(0,0,0,0));
vector newPosqCorrection(paddedNumAtoms, Real4(0,0,0,0));
vector newVelm(paddedNumAtoms, Mixed4(0,0,0,0));
vector newCellOffsets(numAtoms);
for (int group = 0; group < (int) moleculeGroups.size(); group++) {
// Find the center of each molecule.
MoleculeGroup& mol = moleculeGroups[group];
int numMolecules = mol.offsets.size();
vector& atoms = mol.atoms;
vector molPos(numMolecules);
Real invNumAtoms = (Real) (1.0/atoms.size());
for (int i = 0; i < numMolecules; i++) {
molPos[i].x = 0.0f;
molPos[i].y = 0.0f;
molPos[i].z = 0.0f;
for (int j = 0; j < (int)atoms.size(); j++) {
int atom = atoms[j]+mol.offsets[i];
const Real4& pos = oldPosq[atom];
molPos[i].x += pos.x;
molPos[i].y += pos.y;
molPos[i].z += pos.z;
}
molPos[i].x *= invNumAtoms;
molPos[i].y *= invNumAtoms;
molPos[i].z *= invNumAtoms;
}
if (nonbonded->getUsePeriodic()) {
// Move each molecule position into the same box.
for (int i = 0; i < numMolecules; i++) {
int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y);
int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z);
Real dx = xcell*periodicBoxSize.x;
Real dy = ycell*periodicBoxSize.y;
Real dz = zcell*periodicBoxSize.z;
if (dx != 0.0f || dy != 0.0f || dz != 0.0f) {
molPos[i].x -= dx;
molPos[i].y -= dy;
molPos[i].z -= dz;
if (enforcePeriodic) {
for (int j = 0; j < (int) atoms.size(); j++) {
int atom = atoms[j]+mol.offsets[i];
Real4 p = oldPosq[atom];
p.x -= dx;
p.y -= dy;
p.z -= dz;
oldPosq[atom] = p;
posCellOffsets[atom].x -= xcell;
posCellOffsets[atom].y -= ycell;
posCellOffsets[atom].z -= zcell;
}
}
}
}
}
// Select a bin for each molecule, then sort them by bin.
bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
Real binWidth;
if (useHilbert)
binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
else
binWidth = (Real) (0.2*nonbonded->getCutoffDistance());
Real invBinWidth = (Real) (1.0/binWidth);
int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
vector > molBins(numMolecules);
bitmask_t coords[3];
for (int i = 0; i < numMolecules; i++) {
int x = (int) ((molPos[i].x-minx)*invBinWidth);
int y = (int) ((molPos[i].y-miny)*invBinWidth);
int z = (int) ((molPos[i].z-minz)*invBinWidth);
int bin;
if (useHilbert) {
coords[0] = x;
coords[1] = y;
coords[2] = z;
bin = (int) hilbert_c2i(3, 8, coords);
}
else {
int yodd = y&1;
int zodd = z&1;
bin = z*xbins*ybins;
bin += (zodd ? ybins-y : y)*xbins;
bin += (yodd ? xbins-x : x);
}
molBins[i] = pair(bin, i);
}
sort(molBins.begin(), molBins.end());
// Reorder the atoms.
for (int i = 0; i < numMolecules; i++) {
for (int j = 0; j < (int)atoms.size(); j++) {
int oldIndex = mol.offsets[molBins[i].second]+atoms[j];
int newIndex = mol.offsets[i]+atoms[j];
originalIndex[newIndex] = atomIndex[oldIndex];
newPosq[newIndex] = oldPosq[oldIndex];
if (useMixedPrecision)
newPosqCorrection[newIndex] = oldPosqCorrection[oldIndex];
newVelm[newIndex] = oldVelm[oldIndex];
newCellOffsets[newIndex] = posCellOffsets[oldIndex];
}
}
}
// Update the streams.
for (int i = 0; i < numAtoms; i++) {
atomIndex[i] = originalIndex[i];
posCellOffsets[i] = newCellOffsets[i];
}
posq->upload(newPosq);
if (useMixedPrecision)
posqCorrection->upload(newPosqCorrection);
velm->upload(newVelm);
atomIndexDevice->upload(atomIndex);
for (int i = 0; i < (int) reorderListeners.size(); i++)
reorderListeners[i]->execute();
}
void OpenCLContext::addReorderListener(ReorderListener* listener) {
reorderListeners.push_back(listener);
}
struct OpenCLContext::WorkThread::ThreadData {
ThreadData(std::queue& tasks, bool& waiting, bool& finished,
pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :
tasks(tasks), waiting(waiting), finished(finished), queueLock(queueLock),
waitForTaskCondition(waitForTaskCondition), queueEmptyCondition(queueEmptyCondition) {
}
std::queue& tasks;
bool& waiting;
bool& finished;
pthread_mutex_t& queueLock;
pthread_cond_t& waitForTaskCondition;
pthread_cond_t& queueEmptyCondition;
};
static void* threadBody(void* args) {
OpenCLContext::WorkThread::ThreadData& data = *reinterpret_cast(args);
while (!data.finished || data.tasks.size() > 0) {
pthread_mutex_lock(&data.queueLock);
while (data.tasks.empty() && !data.finished) {
data.waiting = true;
pthread_cond_signal(&data.queueEmptyCondition);
pthread_cond_wait(&data.waitForTaskCondition, &data.queueLock);
}
OpenCLContext::WorkTask* task = NULL;
if (!data.tasks.empty()) {
data.waiting = false;
task = data.tasks.front();
data.tasks.pop();
}
pthread_mutex_unlock(&data.queueLock);
if (task != NULL) {
task->execute();
delete task;
}
}
data.waiting = true;
pthread_cond_signal(&data.queueEmptyCondition);
delete &data;
return 0;
}
OpenCLContext::WorkThread::WorkThread() : waiting(true), finished(false) {
pthread_mutex_init(&queueLock, NULL);
pthread_cond_init(&waitForTaskCondition, NULL);
pthread_cond_init(&queueEmptyCondition, NULL);
ThreadData* data = new ThreadData(tasks, waiting, finished, queueLock, waitForTaskCondition, queueEmptyCondition);
pthread_create(&thread, NULL, threadBody, data);
}
OpenCLContext::WorkThread::~WorkThread() {
pthread_mutex_lock(&queueLock);
finished = true;
pthread_cond_broadcast(&waitForTaskCondition);
pthread_mutex_unlock(&queueLock);
pthread_join(thread, NULL);
pthread_mutex_destroy(&queueLock);
pthread_cond_destroy(&waitForTaskCondition);
pthread_cond_destroy(&queueEmptyCondition);
}
void OpenCLContext::WorkThread::addTask(OpenCLContext::WorkTask* task) {
pthread_mutex_lock(&queueLock);
tasks.push(task);
waiting = false;
pthread_cond_signal(&waitForTaskCondition);
pthread_mutex_unlock(&queueLock);
}
bool OpenCLContext::WorkThread::isWaiting() {
return waiting;
}
bool OpenCLContext::WorkThread::isFinished() {
return finished;
}
void OpenCLContext::WorkThread::flush() {
pthread_mutex_lock(&queueLock);
while (!waiting)
pthread_cond_wait(&queueEmptyCondition, &queueLock);
pthread_mutex_unlock(&queueLock);
}