/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see . *
* -------------------------------------------------------------------------- */
#ifdef WIN32
#define _USE_MATH_DEFINES // Needed to get M_PI
#endif
#include
#include "CudaContext.h"
#include "CudaArray.h"
#include "CudaBondedUtilities.h"
#include "CudaForceInfo.h"
#include "CudaIntegrationUtilities.h"
#include "CudaKernels.h"
#include "CudaKernelSources.h"
#include "CudaNonbondedUtilities.h"
#include "SHA1.h"
#include "hilbert.h"
#include "openmm/OpenMMException.h"
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
#include "CudaExpressionUtilities.h"
#include "openmm/internal/ContextImpl.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#ifndef WIN32
#include
#endif
#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
#define CHECK_RESULT2(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<compiler = "\""+compiler+"\"";
if (platformData.context != NULL) {
try {
compilerKernel = platformData.context->getPlatform().createKernel(CudaCompilerKernel::Name(), *platformData.context);
hasCompilerKernel = true;
}
catch (...) {
// The runtime compiler plugin isn't available.
}
}
if (hostCompiler.size() > 0)
this->compiler = compiler+" --compiler-bindir "+hostCompiler;
if (!hasInitializedCuda) {
CHECK_RESULT2(cuInit(0), "Error initializing CUDA");
hasInitializedCuda = true;
}
if (precision == "single") {
useDoublePrecision = false;
useMixedPrecision = false;
}
else if (precision == "mixed") {
useDoublePrecision = false;
useMixedPrecision = true;
}
else if (precision == "double") {
useDoublePrecision = true;
useMixedPrecision = false;
}
else
throw OpenMMException("Illegal value for CudaPrecision: "+precision);
char* cacheVariable = getenv("OPENMM_CACHE_DIR");
cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
#ifdef WIN32
this->tempDir = tempDir+"\\";
cacheDir = cacheDir+"\\";
#else
this->tempDir = tempDir+"/";
cacheDir = cacheDir+"/";
#endif
contextIndex = platformData.contexts.size();
int numDevices;
string errorMessage = "Error initializing Context";
CHECK_RESULT(cuDeviceGetCount(&numDevices));
if (deviceIndex < -1 || deviceIndex >= numDevices)
throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
if (deviceIndex == -1) {
// Try to figure out which device is the fastest.
int bestSpeed = -1;
int bestCompute = -1;
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&device, i));
int major, minor, clock, multiprocessors;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
if (major == 1 && minor < 2)
continue; // 1.0 and 1.1 are not supported
CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
int speed = clock*multiprocessors;
if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) {
deviceIndex = i;
bestSpeed = speed;
bestCompute = major;
}
}
}
if (deviceIndex == -1)
throw OpenMMException("No compatible CUDA device is available");
CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
this->deviceIndex = deviceIndex;
int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
// This is a workaround to support GTX 980 with CUDA 6.5. It reports its compute capability
// as 5.2, but the compiler doesn't support anything beyond 5.0. We can remove this once
// CUDA 7.0 is released.
if (major == 5)
minor = 0;
gpuArchitecture = intToString(major)+intToString(minor);
computeCapability = major+0.1*minor;
if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
throw OpenMMException("This device does not support double precision");
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
CHECK_RESULT(cuCtxCreate(&context, flags, device));
contextIsValid = true;
CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
if (contextIndex > 0) {
int canAccess;
cuDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice());
if (canAccess) {
platformData.contexts[0]->setAsCurrent();
CHECK_RESULT(cuCtxEnablePeerAccess(getContext(), 0));
setAsCurrent();
CHECK_RESULT(cuCtxEnablePeerAccess(platformData.contexts[0]->getContext(), 0));
}
}
numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
int numThreadBlocksPerComputeUnit = 6;
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
if (useDoublePrecision) {
posq = CudaArray::create(*this, paddedNumAtoms, "posq");
velm = CudaArray::create(*this, paddedNumAtoms, "velm");
compilationDefines["USE_DOUBLE_PRECISION"] = "1";
compilationDefines["make_real2"] = "make_double2";
compilationDefines["make_real3"] = "make_double3";
compilationDefines["make_real4"] = "make_double4";
compilationDefines["make_mixed2"] = "make_double2";
compilationDefines["make_mixed3"] = "make_double3";
compilationDefines["make_mixed4"] = "make_double4";
}
else if (useMixedPrecision) {
posq = CudaArray::create(*this, paddedNumAtoms, "posq");
posqCorrection = CudaArray::create(*this, paddedNumAtoms, "posqCorrection");
velm = CudaArray::create(*this, paddedNumAtoms, "velm");
compilationDefines["USE_MIXED_PRECISION"] = "1";
compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4";
compilationDefines["make_mixed2"] = "make_double2";
compilationDefines["make_mixed3"] = "make_double3";
compilationDefines["make_mixed4"] = "make_double4";
}
else {
posq = CudaArray::create(*this, paddedNumAtoms, "posq");
velm = CudaArray::create(*this, paddedNumAtoms, "velm");
compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4";
compilationDefines["make_mixed2"] = "make_float2";
compilationDefines["make_mixed3"] = "make_float3";
compilationDefines["make_mixed4"] = "make_float4";
}
posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
// Create utility kernels that are used in multiple places.
CUmodule utilities = createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
clearBufferKernel = getKernel(utilities, "clearBuffer");
clearTwoBuffersKernel = getKernel(utilities, "clearTwoBuffers");
clearThreeBuffersKernel = getKernel(utilities, "clearThreeBuffers");
clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
// Set defines based on the requested precision.
compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "sqrtf";
compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "rsqrtf";
compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf";
compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf";
compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
// Set defines for applying periodic boundary conditions.
Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 ||
boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0);
if (boxIsTriclinic) {
compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
"{"
"real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f); \\\n"
"delta.x -= scale3*periodicBoxVecZ.x; \\\n"
"delta.y -= scale3*periodicBoxVecZ.y; \\\n"
"delta.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f); \\\n"
"delta.x -= scale2*periodicBoxVecY.x; \\\n"
"delta.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f); \\\n"
"delta.x -= scale1*periodicBoxVecX.x;}";
compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
"{"
"real scale3 = floor(pos.z*invPeriodicBoxSize.z); \\\n"
"pos.x -= scale3*periodicBoxVecZ.x; \\\n"
"pos.y -= scale3*periodicBoxVecZ.y; \\\n"
"pos.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor(pos.y*invPeriodicBoxSize.y); \\\n"
"pos.x -= scale2*periodicBoxVecY.x; \\\n"
"pos.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor(pos.x*invPeriodicBoxSize.x); \\\n"
"pos.x -= scale1*periodicBoxVecX.x;}";
compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
"{"
"real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f); \\\n"
"pos.x -= scale3*periodicBoxVecZ.x; \\\n"
"pos.y -= scale3*periodicBoxVecZ.y; \\\n"
"pos.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f); \\\n"
"pos.x -= scale2*periodicBoxVecY.x; \\\n"
"pos.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f); \\\n"
"pos.x -= scale1*periodicBoxVecX.x;}";
}
else {
compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
"{"
"delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
"delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
"delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
"{"
"pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; \\\n"
"pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; \\\n"
"pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;}";
compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
"{"
"pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
"pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
"pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
}
// Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread();
// Create utilities objects.
bonded = new CudaBondedUtilities(*this);
nonbonded = new CudaNonbondedUtilities(*this);
integration = new CudaIntegrationUtilities(*this, system);
expression = new CudaExpressionUtilities(*this);
}
CudaContext::~CudaContext() {
setAsCurrent();
for (int i = 0; i < (int) forces.size(); i++)
delete forces[i];
for (int i = 0; i < (int) reorderListeners.size(); i++)
delete reorderListeners[i];
for (int i = 0; i < (int) preComputations.size(); i++)
delete preComputations[i];
for (int i = 0; i < (int) postComputations.size(); i++)
delete postComputations[i];
if (pinnedBuffer != NULL)
cuMemFreeHost(pinnedBuffer);
if (posq != NULL)
delete posq;
if (posqCorrection != NULL)
delete posqCorrection;
if (velm != NULL)
delete velm;
if (force != NULL)
delete force;
if (energyBuffer != NULL)
delete energyBuffer;
if (integration != NULL)
delete integration;
if (expression != NULL)
delete expression;
if (bonded != NULL)
delete bonded;
if (nonbonded != NULL)
delete nonbonded;
if (thread != NULL)
delete thread;
string errorMessage = "Error deleting Context";
if (contextIsValid) {
cuProfilerStop();
CHECK_RESULT(cuCtxDestroy(context));
}
contextIsValid = false;
}
void CudaContext::initialize() {
cuCtxSetCurrent(context);
string errorMessage = "Error initializing Context";
int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) {
energyBuffer = CudaArray::create(*this, numEnergyBuffers, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
}
else if (useMixedPrecision) {
energyBuffer = CudaArray::create(*this, numEnergyBuffers, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
}
else {
energyBuffer = CudaArray::create(*this, numEnergyBuffers, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
}
for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i);
if (useDoublePrecision || useMixedPrecision)
((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
else
((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
}
velm->upload(pinnedBuffer);
bonded->initialize(system);
force = CudaArray::create(*this, paddedNumAtoms*3, "force");
addAutoclearBuffer(force->getDevicePointer(), force->getSize()*force->getElementSize());
addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize()*energyBuffer->getElementSize());
atomIndexDevice = CudaArray::create(*this, paddedNumAtoms, "atomIndex");
atomIndex.resize(paddedNumAtoms);
for (int i = 0; i < paddedNumAtoms; ++i)
atomIndex[i] = i;
atomIndexDevice->upload(atomIndex);
findMoleculeGroups();
nonbonded->initialize(system);
}
void CudaContext::addForce(CudaForceInfo* force) {
forces.push_back(force);
}
void CudaContext::setAsCurrent() {
if (contextIsValid)
cuCtxSetCurrent(context);
}
string CudaContext::replaceStrings(const string& input, const std::map& replacements) const {
static set symbolChars;
if (symbolChars.size() == 0) {
symbolChars.insert('_');
for (char c = 'a'; c <= 'z'; c++)
symbolChars.insert(c);
for (char c = 'A'; c <= 'Z'; c++)
symbolChars.insert(c);
for (char c = '0'; c <= '9'; c++)
symbolChars.insert(c);
}
string result = input;
for (map::const_iterator iter = replacements.begin(); iter != replacements.end(); iter++) {
int index = 0;
int size = iter->first.size();
do {
index = result.find(iter->first, index);
if (index != result.npos) {
if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
// We have found a complete symbol, not part of a longer symbol.
result.replace(index, size, iter->second);
index += iter->second.size();
}
else
index++;
}
} while (index != result.npos);
}
return result;
}
CUmodule CudaContext::createModule(const string source, const char* optimizationFlags) {
return createModule(source, map(), optimizationFlags);
}
#ifdef WIN32
#include
static bool compileInWindows(const string &command) {
// COMSPEC is an env variable pointing to full dir of cmd.exe
// it always defined on pretty much all Windows OSes
string fullcommand = getenv("COMSPEC") + string(" /C ") + command;
STARTUPINFO si;
PROCESS_INFORMATION pi;
ZeroMemory( &si, sizeof(si) );
si.cb = sizeof(si);
ZeroMemory( &pi, sizeof(pi) );
vector args(std::max(1000, (int) fullcommand.size()+1));
strcpy(&args[0], fullcommand.c_str());
si.dwFlags = STARTF_USESHOWWINDOW;
si.wShowWindow = SW_HIDE;
if (!CreateProcess(NULL, &args[0], NULL, NULL, FALSE, 0, NULL, NULL, &si, &pi)) {
return -1;
}
WaitForSingleObject(pi.hProcess, INFINITE);
DWORD exitCode = -1;
if(!GetExitCodeProcess(pi.hProcess, &exitCode)) {
throw(OpenMMException("Could not get nvcc.exe's exit code\n"));
} else {
if(exitCode == 0)
return 0;
else
return -1;
}
}
#endif
CUmodule CudaContext::createModule(const string source, const map& defines, const char* optimizationFlags) {
string bits = intToString(8*sizeof(void*));
string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
stringstream src;
if (!options.empty())
src << "// Compilation Options: " << options << endl << endl;
for (map::const_iterator iter = compilationDefines.begin(); iter != compilationDefines.end(); ++iter) {
src << "#define " << iter->first;
if (!iter->second.empty())
src << " " << iter->second;
src << endl;
}
if (!compilationDefines.empty())
src << endl;
if (useDoublePrecision) {
src << "typedef double real;\n";
src << "typedef double2 real2;\n";
src << "typedef double3 real3;\n";
src << "typedef double4 real4;\n";
}
else {
src << "typedef float real;\n";
src << "typedef float2 real2;\n";
src << "typedef float3 real3;\n";
src << "typedef float4 real4;\n";
}
if (useDoublePrecision || useMixedPrecision) {
src << "typedef double mixed;\n";
src << "typedef double2 mixed2;\n";
src << "typedef double3 mixed3;\n";
src << "typedef double4 mixed4;\n";
}
else {
src << "typedef float mixed;\n";
src << "typedef float2 mixed2;\n";
src << "typedef float3 mixed3;\n";
src << "typedef float4 mixed4;\n";
}
src << "typedef unsigned int tileflags;\n";
for (map::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
src << "#define " << iter->first;
if (!iter->second.empty())
src << " " << iter->second;
src << endl;
}
if (!defines.empty())
src << endl;
src << source << endl;
// See whether we already have PTX for this kernel cached.
CSHA1 sha1;
sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
sha1.Final();
UINT_8 hash[20];
sha1.GetHash(hash);
stringstream cacheFile;
cacheFile << cacheDir;
cacheFile.flags(ios::hex);
for (int i = 0; i < 20; i++)
cacheFile << setw(2) << setfill('0') << (int) hash[i];
cacheFile << '_' << gpuArchitecture << '_' << bits;
CUmodule module;
if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
return module;
// Select names for the various temporary files.
stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
#ifdef WIN32
tempFileName << "_" << GetCurrentProcessId();
#else
tempFileName << "_" << getpid();
#endif
string inputFile = (tempDir+tempFileName.str()+".cu");
string outputFile = (tempDir+tempFileName.str()+".ptx");
string logFile = (tempDir+tempFileName.str()+".log");
int res = 0;
// If the runtime compiler plugin is available, use it.
if (hasCompilerKernel) {
string ptx = compilerKernel.getAs().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
// If possible, write the PTX out to a temporary file so we can cache it for later use.
bool wroteCache = false;
try {
ofstream out(outputFile.c_str());
out << ptx;
out.close();
if (!out.fail())
wroteCache = true;
}
catch (...) {
// Ignore.
}
if (!wroteCache) {
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
return module;
}
}
else {
// Write out the source to a temporary file.
ofstream out(inputFile.c_str());
out << src.str();
out.close();
#ifdef WIN32
#ifdef _DEBUG
string command = compiler+" --ptx -G -g --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
#else
string command = compiler+" --ptx -lineinfo --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o "+outputFile+" "+options+" "+inputFile+" 2> "+logFile;
#endif
int res = compileInWindows(command);
#else
string command = compiler+" --ptx --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
res = std::system(command.c_str());
#endif
}
try {
if (res != 0) {
// Load the error log.
stringstream error;
error << "Error launching CUDA compiler: " << res;
ifstream log(logFile.c_str());
if (log.is_open()) {
string line;
while (!log.eof()) {
getline(log, line);
error << '\n' << line;
}
log.close();
}
throw OpenMMException(error.str());
}
CUresult result = cuModuleLoad(&module, outputFile.c_str());
if (result != CUDA_SUCCESS) {
std::stringstream m;
m<<"Error loading CUDA module: "<= 2.0 && preferShared)
maxShared = 48*1024;
int max = (int) (maxShared/memory);
if (max < 64)
return 32;
int threads = 64;
while (threads+64 < max)
threads += 64;
return threads;
}
void CudaContext::clearBuffer(CudaArray& array) {
clearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
}
void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
int words = size/4;
void* args[] = {&memory, &words};
executeKernel(clearBufferKernel, args, words, 128);
}
void CudaContext::addAutoclearBuffer(CudaArray& array) {
addAutoclearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
}
void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
autoclearBuffers.push_back(memory);
autoclearBufferSizes.push_back(size/4);
}
void CudaContext::clearAutoclearBuffers() {
int base = 0;
int total = autoclearBufferSizes.size();
while (total-base >= 6) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
&autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
&autoclearBuffers[base+4], &autoclearBufferSizes[base+4],
&autoclearBuffers[base+5], &autoclearBufferSizes[base+5]};
executeKernel(clearSixBuffersKernel, args, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), 128);
base += 6;
}
if (total-base == 5) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
&autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
&autoclearBuffers[base+4], &autoclearBufferSizes[base+4]};
executeKernel(clearFiveBuffersKernel, args, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), 128);
}
else if (total-base == 4) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
&autoclearBuffers[base+3], &autoclearBufferSizes[base+3]};
executeKernel(clearFourBuffersKernel, args, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
}
else if (total-base == 3) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2]};
executeKernel(clearThreeBuffersKernel, args, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
}
else if (total-base == 2) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1]};
executeKernel(clearTwoBuffersKernel, args, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
}
else if (total-base == 1) {
clearBuffer(autoclearBuffers[base], autoclearBufferSizes[base]*4);
}
}
/**
* This class ensures that atom reordering doesn't break virtual sites.
*/
class CudaContext::VirtualSiteInfo : public CudaForceInfo {
public:
VirtualSiteInfo(const System& system) {
for (int i = 0; i < system.getNumParticles(); i++) {
if (system.isVirtualSite(i)) {
siteTypes.push_back(&typeid(system.getVirtualSite(i)));
vector particles;
particles.push_back(i);
for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
particles.push_back(system.getVirtualSite(i).getParticle(j));
siteParticles.push_back(particles);
vector weights;
if (dynamic_cast(&system.getVirtualSite(i)) != NULL) {
// A two particle average.
const TwoParticleAverageSite& site = dynamic_cast(system.getVirtualSite(i));
weights.push_back(site.getWeight(0));
weights.push_back(site.getWeight(1));
}
else if (dynamic_cast(&system.getVirtualSite(i)) != NULL) {
// A three particle average.
const ThreeParticleAverageSite& site = dynamic_cast(system.getVirtualSite(i));
weights.push_back(site.getWeight(0));
weights.push_back(site.getWeight(1));
weights.push_back(site.getWeight(2));
}
else if (dynamic_cast(&system.getVirtualSite(i)) != NULL) {
// An out of plane site.
const OutOfPlaneSite& site = dynamic_cast(system.getVirtualSite(i));
weights.push_back(site.getWeight12());
weights.push_back(site.getWeight13());
weights.push_back(site.getWeightCross());
}
siteWeights.push_back(weights);
}
}
}
int getNumParticleGroups() {
return siteTypes.size();
}
void getParticlesInGroup(int index, std::vector& particles) {
particles = siteParticles[index];
}
bool areGroupsIdentical(int group1, int group2) {
if (siteTypes[group1] != siteTypes[group2])
return false;
int numParticles = siteWeights[group1].size();
if (siteWeights[group2].size() != numParticles)
return false;
for (int i = 0; i < numParticles; i++)
if (siteWeights[group1][i] != siteWeights[group2][i])
return false;
return true;
}
private:
vector siteTypes;
vector > siteParticles;
vector > siteWeights;
};
void CudaContext::findMoleculeGroups() {
// The first time this is called, we need to identify all the molecules in the system.
if (moleculeGroups.size() == 0) {
// Add a ForceInfo that makes sure reordering doesn't break virtual sites.
addForce(new VirtualSiteInfo(system));
// First make a list of every other atom to which each atom is connect by a constraint or force group.
vector > atomBonds(system.getNumParticles());
for (int i = 0; i < system.getNumConstraints(); i++) {
int particle1, particle2;
double distance;
system.getConstraintParameters(i, particle1, particle2, distance);
atomBonds[particle1].push_back(particle2);
atomBonds[particle2].push_back(particle1);
}
for (int i = 0; i < (int) forces.size(); i++) {
for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
vector particles;
forces[i]->getParticlesInGroup(j, particles);
for (int k = 0; k < (int) particles.size(); k++)
for (int m = 0; m < (int) particles.size(); m++)
if (k != m)
atomBonds[particles[k]].push_back(particles[m]);
}
}
// Now identify atoms by which molecule they belong to.
vector > atomIndices = ContextImpl::findMolecules(numAtoms, atomBonds);
int numMolecules = atomIndices.size();
vector atomMolecule(numAtoms);
for (int i = 0; i < (int) atomIndices.size(); i++)
for (int j = 0; j < (int) atomIndices[i].size(); j++)
atomMolecule[atomIndices[i][j]] = i;
// Construct a description of each molecule.
molecules.resize(numMolecules);
for (int i = 0; i < numMolecules; i++) {
molecules[i].atoms = atomIndices[i];
molecules[i].groups.resize(forces.size());
}
for (int i = 0; i < system.getNumConstraints(); i++) {
int particle1, particle2;
double distance;
system.getConstraintParameters(i, particle1, particle2, distance);
molecules[atomMolecule[particle1]].constraints.push_back(i);
}
for (int i = 0; i < (int) forces.size(); i++)
for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
vector particles;
forces[i]->getParticlesInGroup(j, particles);
if (particles.size() > 0)
molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
}
}
// Sort them into groups of identical molecules.
vector uniqueMolecules;
vector > moleculeInstances;
vector > moleculeOffsets;
for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
Molecule& mol = molecules[molIndex];
// See if it is identical to another molecule.
bool isNew = true;
for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
Molecule& mol2 = uniqueMolecules[j];
bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
// See if the atoms are identical.
int atomOffset = mol2.atoms[0]-mol.atoms[0];
for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
identical = false;
for (int k = 0; k < (int) forces.size(); k++)
if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
identical = false;
}
// See if the constraints are identical.
for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
int c1particle1, c1particle2, c2particle1, c2particle2;
double distance1, distance2;
system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
identical = false;
}
// See if the force groups are identical.
for (int i = 0; i < (int) forces.size() && identical; i++) {
if (mol.groups[i].size() != mol2.groups[i].size())
identical = false;
for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
identical = false;
}
if (identical) {
moleculeInstances[j].push_back(molIndex);
moleculeOffsets[j].push_back(mol.atoms[0]);
isNew = false;
}
}
if (isNew) {
uniqueMolecules.push_back(mol);
moleculeInstances.push_back(vector());
moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
moleculeOffsets.push_back(vector());
moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
}
}
moleculeGroups.resize(moleculeInstances.size());
for (int i = 0; i < (int) moleculeInstances.size(); i++)
{
moleculeGroups[i].instances = moleculeInstances[i];
moleculeGroups[i].offsets = moleculeOffsets[i];
vector& atoms = uniqueMolecules[i].atoms;
moleculeGroups[i].atoms.resize(atoms.size());
for (int j = 0; j < (int) atoms.size(); j++)
moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
}
}
void CudaContext::invalidateMolecules() {
if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
return;
bool valid = true;
for (int group = 0; valid && group < (int) moleculeGroups.size(); group++) {
MoleculeGroup& mol = moleculeGroups[group];
vector& instances = mol.instances;
vector& offsets = mol.offsets;
vector& atoms = mol.atoms;
int numMolecules = instances.size();
Molecule& m1 = molecules[instances[0]];
int offset1 = offsets[0];
for (int j = 1; valid && j < numMolecules; j++) {
// See if the atoms are identical.
Molecule& m2 = molecules[instances[j]];
int offset2 = offsets[j];
for (int i = 0; i < (int) atoms.size() && valid; i++) {
for (int k = 0; k < (int) forces.size(); k++)
if (!forces[k]->areParticlesIdentical(atoms[i]+offset1, atoms[i]+offset2))
valid = false;
}
// See if the force groups are identical.
for (int i = 0; i < (int) forces.size() && valid; i++) {
for (int k = 0; k < (int) m1.groups[i].size() && valid; k++)
if (!forces[i]->areGroupsIdentical(m1.groups[i][k], m2.groups[i][k]))
valid = false;
}
}
}
if (valid)
return;
// The list of which molecules are identical is no longer valid. We need to restore the
// atoms to their original order, rebuild the list of identical molecules, and sort them
// again.
vector newCellOffsets(numAtoms);
if (useDoublePrecision) {
vector oldPosq(paddedNumAtoms);
vector newPosq(paddedNumAtoms, make_double4(0, 0, 0, 0));
vector oldVelm(paddedNumAtoms);
vector newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0));
posq->download(oldPosq);
velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i];
newPosq[index] = oldPosq[i];
newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i];
}
posq->upload(newPosq);
velm->upload(newVelm);
}
else if (useMixedPrecision) {
vector oldPosq(paddedNumAtoms);
vector newPosq(paddedNumAtoms, make_float4(0, 0, 0, 0));
vector oldPosqCorrection(paddedNumAtoms);
vector newPosqCorrection(paddedNumAtoms, make_float4(0, 0, 0, 0));
vector oldVelm(paddedNumAtoms);
vector newVelm(paddedNumAtoms, make_double4(0, 0, 0, 0));
posq->download(oldPosq);
velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i];
newPosq[index] = oldPosq[i];
newPosqCorrection[index] = oldPosqCorrection[i];
newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i];
}
posq->upload(newPosq);
posqCorrection->upload(newPosqCorrection);
velm->upload(newVelm);
}
else {
vector oldPosq(paddedNumAtoms);
vector newPosq(paddedNumAtoms, make_float4(0, 0, 0, 0));
vector oldVelm(paddedNumAtoms);
vector newVelm(paddedNumAtoms, make_float4(0, 0, 0, 0));
posq->download(oldPosq);
velm->download(oldVelm);
for (int i = 0; i < numAtoms; i++) {
int index = atomIndex[i];
newPosq[index] = oldPosq[i];
newVelm[index] = oldVelm[i];
newCellOffsets[index] = posCellOffsets[i];
}
posq->upload(newPosq);
velm->upload(newVelm);
}
for (int i = 0; i < numAtoms; i++) {
atomIndex[i] = i;
posCellOffsets[i] = newCellOffsets[i];
}
atomIndexDevice->upload(atomIndex);
findMoleculeGroups();
for (int i = 0; i < (int) reorderListeners.size(); i++)
reorderListeners[i]->execute();
reorderAtoms();
}
void CudaContext::reorderAtoms() {
atomsWereReordered = false;
if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff() || stepsSinceReorder < 100) {
stepsSinceReorder++;
return;
}
atomsWereReordered = true;
stepsSinceReorder = 0;
if (useDoublePrecision)
reorderAtomsImpl();
else if (useMixedPrecision)
reorderAtomsImpl();
else
reorderAtomsImpl();
nonbonded->updateNeighborListSize();
}
template
void CudaContext::reorderAtomsImpl() {
// Find the range of positions and the number of bins along each axis.
Real4 padding = {0, 0, 0, 0};
vector oldPosq(paddedNumAtoms, padding);
vector oldPosqCorrection(paddedNumAtoms, padding);
Mixed4 paddingMixed = {0, 0, 0, 0};
vector oldVelm(paddedNumAtoms, paddingMixed);
posq->download(oldPosq);
velm->download(oldVelm);
if (useMixedPrecision)
posqCorrection->download(oldPosqCorrection);
Real minx = oldPosq[0].x, maxx = oldPosq[0].x;
Real miny = oldPosq[0].y, maxy = oldPosq[0].y;
Real minz = oldPosq[0].z, maxz = oldPosq[0].z;
if (nonbonded->getUsePeriodic()) {
minx = miny = minz = 0.0;
maxx = periodicBoxSize.x;
maxy = periodicBoxSize.y;
maxz = periodicBoxSize.z;
}
else {
for (int i = 1; i < numAtoms; i++) {
const Real4& pos = oldPosq[i];
minx = min(minx, pos.x);
maxx = max(maxx, pos.x);
miny = min(miny, pos.y);
maxy = max(maxy, pos.y);
minz = min(minz, pos.z);
maxz = max(maxz, pos.z);
}
}
// Loop over each group of identical molecules and reorder them.
vector originalIndex(numAtoms);
vector newPosq(paddedNumAtoms);
vector newPosqCorrection(paddedNumAtoms);
vector newVelm(paddedNumAtoms);
vector newCellOffsets(numAtoms);
for (int group = 0; group < (int) moleculeGroups.size(); group++) {
// Find the center of each molecule.
MoleculeGroup& mol = moleculeGroups[group];
int numMolecules = mol.offsets.size();
vector& atoms = mol.atoms;
vector molPos(numMolecules);
Real invNumAtoms = (Real) (1.0/atoms.size());
for (int i = 0; i < numMolecules; i++) {
molPos[i].x = 0.0f;
molPos[i].y = 0.0f;
molPos[i].z = 0.0f;
for (int j = 0; j < (int)atoms.size(); j++) {
int atom = atoms[j]+mol.offsets[i];
const Real4& pos = oldPosq[atom];
molPos[i].x += pos.x;
molPos[i].y += pos.y;
molPos[i].z += pos.z;
}
molPos[i].x *= invNumAtoms;
molPos[i].y *= invNumAtoms;
molPos[i].z *= invNumAtoms;
if (molPos[i].x != molPos[i].x)
throw OpenMMException("Particle coordinate is nan");
}
if (nonbonded->getUsePeriodic()) {
// Move each molecule position into the same box.
for (int i = 0; i < numMolecules; i++) {
Real4 center = molPos[i];
int zcell = (int) floor(center.z*invPeriodicBoxSize.z);
center.x -= zcell*periodicBoxVecZ.x;
center.y -= zcell*periodicBoxVecZ.y;
center.z -= zcell*periodicBoxVecZ.z;
int ycell = (int) floor(center.y*invPeriodicBoxSize.y);
center.x -= ycell*periodicBoxVecY.x;
center.y -= ycell*periodicBoxVecY.y;
int xcell = (int) floor(center.x*invPeriodicBoxSize.x);
center.x -= xcell*periodicBoxVecX.x;
if (xcell != 0 || ycell != 0 || zcell != 0) {
Real dx = molPos[i].x-center.x;
Real dy = molPos[i].y-center.y;
Real dz = molPos[i].z-center.z;
molPos[i] = center;
for (int j = 0; j < (int) atoms.size(); j++) {
int atom = atoms[j]+mol.offsets[i];
Real4 p = oldPosq[atom];
p.x -= dx;
p.y -= dy;
p.z -= dz;
oldPosq[atom] = p;
posCellOffsets[atom].x -= xcell;
posCellOffsets[atom].y -= ycell;
posCellOffsets[atom].z -= zcell;
}
}
}
}
// Select a bin for each molecule, then sort them by bin.
bool useHilbert = (numMolecules > 5000 || atoms.size() > 8); // For small systems, a simple zigzag curve works better than a Hilbert curve.
Real binWidth;
if (useHilbert)
binWidth = (Real) (max(max(maxx-minx, maxy-miny), maxz-minz)/255.0);
else
binWidth = (Real) (0.2*nonbonded->getMaxCutoffDistance());
Real invBinWidth = (Real) (1.0/binWidth);
int xbins = 1 + (int) ((maxx-minx)*invBinWidth);
int ybins = 1 + (int) ((maxy-miny)*invBinWidth);
vector > molBins(numMolecules);
bitmask_t coords[3];
for (int i = 0; i < numMolecules; i++) {
int x = (int) ((molPos[i].x-minx)*invBinWidth);
int y = (int) ((molPos[i].y-miny)*invBinWidth);
int z = (int) ((molPos[i].z-minz)*invBinWidth);
int bin;
if (useHilbert) {
coords[0] = x;
coords[1] = y;
coords[2] = z;
bin = (int) hilbert_c2i(3, 8, coords);
}
else {
int yodd = y&1;
int zodd = z&1;
bin = z*xbins*ybins;
bin += (zodd ? ybins-y : y)*xbins;
bin += (yodd ? xbins-x : x);
}
molBins[i] = pair(bin, i);
}
sort(molBins.begin(), molBins.end());
// Reorder the atoms.
for (int i = 0; i < numMolecules; i++) {
for (int j = 0; j < (int)atoms.size(); j++) {
int oldIndex = mol.offsets[molBins[i].second]+atoms[j];
int newIndex = mol.offsets[i]+atoms[j];
originalIndex[newIndex] = atomIndex[oldIndex];
newPosq[newIndex] = oldPosq[oldIndex];
if (useMixedPrecision)
newPosqCorrection[newIndex] = oldPosqCorrection[oldIndex];
newVelm[newIndex] = oldVelm[oldIndex];
newCellOffsets[newIndex] = posCellOffsets[oldIndex];
}
}
}
// Update the streams.
for (int i = 0; i < numAtoms; i++) {
atomIndex[i] = originalIndex[i];
posCellOffsets[i] = newCellOffsets[i];
}
posq->upload(newPosq);
if (useMixedPrecision)
posqCorrection->upload(newPosqCorrection);
velm->upload(newVelm);
atomIndexDevice->upload(atomIndex);
for (int i = 0; i < (int) reorderListeners.size(); i++)
reorderListeners[i]->execute();
}
void CudaContext::addReorderListener(ReorderListener* listener) {
reorderListeners.push_back(listener);
}
void CudaContext::addPreComputation(ForcePreComputation* computation) {
preComputations.push_back(computation);
}
void CudaContext::addPostComputation(ForcePostComputation* computation) {
postComputations.push_back(computation);
}
struct CudaContext::WorkThread::ThreadData {
ThreadData(std::queue& tasks, bool& waiting, bool& finished,
pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :
tasks(tasks), waiting(waiting), finished(finished), queueLock(queueLock),
waitForTaskCondition(waitForTaskCondition), queueEmptyCondition(queueEmptyCondition) {
}
std::queue& tasks;
bool& waiting;
bool& finished;
pthread_mutex_t& queueLock;
pthread_cond_t& waitForTaskCondition;
pthread_cond_t& queueEmptyCondition;
};
static void* threadBody(void* args) {
CudaContext::WorkThread::ThreadData& data = *reinterpret_cast(args);
while (!data.finished || data.tasks.size() > 0) {
pthread_mutex_lock(&data.queueLock);
while (data.tasks.empty() && !data.finished) {
data.waiting = true;
pthread_cond_signal(&data.queueEmptyCondition);
pthread_cond_wait(&data.waitForTaskCondition, &data.queueLock);
}
CudaContext::WorkTask* task = NULL;
if (!data.tasks.empty()) {
data.waiting = false;
task = data.tasks.front();
data.tasks.pop();
}
pthread_mutex_unlock(&data.queueLock);
if (task != NULL) {
task->execute();
delete task;
}
}
data.waiting = true;
pthread_cond_signal(&data.queueEmptyCondition);
delete &data;
return 0;
}
CudaContext::WorkThread::WorkThread() : waiting(true), finished(false) {
pthread_mutex_init(&queueLock, NULL);
pthread_cond_init(&waitForTaskCondition, NULL);
pthread_cond_init(&queueEmptyCondition, NULL);
ThreadData* data = new ThreadData(tasks, waiting, finished, queueLock, waitForTaskCondition, queueEmptyCondition);
pthread_create(&thread, NULL, threadBody, data);
}
CudaContext::WorkThread::~WorkThread() {
pthread_mutex_lock(&queueLock);
finished = true;
pthread_cond_broadcast(&waitForTaskCondition);
pthread_mutex_unlock(&queueLock);
pthread_join(thread, NULL);
pthread_mutex_destroy(&queueLock);
pthread_cond_destroy(&waitForTaskCondition);
pthread_cond_destroy(&queueEmptyCondition);
}
void CudaContext::WorkThread::addTask(CudaContext::WorkTask* task) {
pthread_mutex_lock(&queueLock);
tasks.push(task);
waiting = false;
pthread_cond_signal(&waitForTaskCondition);
pthread_mutex_unlock(&queueLock);
}
bool CudaContext::WorkThread::isWaiting() {
return waiting;
}
bool CudaContext::WorkThread::isFinished() {
return finished;
}
void CudaContext::WorkThread::flush() {
pthread_mutex_lock(&queueLock);
while (!waiting)
pthread_cond_wait(&queueEmptyCondition, &queueLock);
pthread_mutex_unlock(&queueLock);
}