Commit 3e16cab9 authored by Peter Eastman
Browse files

Continuing to implement new CUDA platform

parent abb8cb4b
...@@ -83,7 +83,7 @@ public: ...@@ -83,7 +83,7 @@ public:
/** /**
* Get a pointer to the device memory. * Get a pointer to the device memory.
*/ */
CUdeviceptr getDevicePointer() { CUdeviceptr& getDevicePointer() {
return pointer; return pointer;
} }
/** /**
......
...@@ -31,7 +31,6 @@ ...@@ -31,7 +31,6 @@
#include "CudaContext.h" #include "CudaContext.h"
#include "CudaArray.h" #include "CudaArray.h"
//#include "CudaBondedUtilities.h" //#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaForceInfo.h" #include "CudaForceInfo.h"
//#include "CudaIntegrationUtilities.h" //#include "CudaIntegrationUtilities.h"
#include "CudaKernelSources.h" #include "CudaKernelSources.h"
...@@ -53,7 +52,7 @@ ...@@ -53,7 +52,7 @@
#define CHECK_RESULT2(result, prefix) \ #define CHECK_RESULT2(result, prefix) \
if (result != CUDA_SUCCESS) { \ if (result != CUDA_SUCCESS) { \
std::stringstream m; \ std::stringstream m; \
m<<prefix<<": "<<result<<" ("<<__FILE__<<": "<<__LINE__<<")"; \ m<<prefix<<": "<<getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\ throw OpenMMException(m.str());\
} }
...@@ -66,7 +65,7 @@ bool CudaContext::hasInitializedCuda = false; ...@@ -66,7 +65,7 @@ bool CudaContext::hasInitializedCuda = false;
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler, CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler), const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), posq(NULL), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL), velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL),*/ thread(NULL) { bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
if (!hasInitializedCuda) { if (!hasInitializedCuda) {
...@@ -88,7 +87,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -88,7 +87,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
else else
throw OpenMMException("Illegal value for CudaPrecision: "+precision); throw OpenMMException("Illegal value for CudaPrecision: "+precision);
#ifdef WIN32 #ifdef WIN32
this->tempDir = tempDir+"\"; this->tempDir = tempDir+"\\";
#else #else
this->tempDir = tempDir+"/"; this->tempDir = tempDir+"/";
#endif #endif
...@@ -114,6 +113,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -114,6 +113,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
deviceIndex = i; deviceIndex = i;
bestSpeed = speed; bestSpeed = speed;
bestCompute = major; bestCompute = major;
gpuArchitecture = intToString(major)+intToString(minor);
} }
} }
} }
...@@ -121,37 +121,47 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -121,37 +121,47 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
throw OpenMMException("No compatible CUDA device is available"); throw OpenMMException("No compatible CUDA device is available");
CHECK_RESULT(cuDeviceGet(&device, deviceIndex)); CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
this->deviceIndex = deviceIndex; this->deviceIndex = deviceIndex;
int major, minor; compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
gpuArchitecture = CudaExpressionUtilities::intToString(major)+CudaExpressionUtilities::intToString(minor);
compilationDefines["WORK_GROUP_SIZE"] = CudaExpressionUtilities::intToString(ThreadBlockSize);
defaultOptimizationOptions = "--use_fast_math"; defaultOptimizationOptions = "--use_fast_math";
int numThreadBlocksPerComputeUnit = 6; unsigned int flags = CU_CTX_MAP_HOST;
CHECK_RESULT(cuCtxCreate(&context, 0, device)); if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
CHECK_RESULT(cuCtxCreate(&context, flags, device));
contextIsValid = true; contextIsValid = true;
numAtoms = system.getNumParticles(); numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize); paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize; numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
int multiprocessors; int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
int numThreadBlocksPerComputeUnit = 6;
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors; numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
// bonded = new CudaBondedUtilities(*this); // bonded = new CudaBondedUtilities(*this);
// nonbonded = new CudaNonbondedUtilities(*this); // nonbonded = new CudaNonbondedUtilities(*this);
posq = CudaArray::create<float4>(paddedNumAtoms, "posq"); if (useDoublePrecision) {
velm = CudaArray::create<float4>(paddedNumAtoms, "velm"); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(double4), 0));
posq = CudaArray::create<double4>(paddedNumAtoms, "posq");
velm = CudaArray::create<double4>(paddedNumAtoms, "velm");
}
else {
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(float4), 0));
posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
}
posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0)); posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
// Create utility kernels that are used in multiple places. // Create utility kernels that are used in multiple places.
CUmodule utilities = createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities); CUmodule utilities = createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
cuModuleGetFunction(&clearBufferKernel, utilities, "clearBuffer"); clearBufferKernel = getKernel(utilities, "clearBuffer");
cuModuleGetFunction(&clearTwoBuffersKernel, utilities, "clearTwoBuffers"); clearTwoBuffersKernel = getKernel(utilities, "clearTwoBuffers");
cuModuleGetFunction(&clearThreeBuffersKernel, utilities, "clearThreeBuffers"); clearThreeBuffersKernel = getKernel(utilities, "clearThreeBuffers");
cuModuleGetFunction(&clearFourBuffersKernel, utilities, "clearFourBuffers"); clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
cuModuleGetFunction(&clearFiveBuffersKernel, utilities, "clearFiveBuffers"); clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
cuModuleGetFunction(&clearSixBuffersKernel, utilities, "clearSixBuffers"); clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
cuModuleGetFunction(&reduceFloat4Kernel, utilities, "reduceFloat4Buffer"); reduceFloat4Kernel = getKernel(utilities, "reduceFloat4Buffer");
cuModuleGetFunction(&reduceForcesKernel, utilities, "reduceForces"); reduceForcesKernel = getKernel(utilities, "reduceForces");
// Set defines based on the requested precision. // Set defines based on the requested precision.
...@@ -175,6 +185,8 @@ CudaContext::~CudaContext() { ...@@ -175,6 +185,8 @@ CudaContext::~CudaContext() {
delete forces[i]; delete forces[i];
for (int i = 0; i < (int) reorderListeners.size(); i++) for (int i = 0; i < (int) reorderListeners.size(); i++)
delete reorderListeners[i]; delete reorderListeners[i];
if (pinnedBuffer != NULL)
cuMemFreeHost(pinnedBuffer);
if (posq != NULL) if (posq != NULL)
delete posq; delete posq;
if (velm != NULL) if (velm != NULL)
...@@ -202,38 +214,29 @@ CudaContext::~CudaContext() { ...@@ -202,38 +214,29 @@ CudaContext::~CudaContext() {
CHECK_RESULT(cuCtxDestroy(context)); CHECK_RESULT(cuCtxDestroy(context));
} }
//void CudaContext::initialize() { void CudaContext::initialize() {
// for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
// double mass = system.getParticleMass(i); double mass = system.getParticleMass(i);
// (*velm)[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass); if (useDoublePrecision)
// } ((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
// velm->upload(); else
((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
}
velm->upload(pinnedBuffer);
// bonded->initialize(system); // bonded->initialize(system);
// numForceBuffers = platformData.contexts.size(); force = CudaArray::create<long3>(paddedNumAtoms, "force");
// numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers()); addAutoclearBuffer(force->getDevicePointer(), force->getSize()*6);
// for (int i = 0; i < (int) forces.size(); i++) energyBuffer = CudaArray::create<float>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
// numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers()); addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize());
// forceBuffers = new CudaArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false); atomIndexDevice = CudaArray::create<int>(paddedNumAtoms, "atomIndex");
// if (supports64BitGlobalAtomics) { atomIndex.resize(paddedNumAtoms);
// longForceBuffer = new CudaArray<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer", false); for (int i = 0; i < paddedNumAtoms; ++i)
// reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer()); atomIndex[i] = i;
// reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer()); atomIndexDevice->upload(atomIndex);
// reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms); findMoleculeGroups();
// reduceForcesKernel.setArg<cl_int>(3, numForceBuffers); moleculesInvalid = false;
// addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
// }
// addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
// force = new CudaArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true);
// energyBuffer = new CudaArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer", true);
// addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
// atomIndex = new CudaArray<cl_int>(*this, paddedNumAtoms, "atomIndex", true);
// for (int i = 0; i < paddedNumAtoms; ++i)
// (*atomIndex)[i] = i;
// atomIndex->upload();
// findMoleculeGroups();
// moleculesInvalid = false;
// nonbonded->initialize(system); // nonbonded->initialize(system);
//} }
void CudaContext::addForce(CudaForceInfo* force) { void CudaContext::addForce(CudaForceInfo* force) {
forces.push_back(force); forces.push_back(force);
...@@ -315,7 +318,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -315,7 +318,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUresult result = cuModuleLoad(&module, outputFile.c_str()); CUresult result = cuModuleLoad(&module, outputFile.c_str());
if (result != CUDA_SUCCESS) { if (result != CUDA_SUCCESS) {
std::stringstream m; std::stringstream m;
m<<"Error loading CUDA module: "<<result; m<<"Error loading CUDA module: "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(m.str()); throw OpenMMException(m.str());
} }
remove(inputFile.c_str()); remove(inputFile.c_str());
...@@ -329,52 +332,109 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -329,52 +332,109 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
remove(logFile.c_str()); remove(logFile.c_str());
throw; throw;
} }
//
// // Get length before using c_str() to avoid length() call invalidating the c_str() value.
// string src_string = src.str();
// ::size_t src_length = src_string.length();
// cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
// cl::Program program(context, sources);
// try {
// program.build(vector<cl::Device>(1, device), options.c_str());
// } catch (cl::Error err) {
// throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
// }
} }
//
//void CudaContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) { CUfunction CudaContext::getKernel(CUmodule& module, const string& name) {
// if (blockSize == -1) CUfunction function;
// blockSize = ThreadBlockSize; CUresult result = cuModuleGetFunction(&function, module, name.c_str());
// int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize; if (result != CUDA_SUCCESS) {
// try { std::stringstream m;
// queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize)); m<<"Error creating kernel "<<name<<": "<<getErrorString(result)<<" ("<<result<<")";
// } throw OpenMMException(m.str());
// catch (cl::Error err) { }
// stringstream str; return function;
// str<<"Error invoking kernel "<<kernel.getInfo<CL_KERNEL_FUNCTION_NAME>()<<": "<<err.what()<<" ("<<err.err()<<")"; }
// throw OpenMMException(str.str());
// } string CudaContext::doubleToString(double value) {
//} stringstream s;
// s.precision(useDoublePrecision ? 16 : 8);
//void CudaContext::clearBuffer(CudaArray<float>& array) { s << scientific << value;
// clearBuffer(array.getDeviceBuffer(), array.getSize()); if (!useDoublePrecision)
//} s << "f";
// return s.str();
//void CudaContext::clearBuffer(CudaArray<mm_float4>& array) { }
// clearBuffer(array.getDeviceBuffer(), array.getSize()*4);
//} string CudaContext::intToString(int value) {
// stringstream s;
//void CudaContext::clearBuffer(cl::Memory& memory, int size) { s << value;
// clearBufferKernel.setArg<cl::Memory>(0, memory); return s.str();
// clearBufferKernel.setArg<cl_int>(1, size); }
// executeKernel(clearBufferKernel, size, 128);
//} std::string CudaContext::getErrorString(CUresult result) {
// switch (result) {
//void CudaContext::addAutoclearBuffer(cl::Memory& memory, int size) { case CUDA_SUCCESS: return "CUDA_SUCCESS";
// autoclearBuffers.push_back(&memory); case CUDA_ERROR_INVALID_VALUE: return "CUDA_ERROR_INVALID_VALUE";
// autoclearBufferSizes.push_back(size); case CUDA_ERROR_OUT_OF_MEMORY: return "CUDA_ERROR_OUT_OF_MEMORY";
//} case CUDA_ERROR_NOT_INITIALIZED: return "CUDA_ERROR_NOT_INITIALIZED";
// case CUDA_ERROR_DEINITIALIZED: return "CUDA_ERROR_DEINITIALIZED";
case CUDA_ERROR_PROFILER_DISABLED: return "CUDA_ERROR_PROFILER_DISABLED";
case CUDA_ERROR_PROFILER_NOT_INITIALIZED: return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
case CUDA_ERROR_PROFILER_ALREADY_STARTED: return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
case CUDA_ERROR_PROFILER_ALREADY_STOPPED: return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
case CUDA_ERROR_NO_DEVICE: return "CUDA_ERROR_NO_DEVICE";
case CUDA_ERROR_INVALID_DEVICE: return "CUDA_ERROR_INVALID_DEVICE";
case CUDA_ERROR_INVALID_IMAGE: return "CUDA_ERROR_INVALID_IMAGE";
case CUDA_ERROR_INVALID_CONTEXT: return "CUDA_ERROR_INVALID_CONTEXT";
case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
case CUDA_ERROR_MAP_FAILED: return "CUDA_ERROR_MAP_FAILED";
case CUDA_ERROR_UNMAP_FAILED: return "CUDA_ERROR_UNMAP_FAILED";
case CUDA_ERROR_ARRAY_IS_MAPPED: return "CUDA_ERROR_ARRAY_IS_MAPPED";
case CUDA_ERROR_ALREADY_MAPPED: return "CUDA_ERROR_ALREADY_MAPPED";
case CUDA_ERROR_NO_BINARY_FOR_GPU: return "CUDA_ERROR_NO_BINARY_FOR_GPU";
case CUDA_ERROR_ALREADY_ACQUIRED: return "CUDA_ERROR_ALREADY_ACQUIRED";
case CUDA_ERROR_NOT_MAPPED: return "CUDA_ERROR_NOT_MAPPED";
case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
case CUDA_ERROR_ECC_UNCORRECTABLE: return "CUDA_ERROR_ECC_UNCORRECTABLE";
case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUDA_ERROR_UNSUPPORTED_LIMIT";
case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE";
case CUDA_ERROR_FILE_NOT_FOUND: return "CUDA_ERROR_FILE_NOT_FOUND";
case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
case CUDA_ERROR_OPERATING_SYSTEM: return "CUDA_ERROR_OPERATING_SYSTEM";
case CUDA_ERROR_INVALID_HANDLE: return "CUDA_ERROR_INVALID_HANDLE";
case CUDA_ERROR_NOT_FOUND: return "CUDA_ERROR_NOT_FOUND";
case CUDA_ERROR_NOT_READY: return "CUDA_ERROR_NOT_READY";
case CUDA_ERROR_LAUNCH_FAILED: return "CUDA_ERROR_LAUNCH_FAILED";
case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
case CUDA_ERROR_LAUNCH_TIMEOUT: return "CUDA_ERROR_LAUNCH_TIMEOUT";
case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
case CUDA_ERROR_CONTEXT_IS_DESTROYED: return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
case CUDA_ERROR_UNKNOWN: return "CUDA_ERROR_UNKNOWN";
}
return "Invalid error code";
}
/**
 * Launch a kernel on a one-dimensional grid of one-dimensional blocks.
 *
 * @param kernel      the kernel to launch
 * @param arguments   array of pointers to the kernel arguments; passed through
 *                    unchanged to cuLaunchKernel
 * @param threads     the amount of work requested.  The grid is sized to
 *                    ceil(threads/blockSize) blocks, capped at numThreadBlocks.
 *                    NOTE(review): because of the cap, kernels presumably loop
 *                    over their work items rather than assuming one thread per
 *                    item -- confirm against the kernel sources.
 * @param blockSize   threads per block, or -1 to use ThreadBlockSize
 * @param sharedSize  value forwarded as the shared-memory-bytes argument of
 *                    cuLaunchKernel
 * @throws OpenMMException if cuLaunchKernel does not return CUDA_SUCCESS
 */
void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
    if (blockSize == -1)
        blockSize = ThreadBlockSize;

    // An empty (or negative) workload would yield a grid dimension <= 0, which
    // cuLaunchKernel rejects as an invalid launch configuration.  There is
    // nothing to execute in that case, so return instead of throwing.
    if (threads <= 0)
        return;

    // Round up so that a partial final block is still launched, then cap the
    // grid at the number of blocks chosen for this device.
    int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks);
    CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, 0, arguments, NULL);
    if (result != CUDA_SUCCESS) {
        stringstream str;
        str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
        throw OpenMMException(str.str());
    }
}
/**
 * Clear an entire CudaArray.
 *
 * The array's total size in bytes (element count times element size) is
 * divided by 4 before being handed to the pointer/size overload.
 * NOTE(review): this implies the clear kernel works in 4-byte units -- confirm
 * against the clearBuffer kernel source.
 *
 * @param array  the array whose device memory should be cleared
 */
void CudaContext::clearBuffer(CudaArray& array) {
    CUdeviceptr devicePointer = array.getDevicePointer();
    const int numWords = array.getSize()*array.getElementSize()/4;
    clearBuffer(devicePointer, numWords);
}
/**
 * Clear a region of device memory by running the clearBuffer utility kernel.
 *
 * @param memory  device pointer to the start of the region
 * @param size    element count passed to the kernel and used as the requested
 *                thread count.  NOTE(review): presumably counted in 4-byte
 *                words, matching the CudaArray overload -- confirm.
 */
void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
    // Block size used for the clear kernel launch.
    const int clearBlockSize = 128;

    // cuLaunchKernel-style argument list: one pointer per kernel parameter.
    void* kernelArgs[2];
    kernelArgs[0] = &memory;
    kernelArgs[1] = &size;

    executeKernel(clearBufferKernel, kernelArgs, size, clearBlockSize);
}
/**
 * Register a device buffer in the autoclear lists.
 *
 * The pointer and its size are appended to autoclearBuffers and
 * autoclearBufferSizes respectively, so entry i of one list corresponds to
 * entry i of the other.
 *
 * @param memory  device pointer to the buffer
 * @param size    size recorded for the buffer.  NOTE(review): presumably in
 *                4-byte words (initialize() passes getSize()*6 for a long3
 *                array) -- confirm against the code that consumes these lists.
 */
void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
autoclearBuffers.push_back(memory);
autoclearBufferSizes.push_back(size);
}
//void CudaContext::clearAutoclearBuffers() { //void CudaContext::clearAutoclearBuffers() {
// int base = 0; // int base = 0;
// int total = autoclearBufferSizes.size(); // int total = autoclearBufferSizes.size();
...@@ -454,219 +514,217 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -454,219 +514,217 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
// executeKernel(reduceFloat4Kernel, bufferSize, 128); // executeKernel(reduceFloat4Kernel, bufferSize, 128);
//} //}
// //
//void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) { void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
// // Recursively tag atoms as belonging to a particular molecule. // Recursively tag atoms as belonging to a particular molecule.
//
// atomMolecule[atom] = molecule; atomMolecule[atom] = molecule;
// for (int i = 0; i < (int) atomBonds[atom].size(); i++) for (int i = 0; i < (int) atomBonds[atom].size(); i++)
// if (atomMolecule[atomBonds[atom][i]] == -1) if (atomMolecule[atomBonds[atom][i]] == -1)
// tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds); tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
//} }
//
///** /**
// * This class ensures that atom reordering doesn't break virtual sites. * This class ensures that atom reordering doesn't break virtual sites.
// */ */
//class CudaContext::VirtualSiteInfo : public CudaForceInfo { class CudaContext::VirtualSiteInfo : public CudaForceInfo {
//public: public:
// VirtualSiteInfo(const System& system) : CudaForceInfo(0) { VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
// for (int i = 0; i < system.getNumParticles(); i++) { for (int i = 0; i < system.getNumParticles(); i++) {
// if (system.isVirtualSite(i)) { if (system.isVirtualSite(i)) {
// siteTypes.push_back(&typeid(system.getVirtualSite(i))); siteTypes.push_back(&typeid(system.getVirtualSite(i)));
// vector<int> particles; vector<int> particles;
// particles.push_back(i); particles.push_back(i);
// for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++) for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
// particles.push_back(system.getVirtualSite(i).getParticle(j)); particles.push_back(system.getVirtualSite(i).getParticle(j));
// siteParticles.push_back(particles); siteParticles.push_back(particles);
// vector<double> weights; vector<double> weights;
// if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) { if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A two particle average. // A two particle average.
//
// const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i)); const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0)); weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1)); weights.push_back(site.getWeight(1));
// } }
// else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) { else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A three particle average. // A three particle average.
//
// const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i)); const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0)); weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1)); weights.push_back(site.getWeight(1));
// weights.push_back(site.getWeight(2)); weights.push_back(site.getWeight(2));
// } }
// else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) { else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
// // An out of plane site. // An out of plane site.
//
// const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i)); const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight12()); weights.push_back(site.getWeight12());
// weights.push_back(site.getWeight13()); weights.push_back(site.getWeight13());
// weights.push_back(site.getWeightCross()); weights.push_back(site.getWeightCross());
// } }
// siteWeights.push_back(weights); siteWeights.push_back(weights);
// } }
// } }
// } }
// int getNumParticleGroups() { int getNumParticleGroups() {
// return siteTypes.size(); return siteTypes.size();
// } }
// void getParticlesInGroup(int index, std::vector<int>& particles) { void getParticlesInGroup(int index, std::vector<int>& particles) {
// particles = siteParticles[index]; particles = siteParticles[index];
// } }
// bool areGroupsIdentical(int group1, int group2) { bool areGroupsIdentical(int group1, int group2) {
// if (siteTypes[group1] != siteTypes[group2]) if (siteTypes[group1] != siteTypes[group2])
// return false; return false;
// int numParticles = siteWeights[group1].size(); int numParticles = siteWeights[group1].size();
// if (siteWeights[group2].size() != numParticles) if (siteWeights[group2].size() != numParticles)
// return false; return false;
// for (int i = 0; i < numParticles; i++) for (int i = 0; i < numParticles; i++)
// if (siteWeights[group1][i] != siteWeights[group2][i]) if (siteWeights[group1][i] != siteWeights[group2][i])
// return false; return false;
// return true; return true;
// } }
//private: private:
// vector<const type_info*> siteTypes; vector<const type_info*> siteTypes;
// vector<vector<int> > siteParticles; vector<vector<int> > siteParticles;
// vector<vector<double> > siteWeights; vector<vector<double> > siteWeights;
//}; };
//
// void CudaContext::findMoleculeGroups() {
//void CudaContext::findMoleculeGroups() { // The first time this is called, we need to identify all the molecules in the system.
// // The first time this is called, we need to identify all the molecules in the system.
// if (moleculeGroups.size() == 0) {
// if (moleculeGroups.size() == 0) { // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
// // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
// addForce(new VirtualSiteInfo(system));
// addForce(new VirtualSiteInfo(system));
// // First make a list of every other atom to which each atom is connect by a constraint or force group.
// // First make a list of every other atom to which each atom is connect by a constraint or force group.
// vector<vector<int> > atomBonds(system.getNumParticles());
// vector<vector<int> > atomBonds(system.getNumParticles()); for (int i = 0; i < system.getNumConstraints(); i++) {
// for (int i = 0; i < system.getNumConstraints(); i++) { int particle1, particle2;
// int particle1, particle2; double distance;
// double distance; system.getConstraintParameters(i, particle1, particle2, distance);
// system.getConstraintParameters(i, particle1, particle2, distance); atomBonds[particle1].push_back(particle2);
// atomBonds[particle1].push_back(particle2); atomBonds[particle2].push_back(particle1);
// atomBonds[particle2].push_back(particle1); }
// } for (int i = 0; i < (int) forces.size(); i++) {
// for (int i = 0; i < (int) forces.size(); i++) { for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) { vector<int> particles;
// vector<int> particles; forces[i]->getParticlesInGroup(j, particles);
// forces[i]->getParticlesInGroup(j, particles); for (int k = 0; k < (int) particles.size(); k++)
// for (int k = 0; k < (int) particles.size(); k++) for (int m = 0; m < (int) particles.size(); m++)
// for (int m = 0; m < (int) particles.size(); m++) if (k != m)
// if (k != m) atomBonds[particles[k]].push_back(particles[m]);
// atomBonds[particles[k]].push_back(particles[m]); }
// } }
// }
// // Now tag atoms by which molecule they belong to.
// // Now tag atoms by which molecule they belong to.
// vector<int> atomMolecule(numAtoms, -1);
// vector<int> atomMolecule(numAtoms, -1); int numMolecules = 0;
// int numMolecules = 0; for (int i = 0; i < numAtoms; i++)
// for (int i = 0; i < numAtoms; i++) if (atomMolecule[i] == -1)
// if (atomMolecule[i] == -1) tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
// tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds); vector<vector<int> > atomIndices(numMolecules);
// vector<vector<int> > atomIndices(numMolecules); for (int i = 0; i < numAtoms; i++)
// for (int i = 0; i < numAtoms; i++) atomIndices[atomMolecule[i]].push_back(i);
// atomIndices[atomMolecule[i]].push_back(i);
// // Construct a description of each molecule.
// // Construct a description of each molecule.
// molecules.resize(numMolecules);
// molecules.resize(numMolecules); for (int i = 0; i < numMolecules; i++) {
// for (int i = 0; i < numMolecules; i++) { molecules[i].atoms = atomIndices[i];
// molecules[i].atoms = atomIndices[i]; molecules[i].groups.resize(forces.size());
// molecules[i].groups.resize(forces.size()); }
// } for (int i = 0; i < system.getNumConstraints(); i++) {
// for (int i = 0; i < system.getNumConstraints(); i++) { int particle1, particle2;
// int particle1, particle2; double distance;
// double distance; system.getConstraintParameters(i, particle1, particle2, distance);
// system.getConstraintParameters(i, particle1, particle2, distance); molecules[atomMolecule[particle1]].constraints.push_back(i);
// molecules[atomMolecule[particle1]].constraints.push_back(i); }
// } for (int i = 0; i < (int) forces.size(); i++)
// for (int i = 0; i < (int) forces.size(); i++) for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) { vector<int> particles;
// vector<int> particles; forces[i]->getParticlesInGroup(j, particles);
// forces[i]->getParticlesInGroup(j, particles); molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
// molecules[atomMolecule[particles[0]]].groups[i].push_back(j); }
// } }
// }
// // Sort them into groups of identical molecules.
// // Sort them into groups of identical molecules.
// vector<Molecule> uniqueMolecules;
// vector<Molecule> uniqueMolecules; vector<vector<int> > moleculeInstances;
// vector<vector<int> > moleculeInstances; vector<vector<int> > moleculeOffsets;
// vector<vector<int> > moleculeOffsets; for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
// for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) { Molecule& mol = molecules[molIndex];
// Molecule& mol = molecules[molIndex];
// // See if it is identical to another molecule.
// // See if it is identical to another molecule.
// bool isNew = true;
// bool isNew = true; for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
// for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) { Molecule& mol2 = uniqueMolecules[j];
// Molecule& mol2 = uniqueMolecules[j]; bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
// bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
// // See if the atoms are identical.
// // See if the atoms are identical.
// int atomOffset = mol2.atoms[0]-mol.atoms[0];
// int atomOffset = mol2.atoms[0]-mol.atoms[0]; for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
// for (int i = 0; i < (int) mol.atoms.size() && identical; i++) { if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
// if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i])) identical = false;
// identical = false; for (int k = 0; k < (int) forces.size(); k++)
// for (int k = 0; k < (int) forces.size(); k++) if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
// if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i])) identical = false;
// identical = false; }
// }
// // See if the constraints are identical.
// // See if the constraints are identical.
// for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
// for (int i = 0; i < (int) mol.constraints.size() && identical; i++) { int c1particle1, c1particle2, c2particle1, c2particle2;
// int c1particle1, c1particle2, c2particle1, c2particle2; double distance1, distance2;
// double distance1, distance2; system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
// system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1); system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
// system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2); if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
// if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2) identical = false;
// identical = false; }
// }
// // See if the force groups are identical.
// // See if the force groups are identical.
// for (int i = 0; i < (int) forces.size() && identical; i++) {
// for (int i = 0; i < (int) forces.size() && identical; i++) { if (mol.groups[i].size() != mol2.groups[i].size())
// if (mol.groups[i].size() != mol2.groups[i].size()) identical = false;
// identical = false; for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
// for (int k = 0; k < (int) mol.groups[i].size() && identical; k++) if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
// if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k])) identical = false;
// identical = false; }
// } if (identical) {
// if (identical) { moleculeInstances[j].push_back(molIndex);
// moleculeInstances[j].push_back(molIndex); moleculeOffsets[j].push_back(mol.atoms[0]);
// moleculeOffsets[j].push_back(mol.atoms[0]); isNew = false;
// isNew = false; }
// } }
// } if (isNew) {
// if (isNew) { uniqueMolecules.push_back(mol);
// uniqueMolecules.push_back(mol); moleculeInstances.push_back(vector<int>());
// moleculeInstances.push_back(vector<int>()); moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
// moleculeInstances[moleculeInstances.size()-1].push_back(molIndex); moleculeOffsets.push_back(vector<int>());
// moleculeOffsets.push_back(vector<int>()); moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
// moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]); }
// } }
// } moleculeGroups.resize(moleculeInstances.size());
// moleculeGroups.resize(moleculeInstances.size()); for (int i = 0; i < (int) moleculeInstances.size(); i++)
// for (int i = 0; i < (int) moleculeInstances.size(); i++) {
// { moleculeGroups[i].instances = moleculeInstances[i];
// moleculeGroups[i].instances = moleculeInstances[i]; moleculeGroups[i].offsets = moleculeOffsets[i];
// moleculeGroups[i].offsets = moleculeOffsets[i]; vector<int>& atoms = uniqueMolecules[i].atoms;
// vector<int>& atoms = uniqueMolecules[i].atoms; moleculeGroups[i].atoms.resize(atoms.size());
// moleculeGroups[i].atoms.resize(atoms.size()); for (int j = 0; j < (int) atoms.size(); j++)
// for (int j = 0; j < (int) atoms.size(); j++) moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
// moleculeGroups[i].atoms[j] = atoms[j]-atoms[0]; }
// } }
//}
// void CudaContext::invalidateMolecules() {
//void CudaContext::invalidateMolecules() { moleculesInvalid = true;
// moleculesInvalid = true; }
//}
//
//
//void OpenCLContext::validateMolecules() { //void OpenCLContext::validateMolecules() {
// moleculesInvalid = false; // moleculesInvalid = false;
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff()) // if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
......
...@@ -72,11 +72,11 @@ public: ...@@ -72,11 +72,11 @@ public:
CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision, CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData); const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData);
~CudaContext(); ~CudaContext();
// /** /**
// * This is called to initialize internal data structures after all Forces in the system * This is called to initialize internal data structures after all Forces in the system
// * have been initialized. * have been initialized.
// */ */
// void initialize(); void initialize();
/** /**
* Add a CudaForce to this context. * Add a CudaForce to this context.
*/ */
...@@ -123,12 +123,12 @@ public: ...@@ -123,12 +123,12 @@ public:
CudaArray& getVelm() { CudaArray& getVelm() {
return *velm; return *velm;
} }
// /** /**
// * Get the array which contains the force on each atom. * Get the array which contains the force on each atom (represented as a long3 in 64 bit fixed point).
// */ */
// CudaArray<mm_float4>& getForce() { CudaArray& getForce() {
// return *force; return *force;
// } }
// /** // /**
// * Get the array which contains the buffers in which forces are computed. // * Get the array which contains the buffers in which forces are computed.
// */ // */
...@@ -184,36 +184,41 @@ public: ...@@ -184,36 +184,41 @@ public:
* omitted, a default set of options will be used * omitted, a default set of options will be used
*/ */
CUmodule createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL); CUmodule createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL);
// /** /**
// * Execute a kernel. * Get a kernel from a CUDA module.
// * *
// * @param kernel the kernel to execute * @param module the module to get the kernel from
// * @param workUnits the maximum number of work units that should be used * @param name the name of the kernel to get
// * @param blockSize the size of each thread block to use */
// */ CUfunction getKernel(CUmodule& module, const std::string& name);
// void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1); /**
// /** * Execute a kernel.
// * Set all elements of an array to 0. *
// */ * @param kernel the kernel to execute
// void clearBuffer(CudaArray<float>& array); * @param arguments an array of pointers to the kernel arguments
// /** * @param workUnits the maximum number of threads that should be used
// * Set all elements of an array to 0. * @param blockSize the size of each thread block to use
// */ * @param sharedSize the amount of dynamic shared memory to allocate for the kernel, in bytes
// void clearBuffer(CudaArray<mm_float4>& array); */
// /** void executeKernel(CUfunction kernel, void** arguments, int workUnits, int blockSize = -1, unsigned int sharedSize = 0);
// * Set all elements of an array to 0. /**
// * * Set all elements of an array to 0.
// * @param memory the Memory to clear */
// * @param size the number of float elements in the buffer void clearBuffer(CudaArray& array);
// */ /**
// void clearBuffer(cl::Memory& memory, int size); * Set all elements of an array to 0.
// /** *
// * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation. * @param memory the memory to clear
// * * @param size the number of 4-byte elements in the buffer
// * @param memory the Memory to clear */
// * @param size the number of float elements in the buffer void clearBuffer(CUdeviceptr memory, int size);
// */ /**
// void addAutoclearBuffer(cl::Memory& memory, int size); * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the memory to clear
* @param size the number of float/double elements in the buffer
*/
void addAutoclearBuffer(CUdeviceptr memory, int size);
// /** // /**
// * Clear all buffers that have been registered with addAutoclearBuffer(). // * Clear all buffers that have been registered with addAutoclearBuffer().
// */ // */
...@@ -230,108 +235,110 @@ public: ...@@ -230,108 +235,110 @@ public:
// * Sum the buffesr containing forces. // * Sum the buffesr containing forces.
// */ // */
// void reduceForces(); // void reduceForces();
// /** /**
// * Get the current simulation time. * Get the current simulation time.
// */ */
// double getTime() { double getTime() {
// return time; return time;
// } }
// /** /**
// * Set the current simulation time. * Set the current simulation time.
// */ */
// void setTime(double t) { void setTime(double t) {
// time = t; time = t;
// } }
// /** /**
// * Get the number of integration steps that have been taken. * Get the number of integration steps that have been taken.
// */ */
// int getStepCount() { int getStepCount() {
// return stepCount; return stepCount;
// } }
// /** /**
// * Set the number of integration steps that have been taken. * Set the number of integration steps that have been taken.
// */ */
// void setStepCount(int steps) { void setStepCount(int steps) {
// stepCount = steps; stepCount = steps;
// } }
// /** /**
// * Get the number of times forces or energy has been computed. * Get the number of times forces or energy has been computed.
// */ */
// int getComputeForceCount() { int getComputeForceCount() {
// return computeForceCount; return computeForceCount;
// } }
// /** /**
// * Set the number of times forces or energy has been computed. * Set the number of times forces or energy has been computed.
// */ */
// void setComputeForceCount(int count) { void setComputeForceCount(int count) {
// computeForceCount = count; computeForceCount = count;
// } }
// /** /**
// * Get the number of atoms. * Get the number of atoms.
// */ */
// int getNumAtoms() const { int getNumAtoms() const {
// return numAtoms; return numAtoms;
// } }
// /** /**
// * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
// * most arrays with one element per atom. * most arrays with one element per atom.
// */ */
// int getPaddedNumAtoms() const { int getPaddedNumAtoms() const {
// return paddedNumAtoms; return paddedNumAtoms;
// } }
// /** /**
// * Get the number of blocks of TileSize atoms. * Get the number of blocks of TileSize atoms.
// */ */
// int getNumAtomBlocks() const { int getNumAtomBlocks() const {
// return numAtomBlocks; return numAtomBlocks;
// } }
// /** /**
// * Get the standard number of thread blocks to use when executing kernels. * Get the standard number of thread blocks to use when executing kernels.
// */ */
// int getNumThreadBlocks() const { int getNumThreadBlocks() const {
// return numThreadBlocks; return numThreadBlocks;
// } }
// /** /**
// * Get the number of force buffers. * Get whether double precision is being used.
// */ */
// int getNumForceBuffers() const { bool getUseDoublePrecision() {
// return numForceBuffers; return useDoublePrecision;
// } }
// /** /**
// * Get the SIMD width of the device being used. * Get whether accumulation is being done in double precision.
// */ */
// int getSIMDWidth() const { bool getAccumulateInDouble() {
// return simdWidth; return accumulateInDouble;
// } }
// /** /**
// * Get whether the device being used supports 64 bit atomic operations on global memory. * Convert a number to a string in a format suitable for including in a kernel.
// */ * This takes into account whether the context uses single or double precision.
// bool getSupports64BitGlobalAtomics() { */
// return supports64BitGlobalAtomics; std::string doubleToString(double value);
// } /**
// /** * Convert a number to a string in a format suitable for including in a kernel.
// * Get whether the device being used supports double precision math. */
// */ std::string intToString(int value);
// bool getSupportsDoublePrecision() { /**
// return supportsDoublePrecision; * Convert a CUDA result code to the corresponding string description.
// } */
std::string getErrorString(CUresult result);
// /** // /**
// * Get the size of the periodic box. // * Get the size of the periodic box.
// */ // */
// mm_float4 getPeriodicBoxSize() const { // float4 getPeriodicBoxSize() const {
// return periodicBoxSize; // return periodicBoxSize;
// } // }
// /** // /**
// * Set the size of the periodic box. // * Set the size of the periodic box.
// */ // */
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) { // void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0); // periodicBoxSize = make_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0); // invPeriodicBoxSize = make_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// } // }
// /** // /**
// * Get the inverse of the size of the periodic box. // * Get the inverse of the size of the periodic box.
// */ // */
// mm_float4 getInvPeriodicBoxSize() const { // float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize; // return invPeriodicBoxSize;
// } // }
// /** // /**
...@@ -352,66 +359,66 @@ public: ...@@ -352,66 +359,66 @@ public:
// CudaNonbondedUtilities& getNonbondedUtilities() { // CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded; // return *nonbonded;
// } // }
// /** /**
// * Get the thread used by this context for executing parallel computations. * Get the thread used by this context for executing parallel computations.
// */ */
// WorkThread& getWorkThread() { WorkThread& getWorkThread() {
// return *thread; return *thread;
// } }
// /** /**
// * Get whether atoms were reordered during the most recent force/energy computation. * Get whether atoms were reordered during the most recent force/energy computation.
// */ */
// bool getAtomsWereReordered() const { bool getAtomsWereReordered() const {
// return atomsWereReordered; return atomsWereReordered;
// } }
// /** /**
// * Set whether atoms were reordered during the most recent force/energy computation. * Set whether atoms were reordered during the most recent force/energy computation.
// */ */
// void setAtomsWereReordered(bool wereReordered) { void setAtomsWereReordered(bool wereReordered) {
// atomsWereReordered = wereReordered; atomsWereReordered = wereReordered;
// } }
// /** /**
// * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
// * together in the arrays. * together in the arrays.
// * *
// * @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions * @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
// */ */
// void reorderAtoms(bool enforcePeriodic); void reorderAtoms(bool enforcePeriodic);
// /** /**
// * Add a listener that should be called whenever atoms get reordered. The CudaContext * Add a listener that should be called whenever atoms get reordered. The CudaContext
// * assumes ownership of the object, and deletes it when the context itself is deleted. * assumes ownership of the object, and deletes it when the context itself is deleted.
// */ */
// void addReorderListener(ReorderListener* listener); void addReorderListener(ReorderListener* listener);
// /** /**
// * Get the list of ReorderListeners. * Get the list of ReorderListeners.
// */ */
// std::vector<ReorderListener*>& getReorderListeners() { std::vector<ReorderListener*>& getReorderListeners() {
// return reorderListeners; return reorderListeners;
// } }
// /** /**
// * Mark that the current molecule definitions (and hence the atom order) may be invalid. * Mark that the current molecule definitions (and hence the atom order) may be invalid.
// * This should be called whenever force field parameters change. It will cause the definitions * This should be called whenever force field parameters change. It will cause the definitions
// * and order to be revalidated the next time reorderAtoms() is called. * and order to be revalidated the next time reorderAtoms() is called.
// */ */
// void invalidateMolecules(); void invalidateMolecules();
// /** /**
// * Get whether the current molecule definitions are valid. * Get whether the current molecule definitions are valid.
// */ */
// bool getMoleculesAreInvalid() { bool getMoleculesAreInvalid() {
// return moleculesInvalid; return moleculesInvalid;
// } }
private: private:
struct Molecule; struct Molecule;
struct MoleculeGroup; struct MoleculeGroup;
class VirtualSiteInfo; class VirtualSiteInfo;
// void findMoleculeGroups(); void findMoleculeGroups();
// static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds); static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
// /** /**
// * Ensure that all molecules marked as "identical" really are identical. This should be * Ensure that all molecules marked as "identical" really are identical. This should be
// * called whenever force field parameters change. If necessary, it will rebuild the list * called whenever force field parameters change. If necessary, it will rebuild the list
// * of molecules and resort the atoms. * of molecules and resort the atoms.
// */ */
// void validateMolecules(); void validateMolecules();
static bool hasInitializedCuda; static bool hasInitializedCuda;
const System& system; const System& system;
double time; double time;
...@@ -424,8 +431,6 @@ private: ...@@ -424,8 +431,6 @@ private:
int paddedNumAtoms; int paddedNumAtoms;
int numAtomBlocks; int numAtomBlocks;
int numThreadBlocks; int numThreadBlocks;
// int numForceBuffers;
// int simdWidth;
bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid; bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
std::string compiler, tempDir, gpuArchitecture; std::string compiler, tempDir, gpuArchitecture;
float4 periodicBoxSize; float4 periodicBoxSize;
...@@ -446,15 +451,15 @@ private: ...@@ -446,15 +451,15 @@ private:
std::vector<Molecule> molecules; std::vector<Molecule> molecules;
std::vector<MoleculeGroup> moleculeGroups; std::vector<MoleculeGroup> moleculeGroups;
std::vector<int4> posCellOffsets; std::vector<int4> posCellOffsets;
void* pinnedBuffer;
CudaArray* posq; CudaArray* posq;
CudaArray* velm; CudaArray* velm;
// CudaArray<mm_float4>* force; CudaArray* force;
// CudaArray<mm_float4>* forceBuffers; CudaArray* energyBuffer;
// CudaArray<cl_long>* longForceBuffer; CudaArray* atomIndexDevice;
// CudaArray<cl_float>* energyBuffer; std::vector<int> atomIndex;
// CudaArray<cl_int>* atomIndex; std::vector<CUdeviceptr> autoclearBuffers;
// std::vector<cl::Memory*> autoclearBuffers; std::vector<int> autoclearBufferSizes;
// std::vector<int> autoclearBufferSizes;
std::vector<ReorderListener*> reorderListeners; std::vector<ReorderListener*> reorderListeners;
// CudaIntegrationUtilities* integration; // CudaIntegrationUtilities* integration;
// CudaBondedUtilities* bonded; // CudaBondedUtilities* bonded;
......
...@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev ...@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
device << contexts[i]->getDeviceIndex(); device << contexts[i]->getDeviceIndex();
} }
propertyValues[CudaPlatform::CudaDeviceIndex()] = device.str(); propertyValues[CudaPlatform::CudaDeviceIndex()] = device.str();
propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty; propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty; propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty; propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
...@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() { ...@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() {
} }
void CudaPlatform::PlatformData::initializeContexts(const System& system) { void CudaPlatform::PlatformData::initializeContexts(const System& system) {
// for (int i = 0; i < (int) contexts.size(); i++) for (int i = 0; i < (int) contexts.size(); i++)
// contexts[i]->initialize(); contexts[i]->initialize();
} }
void CudaPlatform::PlatformData::syncContexts() { void CudaPlatform::PlatformData::syncContexts() {
// for (int i = 0; i < (int) contexts.size(); i++) for (int i = 0; i < (int) contexts.size(); i++)
// contexts[i]->getWorkThread().flush(); contexts[i]->getWorkThread().flush();
} }
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaSort.h"
#include "CudaKernelSources.h"
#include <map>
using namespace OpenMM;
using namespace std;
/**
 * Build the kernels and workspace arrays needed to sort arrays of a fixed length.
 *
 * @param context    the context used to compile kernels and query the device
 * @param trait      describes the data and key types; stored in this object and deleted by the destructor
 * @param length     the number of elements in every array that will later be passed to sort()
 * @throws OpenMMException if a device attribute cannot be queried, if the trait reports a
 *                         non-positive data size, or if fewer than two elements fit in the
 *                         shared memory budget used for sorting
 */
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
    // Create kernels.  The trait's strings are substituted into the kernel source before compiling.

    map<string, string> replacements;
    replacements["DATA_TYPE"] = trait->getDataType();
    replacements["KEY_TYPE"] = trait->getKeyType();
    replacements["SORT_KEY"] = trait->getSortKey();
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
    computeRangeKernel = context.getKernel(module, "computeRange");
    assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
    copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
    sortBucketsKernel = context.getKernel(module, "sortBuckets");

    // Work out the thread block sizes for various kernels.  The driver calls are checked so that
    // a failed query can never leave the sizes below computed from uninitialized values.

    int maxBlockSize;
    CUresult result = cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
    if (result != CUDA_SUCCESS)
        throw OpenMMException("Error querying the maximum thread block size: "+context.getErrorString(result));
    int maxSharedMem;
    result = cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
    if (result != CUDA_SUCCESS)
        throw OpenMMException("Error querying the maximum shared memory per block: "+context.getErrorString(result));
    if (trait->getDataSize() < 1)
        throw OpenMMException("CudaSort: the data size reported by the SortTrait must be positive");

    // rangeKernelSize becomes the largest power of two that does not exceed maxBlockSize.

    for (rangeKernelSize = 1; rangeKernelSize*2 <= (unsigned int) maxBlockSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = rangeKernelSize;
    sortKernelSize = rangeKernelSize/2;
    if (rangeKernelSize > length)
        rangeKernelSize = length;

    // Limit the sort kernel so its buffer of data elements uses at most half of the shared memory.

    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
    if (sortKernelSize < 2)
        throw OpenMMException("CudaSort: the data type is too large to be sorted in shared memory");
    unsigned int targetBucketSize = sortKernelSize/2; // At least 1, because sortKernelSize >= 2.
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.

    dataRange = new CudaArray(2, trait->getKeySize(), "sortDataRange");
    bucketOffset = CudaArray::create<uint1>(numBuckets, "bucketOffset");
    bucketOfElement = CudaArray::create<uint1>(length, "bucketOfElement");
    offsetInBucket = CudaArray::create<uint1>(length, "offsetInBucket");
    buckets = new CudaArray(length, trait->getDataSize(), "buckets");
}
/**
 * Release the trait and every workspace array owned by this object.
 */
CudaSort::~CudaSort() {
    // Applying delete to a null pointer is a no-op, so arrays that were never
    // allocated need no special handling.
    delete trait;
    delete dataRange;
    delete bucketOfElement;
    delete offsetInBucket;
    delete bucketOffset;
    delete buckets;
}
/**
 * Sort an array in place on the device.
 *
 * @param data    the array to sort; its length must equal the length given to the constructor
 *                and its element size must equal the trait's data size
 * @throws OpenMMException if the array's length or element size does not match
 */
void CudaSort::sort(CudaArray& data) {
    if (!(data.getSize() == bucketOfElement->getSize() && data.getElementSize() == trait->getDataSize()))
        throw OpenMMException("CudaSort called with different data size");
    if (data.getSize() == 0)
        return;

    // Values and device pointers whose addresses are handed to the kernels as arguments.

    unsigned int numElements = data.getSize();
    unsigned int numBuckets = bucketOffset->getSize();
    CUdeviceptr& dataPtr = data.getDevicePointer();
    CUdeviceptr& rangePtr = dataRange->getDevicePointer();
    CUdeviceptr& bucketOffsetPtr = bucketOffset->getDevicePointer();
    CUdeviceptr& bucketOfElementPtr = bucketOfElement->getDevicePointer();
    CUdeviceptr& offsetInBucketPtr = offsetInBucket->getDevicePointer();
    CUdeviceptr& bucketsPtr = buckets->getDevicePointer();

    // Step 1: find the minimum and maximum key.

    void* rangeArgs[] = {&dataPtr, &numElements, &rangePtr};
    context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());

    // Step 2: zero the bucket counters, then assign every element to a bucket.

    context.clearBuffer(*bucketOffset);
    void* elementsArgs[] = {&dataPtr, &numElements, &numBuckets, &rangePtr,
            &bucketOffsetPtr, &bucketOfElementPtr, &offsetInBucketPtr};
    context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());

    // Step 3: compute the position of each bucket.

    void* computeArgs[] = {&numBuckets, &bucketOffsetPtr};
    context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));

    // Step 4: copy the data into the buckets.

    void* copyArgs[] = {&dataPtr, &bucketsPtr, &numElements, &bucketOffsetPtr,
            &bucketOfElementPtr, &offsetInBucketPtr};
    context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());

    // Step 5: sort each bucket.  The thread count is the data size rounded up to a multiple of sortKernelSize.

    void* sortArgs[] = {&dataPtr, &bucketsPtr, &numBuckets, &bucketOffsetPtr};
    context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
#ifndef __OPENMM_CUDASORT_H__
#define __OPENMM_CUDASORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "openmm/internal/windowsExport.h"
#include "CudaContext.h"
namespace OpenMM {
/**
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The sorting behavior is specified by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* class SortTrait : public CudaSort::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
* const char* getKeyType() const {return "float";}
* const char* getMinKey() const {return "-MAXFLOAT";}
* const char* getMaxKey() const {return "MAXFLOAT";}
* const char* getMaxValue() const {return "MAXFLOAT";}
* const char* getSortKey() const {return "value";}
* };
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in shared memory when possible, in global memory otherwise). This is similar to
* the algorithm described in
*
* Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng. "An Efficient
* Sorting Algorithm with CUDA" Journal of the Chinese Institute of Engineers, 32(7),
* pp. 915-921 (2009)
*
* but with many modifications and simplifications. In particular, this algorithm
* involves much less communication between host and device, which is critical to get
* good performance with the array sizes we typically work with (10,000 to 100,000
* elements).
*/
class OPENMM_EXPORT CudaSort {
public:
class SortTrait;
/**
* Create a CudaSort object for sorting data of a particular type.
*
* @param context the context in which to perform calculations
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the CudaSort is deleted.
* @param length the length of the arrays this object will be used to sort
*/
CudaSort(CudaContext& context, SortTrait* trait, unsigned int length);
~CudaSort();
/**
* Sort an array.
*/
void sort(CudaArray& data);
private:
CudaContext& context;
SortTrait* trait;
CudaArray* dataRange;
CudaArray* bucketOfElement;
CudaArray* offsetInBucket;
CudaArray* bucketOffset;
CudaArray* buckets;
CUfunction computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
};
/**
 * A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
 * The strings it returns are substituted into the sorting kernel source before compilation.
 */
class CudaSort::SortTrait {
public:
    /**
     * CudaSort deletes its trait through a SortTrait pointer, so the destructor must be
     * virtual for subclasses to be destroyed correctly.
     */
    virtual ~SortTrait() {
    }
    /**
     * Get the size of each data value in bytes.
     */
    virtual int getDataSize() const = 0;
    /**
     * Get the size of each key value in bytes.
     */
    virtual int getKeySize() const = 0;
    /**
     * Get the data type of the values to sort.
     */
    virtual const char* getDataType() const = 0;
    /**
     * Get the data type of the sorting key.
     */
    virtual const char* getKeyType() const = 0;
    /**
     * Get the minimum value a key can take.
     */
    virtual const char* getMinKey() const = 0;
    /**
     * Get the maximum value a key can take.
     */
    virtual const char* getMaxKey() const = 0;
    /**
     * Get a value whose key is guaranteed to equal getMaxKey().
     */
    virtual const char* getMaxValue() const = 0;
    /**
     * Get the CUDA code to select the key from the data value.
     */
    virtual const char* getSortKey() const = 0;
};
} // namespace OpenMM
#endif // __OPENMM_CUDASORT_H__
/**
 * Extract the sorting key from a data value.  SORT_KEY is replaced by the host with the
 * trait's key expression before this source is compiled.
 */
__device__ KEY_TYPE getValue(DATA_TYPE value) {
    const KEY_TYPE key = SORT_KEY;
    return key;
}
extern "C" {
/**
 * Calculate the minimum and maximum key in the array to be sorted.  This kernel
 * is executed as a single thread block and needs blockDim.x*sizeof(KEY_TYPE) bytes
 * of dynamic shared memory.
 *
 * @param data      the values to scan
 * @param length    the number of values in data
 * @param range     output: range[0] receives the minimum key, range[1] the maximum key
 */
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range) {
    extern __shared__ KEY_TYPE rangeBuffer[];
    KEY_TYPE minimum = MAX_KEY;
    KEY_TYPE maximum = MIN_KEY;

    // Each thread calculates the range of a subset of values.

    for (unsigned int index = threadIdx.x; index < length; index += blockDim.x) {
        KEY_TYPE value = getValue(data[index]);
        minimum = min(minimum, value);
        maximum = max(maximum, value);
    }

    // Now reduce them.  First the minimum: after the loop, rangeBuffer[0] holds the block-wide result.

    rangeBuffer[threadIdx.x] = minimum;
    __syncthreads();
    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
        if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
            rangeBuffer[threadIdx.x] = min(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
        __syncthreads();
    }
    minimum = rangeBuffer[0];

    // Every thread must finish reading rangeBuffer[0] before thread 0 overwrites it below.

    __syncthreads();

    // Then the maximum, reusing the same buffer.

    rangeBuffer[threadIdx.x] = maximum;
    __syncthreads();
    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
        if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
            rangeBuffer[threadIdx.x] = max(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
        __syncthreads();
    }
    maximum = rangeBuffer[0];
    if (threadIdx.x == 0) {
        range[0] = minimum;
        range[1] = maximum;
    }
}
/**
 * Assign elements to buckets.  The key range [range[0], range[1]] is divided into
 * numBuckets intervals of equal width, and each element is placed in the interval
 * containing its key.
 *
 * @param data             the values to be sorted
 * @param length           the number of elements in data
 * @param numBuckets       the number of buckets to distribute the elements over
 * @param range            range[0] is the smallest key, range[1] is the largest key
 * @param bucketOffset     one counter per bucket, incremented atomically for every element
 *                         assigned to that bucket.  NOTE(review): presumably zeroed by the
 *                         host before launch - confirm against the calling code.
 * @param bucketOfElement  on exit, the index of the bucket each element was assigned to
 * @param offsetInBucket   on exit, the value of the bucket's counter at the moment each
 *                         element was added, i.e. the element's slot within its bucket
 */
__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
        unsigned int* bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
    float minValue = (float) (range[0]);
    float maxValue = (float) (range[1]);
    float bucketWidth = (maxValue-minValue)/numBuckets;

    // If all keys are equal the width is zero, and the division below would evaluate 0/0.
    // Rather than rely on how NaN converts to an integer, put everything in bucket 0.

    bool hasWidth = (bucketWidth > 0.0f);
    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
        float key = (float) getValue(data[index]);
        unsigned int bucketIndex = 0;
        if (hasWidth) {
            // The largest key lands exactly on numBuckets, so clamp it into the last bucket.
            bucketIndex = min((unsigned int) ((key-minValue)/bucketWidth), numBuckets-1);
        }
        offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
        bucketOfElement[index] = bucketIndex;
    }
}
/**
 * Replace the bucket sizes with their running totals.  On entry bucketOffset[i] is the
 * number of elements in bucket i; on exit it is the sum of the sizes of buckets 0 to i
 * inclusive, so bucket i occupies positions [bucketOffset[i-1], bucketOffset[i]).  This
 * kernel is executed as a single work group: it indexes only by threadIdx.x and blockDim.x.
 *
 * It requires blockDim.x*sizeof(unsigned int) bytes of dynamic shared memory.
 *
 * @param numBuckets    the number of buckets
 * @param bucketOffset  the bucket sizes on entry, their inclusive prefix sum on exit
 */
__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
    extern __shared__ unsigned int posBuffer[];
    unsigned int globalOffset = 0;
    for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
        unsigned int globalIndex = startBucket+threadIdx.x;

        // Wait until every thread has read the total of the previous chunk from
        // posBuffer[blockDim.x-1] before the buffer is overwritten.  The loop bounds are
        // the same for all threads, so every thread reaches this barrier.

        __syncthreads();

        // Load the bucket sizes into local memory.

        posBuffer[threadIdx.x] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
        __syncthreads();

        // Perform a parallel prefix sum.

        for (unsigned int step = 1; step < blockDim.x; step *= 2) {
            unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
            __syncthreads();
            posBuffer[threadIdx.x] += add;
            __syncthreads();
        }

        // Write the results back to global memory.

        if (globalIndex < numBuckets)
            bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;

        // The last entry is the total size of this chunk; carry it over to the next one.

        globalOffset += posBuffer[blockDim.x-1];
    }
}
/**
 * Copy the input data into the buckets for sorting.  Each element is written to the
 * start of its bucket plus its slot within that bucket.
 *
 * @param data             the values to be sorted
 * @param buckets          the array the elements are written to, grouped by bucket
 * @param length           the number of elements in data
 * @param bucketOffset     bucketOffset[i-1] is used as the start of bucket i, and
 *                         bucket 0 starts at position 0
 * @param bucketOfElement  the bucket each element belongs to
 * @param offsetInBucket   the slot of each element within its bucket
 */
__global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* __restrict__ buckets, unsigned int length, const unsigned int* __restrict__ bucketOffset, const unsigned int* __restrict__ bucketOfElement, const unsigned int* __restrict__ offsetInBucket) {
    const unsigned int stride = blockDim.x*gridDim.x;
    for (unsigned int i = blockDim.x*blockIdx.x+threadIdx.x; i < length; i += stride) {
        const unsigned int bucket = bucketOfElement[i];

        // The first bucket has no predecessor, so it starts at the beginning of the array.

        unsigned int bucketStart = 0;
        if (bucket != 0)
            bucketStart = bucketOffset[bucket-1];
        buckets[bucketStart+offsetInBucket[i]] = data[i];
    }
}
/**
 * Sort the data in each bucket.
 *
 * Each thread block processes the buckets blockIdx.x, blockIdx.x+gridDim.x, ...  Bucket i
 * occupies positions [bucketOffset[i-1], bucketOffset[i]) of both arrays (bucket 0 starts
 * at position 0).  A bucket that fits in one block (length <= blockDim.x) is sorted in
 * shared memory; a larger one is copied to the output array and sorted in place there.
 *
 * The kernel requires blockDim.x*sizeof(DATA_TYPE) bytes of dynamic shared memory.
 * NOTE(review): the shared memory path indexes dataBuffer[threadIdx.x^j] for j up to
 * blockDim.x/2, so it presumably requires blockDim.x to be a power of two - confirm
 * against the host code that launches this kernel.
 *
 * @param data          on exit, contains the elements of every bucket in sorted order
 * @param buckets       the elements grouped by bucket, in arbitrary order within a bucket
 * @param numBuckets    the number of buckets
 * @param bucketOffset  the end position of each bucket
 */
__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) {
extern __shared__ DATA_TYPE dataBuffer[];
// The bucket index depends only on blockIdx.x, so all threads of a block take the same
// branch below and reach the same barriers.
for (unsigned int index = blockIdx.x; index < numBuckets; index += gridDim.x) {
unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
unsigned int endIndex = bucketOffset[index];
unsigned int length = endIndex-startIndex;
if (length <= blockDim.x) {
// Load the data into local memory.  Unused slots are padded with MAX_VALUE.
if (threadIdx.x < length)
dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
else
dataBuffer[threadIdx.x] = MAX_VALUE;
__syncthreads();
// Perform a bitonic sort in local memory.
for (unsigned int k = 2; k <= blockDim.x; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
// Each pair (threadIdx.x, ixj) is handled by the thread with the lower index.
int ixj = threadIdx.x^j;
if (ixj > threadIdx.x) {
DATA_TYPE value1 = dataBuffer[threadIdx.x];
DATA_TYPE value2 = dataBuffer[ixj];
// Bit k of the thread index selects the direction of this comparison.
bool ascending = (threadIdx.x&k) == 0;
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
// Swap the two elements if they are in the wrong order for this direction.
if (lowKey > highKey) {
dataBuffer[threadIdx.x] = value2;
dataBuffer[ixj] = value1;
}
}
__syncthreads();
}
}
// Write the data to the sorted array.  Only the first length slots are written,
// so the padding slots are never copied out.
if (threadIdx.x < length)
data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
}
else {
// Copy the bucket data over to the output array.
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
data[startIndex+i] = buckets[startIndex+i];
// Fence and barrier so that the copied elements are visible to the whole block
// before any thread starts comparing them.
__threadfence_block();
__syncthreads();
// Perform a bitonic sort in global memory.
for (unsigned int k = 2; k < 2*length; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
// Each thread handles the elements threadIdx.x, threadIdx.x+blockDim.x, ...
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
int ixj = i^j;
// Pairs whose partner lies beyond the end of the bucket are skipped.
if (ixj > i && ixj < length) {
DATA_TYPE value1 = data[startIndex+i];
DATA_TYPE value2 = data[startIndex+ixj];
bool ascending = ((i&k) == 0);
// The direction is flipped once for every power of two from 2*k up to
// (but excluding) 2*length whose bit is clear in i.
// NOTE(review): presumably this adapts the network to bucket lengths
// that are not a power of two - confirm.
for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending);
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
data[startIndex+i] = value2;
data[startIndex+ixj] = value1;
}
}
}
// Make the swaps of this step visible to the block before the next step reads them.
__threadfence_block();
__syncthreads();
}
}
}
}
}
}
\ No newline at end of file
extern "C" {
/** /**
* This is called by the various functions below to clear a buffer. * This is called by the various functions below to clear a buffer.
*/ */
...@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest ...@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest
buffer[index] = sum; buffer[index] = sum;
} }
} }
}
\ No newline at end of file
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of sorting.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "../src/CudaArray.h"
#include "../src/CudaContext.h"
#include "../src/CudaSort.h"
#include "sfmt/SFMT.h"
#include "openmm/System.h"
#include <iostream>
#include <cmath>
#include <set>
using namespace OpenMM;
using namespace std;
/**
 * The sort trait used by these tests.  Each element is a single float, which also
 * serves as its own sorting key.
 */
class SortTrait : public CudaSort::SortTrait {
    // Both the data value and the key are reported as 4 bytes in size.
    int getDataSize() const {
        return 4;
    }
    int getKeySize() const {
        return 4;
    }

    // The data and key types, as they are to be written in the CUDA code.
    const char* getDataType() const {
        return "float";
    }
    const char* getKeyType() const {
        return "float";
    }

    // The limits of the key range, and a value whose key is the upper limit.
    const char* getMinKey() const {
        return "-MAXFLOAT";
    }
    const char* getMaxKey() const {
        return "MAXFLOAT";
    }
    const char* getMaxValue() const {
        return "MAXFLOAT";
    }

    // The key is the data value itself.
    const char* getSortKey() const {
        return "value";
    }
};
/**
 * Sort an array of floats with CudaSort, then check that the result is in
 * non-decreasing order and contains exactly the same values as the input.
 */
void verifySorting(vector<float> array) {
    // Create a context for a System containing a single particle.

    System system;
    system.addParticle(0.0);
    CudaPlatform platform;
    CudaPlatform::PlatformData platformData(system, "", "true", "single",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()),
            platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();

    // Upload the values, sort them, and retrieve the result.

    CudaArray deviceData(array.size(), 4, "sortData");
    deviceData.upload(array);
    CudaSort sorter(context, new SortTrait(), array.size());
    sorter.sort(deviceData);
    vector<float> result;
    deviceData.download(result);

    // Every element must be no larger than the one that follows it.

    for (size_t i = 0; i+1 < result.size(); i++)
        ASSERT(result[i] <= result[i+1]);

    // The result must be a permutation of the input.

    multiset<float> expected(array.begin(), array.end());
    multiset<float> found(result.begin(), result.end());
    ASSERT(expected == found);
}
void testUniformValues()
{
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) genrand_real2(sfmt);
verifySorting(array);
}
void testLogValues()
{
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
}
/**
 * Run all the tests.  Prints "Done" and returns 0 if they all pass; otherwise prints
 * the message of the exception that was thrown and returns 1.
 */
int main() {
    bool failed = false;
    try {
        testUniformValues();
        testLogValues();
    }
    catch (const exception& e) {
        cout << "exception: " << e.what() << endl;
        failed = true;
    }
    if (failed)
        return 1;
    cout << "Done" << endl;
    return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment