Commit 3e16cab9 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing to implement new CUDA platform

parent abb8cb4b
......@@ -83,7 +83,7 @@ public:
/**
* Get a pointer to the device memory.
*/
CUdeviceptr getDevicePointer() {
CUdeviceptr& getDevicePointer() {
return pointer;
}
/**
......
......@@ -31,7 +31,6 @@
#include "CudaContext.h"
#include "CudaArray.h"
//#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaForceInfo.h"
//#include "CudaIntegrationUtilities.h"
#include "CudaKernelSources.h"
......@@ -53,7 +52,7 @@
#define CHECK_RESULT2(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<prefix<<": "<<result<<" ("<<__FILE__<<": "<<__LINE__<<")"; \
m<<prefix<<": "<<getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
......@@ -66,7 +65,7 @@ bool CudaContext::hasInitializedCuda = false;
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, CudaPlatform::PlatformData& platformData) : system(system), compiler(compiler),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), posq(NULL),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
velm(NULL), /*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL),*/ thread(NULL) {
if (!hasInitializedCuda) {
......@@ -88,7 +87,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
else
throw OpenMMException("Illegal value for CudaPrecision: "+precision);
#ifdef WIN32
this->tempDir = tempDir+"\";
this->tempDir = tempDir+"\\";
#else
this->tempDir = tempDir+"/";
#endif
......@@ -114,6 +113,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
deviceIndex = i;
bestSpeed = speed;
bestCompute = major;
gpuArchitecture = intToString(major)+intToString(minor);
}
}
}
......@@ -121,37 +121,47 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
throw OpenMMException("No compatible CUDA device is available");
CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
this->deviceIndex = deviceIndex;
int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
gpuArchitecture = CudaExpressionUtilities::intToString(major)+CudaExpressionUtilities::intToString(minor);
compilationDefines["WORK_GROUP_SIZE"] = CudaExpressionUtilities::intToString(ThreadBlockSize);
compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
defaultOptimizationOptions = "--use_fast_math";
int numThreadBlocksPerComputeUnit = 6;
CHECK_RESULT(cuCtxCreate(&context, 0, device));
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
CHECK_RESULT(cuCtxCreate(&context, flags, device));
contextIsValid = true;
numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
int numThreadBlocksPerComputeUnit = 6;
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
// bonded = new CudaBondedUtilities(*this);
// nonbonded = new CudaNonbondedUtilities(*this);
posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
if (useDoublePrecision) {
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(double4), 0));
posq = CudaArray::create<double4>(paddedNumAtoms, "posq");
velm = CudaArray::create<double4>(paddedNumAtoms, "velm");
}
else {
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, paddedNumAtoms*sizeof(float4), 0));
posq = CudaArray::create<float4>(paddedNumAtoms, "posq");
velm = CudaArray::create<float4>(paddedNumAtoms, "velm");
}
posCellOffsets.resize(paddedNumAtoms, make_int4(0, 0, 0, 0));
// Create utility kernels that are used in multiple places.
CUmodule utilities = createModule(CudaKernelSources::vectorOps+CudaKernelSources::utilities);
cuModuleGetFunction(&clearBufferKernel, utilities, "clearBuffer");
cuModuleGetFunction(&clearTwoBuffersKernel, utilities, "clearTwoBuffers");
cuModuleGetFunction(&clearThreeBuffersKernel, utilities, "clearThreeBuffers");
cuModuleGetFunction(&clearFourBuffersKernel, utilities, "clearFourBuffers");
cuModuleGetFunction(&clearFiveBuffersKernel, utilities, "clearFiveBuffers");
cuModuleGetFunction(&clearSixBuffersKernel, utilities, "clearSixBuffers");
cuModuleGetFunction(&reduceFloat4Kernel, utilities, "reduceFloat4Buffer");
cuModuleGetFunction(&reduceForcesKernel, utilities, "reduceForces");
clearBufferKernel = getKernel(utilities, "clearBuffer");
clearTwoBuffersKernel = getKernel(utilities, "clearTwoBuffers");
clearThreeBuffersKernel = getKernel(utilities, "clearThreeBuffers");
clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
reduceFloat4Kernel = getKernel(utilities, "reduceFloat4Buffer");
reduceForcesKernel = getKernel(utilities, "reduceForces");
// Set defines based on the requested precision.
......@@ -175,6 +185,8 @@ CudaContext::~CudaContext() {
delete forces[i];
for (int i = 0; i < (int) reorderListeners.size(); i++)
delete reorderListeners[i];
if (pinnedBuffer != NULL)
cuMemFreeHost(pinnedBuffer);
if (posq != NULL)
delete posq;
if (velm != NULL)
......@@ -202,38 +214,29 @@ CudaContext::~CudaContext() {
CHECK_RESULT(cuCtxDestroy(context));
}
//void CudaContext::initialize() {
// for (int i = 0; i < numAtoms; i++) {
// double mass = system.getParticleMass(i);
// (*velm)[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
// }
// velm->upload();
/**
 * Finish setting up the context: upload per-atom inverse masses, allocate the force,
 * energy and atom index arrays, register the buffers that must be auto-cleared, and
 * identify the molecules in the System.
 */
void CudaContext::initialize() {
    // Stage one (0, 0, 0, 1/mass) entry per atom slot in the pinned host buffer.  Massless
    // particles get an inverse mass of 0.  The buffer holds paddedNumAtoms entries and
    // cuMemHostAlloc does not promise zeroed memory, so the padding slots past numAtoms are
    // written too (as massless, motionless atoms) instead of being left uninitialized.
    // NOTE(review): upload() presumably copies every element of velm - confirm in CudaArray.
    for (int i = 0; i < paddedNumAtoms; i++) {
        double mass = (i < numAtoms ? system.getParticleMass(i) : 0.0);
        if (useDoublePrecision)
            ((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
        else
            ((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
    }
    velm->upload(pinnedBuffer);

    // Legacy code kept for reference while the remaining pieces are brought over.
//    bonded->initialize(system);
//    numForceBuffers = platformData.contexts.size();
//    numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
//    for (int i = 0; i < (int) forces.size(); i++)
//        numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
//    forceBuffers = new CudaArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);
//    if (supports64BitGlobalAtomics) {
//        longForceBuffer = new CudaArray<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer", false);
//        reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
//        reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
//        reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
//        reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
//        addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
//    }
//    addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
//    force = new CudaArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true);
//    energyBuffer = new CudaArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer", true);
//    addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
//    atomIndex = new CudaArray<cl_int>(*this, paddedNumAtoms, "atomIndex", true);
//    for (int i = 0; i < paddedNumAtoms; ++i)
//        (*atomIndex)[i] = i;
//    atomIndex->upload();
//    findMoleculeGroups();
//    moleculesInvalid = false;

    // Forces are accumulated as long3 values.  Autoclear sizes are counted in 4-byte words, so
    // derive the count from the element size (as clearBuffer(CudaArray&) does) rather than
    // hard-coding 6, which is only right when a long3 occupies 24 bytes.
    force = CudaArray::create<long3>(paddedNumAtoms, "force");
    addAutoclearBuffer(force->getDevicePointer(), force->getSize()*force->getElementSize()/4);
    energyBuffer = CudaArray::create<float>(numThreadBlocks*ThreadBlockSize, "energyBuffer");
    addAutoclearBuffer(energyBuffer->getDevicePointer(), energyBuffer->getSize());

    // Atoms start out in their original order: slot i holds atom i.
    atomIndexDevice = CudaArray::create<int>(paddedNumAtoms, "atomIndex");
    atomIndex.resize(paddedNumAtoms);
    for (int i = 0; i < paddedNumAtoms; ++i)
        atomIndex[i] = i;
    atomIndexDevice->upload(atomIndex);
    findMoleculeGroups();
    moleculesInvalid = false;
//    nonbonded->initialize(system);
}
void CudaContext::addForce(CudaForceInfo* force) {
forces.push_back(force);
......@@ -315,7 +318,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUresult result = cuModuleLoad(&module, outputFile.c_str());
if (result != CUDA_SUCCESS) {
std::stringstream m;
m<<"Error loading CUDA module: "<<result;
m<<"Error loading CUDA module: "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(m.str());
}
remove(inputFile.c_str());
......@@ -329,52 +332,109 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
remove(logFile.c_str());
throw;
}
//
// // Get length before using c_str() to avoid length() call invalidating the c_str() value.
// string src_string = src.str();
// ::size_t src_length = src_string.length();
// cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
// cl::Program program(context, sources);
// try {
// program.build(vector<cl::Device>(1, device), options.c_str());
// } catch (cl::Error err) {
// throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
// }
}
//
//void CudaContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
// if (blockSize == -1)
// blockSize = ThreadBlockSize;
// int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
// try {
// queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
// }
// catch (cl::Error err) {
// stringstream str;
// str<<"Error invoking kernel "<<kernel.getInfo<CL_KERNEL_FUNCTION_NAME>()<<": "<<err.what()<<" ("<<err.err()<<")";
// throw OpenMMException(str.str());
// }
//}
//
//void CudaContext::clearBuffer(CudaArray<float>& array) {
// clearBuffer(array.getDeviceBuffer(), array.getSize());
//}
//
//void CudaContext::clearBuffer(CudaArray<mm_float4>& array) {
// clearBuffer(array.getDeviceBuffer(), array.getSize()*4);
//}
//
//void CudaContext::clearBuffer(cl::Memory& memory, int size) {
// clearBufferKernel.setArg<cl::Memory>(0, memory);
// clearBufferKernel.setArg<cl_int>(1, size);
// executeKernel(clearBufferKernel, size, 128);
//}
//
//void CudaContext::addAutoclearBuffer(cl::Memory& memory, int size) {
// autoclearBuffers.push_back(&memory);
// autoclearBufferSizes.push_back(size);
//}
//
/**
 * Look up a kernel by name in a loaded module.
 *
 * @param module  the module to search
 * @param name    the name of the kernel function
 * @return the handle reported by cuModuleGetFunction
 * @throws OpenMMException if the lookup does not return CUDA_SUCCESS
 */
CUfunction CudaContext::getKernel(CUmodule& module, const string& name) {
    CUfunction kernel;
    CUresult status = cuModuleGetFunction(&kernel, module, name.c_str());
    if (status == CUDA_SUCCESS)
        return kernel;

    // The lookup failed: report both the symbolic name and the numeric code.
    std::stringstream message;
    message<<"Error creating kernel "<<name<<": "<<getErrorString(status)<<" ("<<status<<")";
    throw OpenMMException(message.str());
}
/**
 * Format a floating point value in scientific notation.  In double precision mode 16
 * digits follow the decimal point; otherwise 8 digits are used and an "f" suffix is
 * appended.
 */
string CudaContext::doubleToString(double value) {
    stringstream text;
    if (useDoublePrecision)
        text.precision(16);
    else
        text.precision(8);
    text << scientific << value;
    string formatted = text.str();
    if (!useDoublePrecision)
        formatted += "f";
    return formatted;
}
/**
 * Convert an integer to its decimal string representation.
 */
string CudaContext::intToString(int value) {
    ostringstream digits;
    digits << value;
    return digits.str();
}
/**
 * Translate a CUDA driver result code into the name of its enumeration constant.
 *
 * @param result  the code returned by a driver API call
 * @return the constant's name, or "Invalid error code" if the value is not listed below
 */
std::string CudaContext::getErrorString(CUresult result) {
    // Each entry expands to `case X: return "X";`, so the returned text is always exactly
    // the spelling of the constant it belongs to.
#define CUDA_CONTEXT_ERROR_CASE(code) case code: return #code;
    switch (result) {
        CUDA_CONTEXT_ERROR_CASE(CUDA_SUCCESS)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_INVALID_VALUE)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_OUT_OF_MEMORY)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_NOT_INITIALIZED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_DEINITIALIZED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_PROFILER_DISABLED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_PROFILER_NOT_INITIALIZED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_PROFILER_ALREADY_STARTED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_PROFILER_ALREADY_STOPPED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_NO_DEVICE)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_INVALID_DEVICE)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_INVALID_IMAGE)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_INVALID_CONTEXT)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_CONTEXT_ALREADY_CURRENT)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_MAP_FAILED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_UNMAP_FAILED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_ARRAY_IS_MAPPED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_ALREADY_MAPPED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_NO_BINARY_FOR_GPU)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_ALREADY_ACQUIRED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_NOT_MAPPED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_NOT_MAPPED_AS_ARRAY)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_NOT_MAPPED_AS_POINTER)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_ECC_UNCORRECTABLE)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_UNSUPPORTED_LIMIT)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_CONTEXT_ALREADY_IN_USE)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_INVALID_SOURCE)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_FILE_NOT_FOUND)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_OPERATING_SYSTEM)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_INVALID_HANDLE)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_NOT_FOUND)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_NOT_READY)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_LAUNCH_FAILED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_LAUNCH_TIMEOUT)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_PEER_ACCESS_NOT_ENABLED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_CONTEXT_IS_DESTROYED)
        CUDA_CONTEXT_ERROR_CASE(CUDA_ERROR_UNKNOWN)
    }
#undef CUDA_CONTEXT_ERROR_CASE
    return "Invalid error code";
}
/**
 * Launch a kernel on a one dimensional grid.
 *
 * @param kernel      the kernel to launch
 * @param arguments   array of pointers to the kernel's arguments
 * @param threads     the number of threads requested; the grid is sized to cover this
 *                    many threads but never exceeds numThreadBlocks blocks
 * @param blockSize   threads per block, or -1 to use ThreadBlockSize
 * @param sharedSize  bytes of dynamic shared memory per block
 * @throws OpenMMException if cuLaunchKernel does not return CUDA_SUCCESS
 */
void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
    int threadsPerBlock = blockSize;
    if (threadsPerBlock == -1)
        threadsPerBlock = ThreadBlockSize;

    // Round up so a partial final block is still launched, then cap the grid size.
    int blocksNeeded = (threads+threadsPerBlock-1)/threadsPerBlock;
    int gridSize = std::min(blocksNeeded, numThreadBlocks);
    CUresult status = cuLaunchKernel(kernel, gridSize, 1, 1, threadsPerBlock, 1, 1, sharedSize, 0, arguments, NULL);
    if (status == CUDA_SUCCESS)
        return;
    stringstream message;
    message<<"Error invoking kernel: "<<getErrorString(status)<<" ("<<status<<")";
    throw OpenMMException(message.str());
}
/**
 * Set all elements of an array to 0.  The array's length is converted to a count of
 * 4-byte words before being passed to the pointer based overload.
 */
void CudaContext::clearBuffer(CudaArray& array) {
    int numWords = array.getSize()*array.getElementSize()/4;
    clearBuffer(array.getDevicePointer(), numWords);
}
/**
 * Run the clearBuffer kernel on a block of device memory.
 *
 * @param memory  the device memory to clear
 * @param size    the size value handed to the kernel, which is also used as the number
 *                of threads requested
 */
void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
    // Kernel arguments are passed as an array of pointers to the individual values.
    void* kernelArgs[2];
    kernelArgs[0] = &memory;
    kernelArgs[1] = &size;
    executeKernel(clearBufferKernel, kernelArgs, size, 128);
}
/**
 * Record a buffer in the autoclear lists.  The pointer and its size are stored at the
 * same index of two parallel vectors.
 *
 * @param memory  the device memory to register
 * @param size    the size to record for the buffer
 */
void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
    autoclearBufferSizes.push_back(size);
    autoclearBuffers.push_back(memory);
}
//void CudaContext::clearAutoclearBuffers() {
// int base = 0;
// int total = autoclearBufferSizes.size();
......@@ -454,219 +514,217 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
// executeKernel(reduceFloat4Kernel, bufferSize, 128);
//}
//
//void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
// // Recursively tag atoms as belonging to a particular molecule.
//
// atomMolecule[atom] = molecule;
// for (int i = 0; i < (int) atomBonds[atom].size(); i++)
// if (atomMolecule[atomBonds[atom][i]] == -1)
// tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
//}
//
///**
// * This class ensures that atom reordering doesn't break virtual sites.
// */
//class CudaContext::VirtualSiteInfo : public CudaForceInfo {
//public:
// VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
// for (int i = 0; i < system.getNumParticles(); i++) {
// if (system.isVirtualSite(i)) {
// siteTypes.push_back(&typeid(system.getVirtualSite(i)));
// vector<int> particles;
// particles.push_back(i);
// for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
// particles.push_back(system.getVirtualSite(i).getParticle(j));
// siteParticles.push_back(particles);
// vector<double> weights;
// if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A two particle average.
//
// const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1));
// }
// else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A three particle average.
//
// const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1));
// weights.push_back(site.getWeight(2));
// }
// else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
// // An out of plane site.
//
// const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight12());
// weights.push_back(site.getWeight13());
// weights.push_back(site.getWeightCross());
// }
// siteWeights.push_back(weights);
// }
// }
// }
// int getNumParticleGroups() {
// return siteTypes.size();
// }
// void getParticlesInGroup(int index, std::vector<int>& particles) {
// particles = siteParticles[index];
// }
// bool areGroupsIdentical(int group1, int group2) {
// if (siteTypes[group1] != siteTypes[group2])
// return false;
// int numParticles = siteWeights[group1].size();
// if (siteWeights[group2].size() != numParticles)
// return false;
// for (int i = 0; i < numParticles; i++)
// if (siteWeights[group1][i] != siteWeights[group2][i])
// return false;
// return true;
// }
//private:
// vector<const type_info*> siteTypes;
// vector<vector<int> > siteParticles;
// vector<vector<double> > siteWeights;
//};
//
//
//void CudaContext::findMoleculeGroups() {
// // The first time this is called, we need to identify all the molecules in the system.
//
// if (moleculeGroups.size() == 0) {
// // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
//
// addForce(new VirtualSiteInfo(system));
//
// // First make a list of every other atom to which each atom is connect by a constraint or force group.
//
// vector<vector<int> > atomBonds(system.getNumParticles());
// for (int i = 0; i < system.getNumConstraints(); i++) {
// int particle1, particle2;
// double distance;
// system.getConstraintParameters(i, particle1, particle2, distance);
// atomBonds[particle1].push_back(particle2);
// atomBonds[particle2].push_back(particle1);
// }
// for (int i = 0; i < (int) forces.size(); i++) {
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// vector<int> particles;
// forces[i]->getParticlesInGroup(j, particles);
// for (int k = 0; k < (int) particles.size(); k++)
// for (int m = 0; m < (int) particles.size(); m++)
// if (k != m)
// atomBonds[particles[k]].push_back(particles[m]);
// }
// }
//
// // Now tag atoms by which molecule they belong to.
//
// vector<int> atomMolecule(numAtoms, -1);
// int numMolecules = 0;
// for (int i = 0; i < numAtoms; i++)
// if (atomMolecule[i] == -1)
// tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
// vector<vector<int> > atomIndices(numMolecules);
// for (int i = 0; i < numAtoms; i++)
// atomIndices[atomMolecule[i]].push_back(i);
//
// // Construct a description of each molecule.
//
// molecules.resize(numMolecules);
// for (int i = 0; i < numMolecules; i++) {
// molecules[i].atoms = atomIndices[i];
// molecules[i].groups.resize(forces.size());
// }
// for (int i = 0; i < system.getNumConstraints(); i++) {
// int particle1, particle2;
// double distance;
// system.getConstraintParameters(i, particle1, particle2, distance);
// molecules[atomMolecule[particle1]].constraints.push_back(i);
// }
// for (int i = 0; i < (int) forces.size(); i++)
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// vector<int> particles;
// forces[i]->getParticlesInGroup(j, particles);
// molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
// }
// }
//
// // Sort them into groups of identical molecules.
//
// vector<Molecule> uniqueMolecules;
// vector<vector<int> > moleculeInstances;
// vector<vector<int> > moleculeOffsets;
// for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
// Molecule& mol = molecules[molIndex];
//
// // See if it is identical to another molecule.
//
// bool isNew = true;
// for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
// Molecule& mol2 = uniqueMolecules[j];
// bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
//
// // See if the atoms are identical.
//
// int atomOffset = mol2.atoms[0]-mol.atoms[0];
// for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
// if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
// identical = false;
// for (int k = 0; k < (int) forces.size(); k++)
// if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
// identical = false;
// }
//
// // See if the constraints are identical.
//
// for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
// int c1particle1, c1particle2, c2particle1, c2particle2;
// double distance1, distance2;
// system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
// system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
// if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
// identical = false;
// }
//
// // See if the force groups are identical.
//
// for (int i = 0; i < (int) forces.size() && identical; i++) {
// if (mol.groups[i].size() != mol2.groups[i].size())
// identical = false;
// for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
// if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
// identical = false;
// }
// if (identical) {
// moleculeInstances[j].push_back(molIndex);
// moleculeOffsets[j].push_back(mol.atoms[0]);
// isNew = false;
// }
// }
// if (isNew) {
// uniqueMolecules.push_back(mol);
// moleculeInstances.push_back(vector<int>());
// moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
// moleculeOffsets.push_back(vector<int>());
// moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
// }
// }
// moleculeGroups.resize(moleculeInstances.size());
// for (int i = 0; i < (int) moleculeInstances.size(); i++)
// {
// moleculeGroups[i].instances = moleculeInstances[i];
// moleculeGroups[i].offsets = moleculeOffsets[i];
// vector<int>& atoms = uniqueMolecules[i].atoms;
// moleculeGroups[i].atoms.resize(atoms.size());
// for (int j = 0; j < (int) atoms.size(); j++)
// moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
// }
//}
//
//void CudaContext::invalidateMolecules() {
// moleculesInvalid = true;
//}
//
//
void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
// Recursively tag atoms as belonging to a particular molecule.
atomMolecule[atom] = molecule;
for (int i = 0; i < (int) atomBonds[atom].size(); i++)
if (atomMolecule[atomBonds[atom][i]] == -1)
tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
}
/**
 * This class ensures that atom reordering doesn't break virtual sites.
 *
 * Each virtual site in the System becomes one particle group made up of the site itself
 * followed by the particles it depends on.  Two groups are reported as identical only
 * when the sites have the same type and the same weights.
 */
class CudaContext::VirtualSiteInfo : public CudaForceInfo {
public:
    /**
     * Record the type, particles and weights of every virtual site in a System.
     */
    VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
        for (int i = 0; i < system.getNumParticles(); i++) {
            if (!system.isVirtualSite(i))
                continue;
            siteTypes.push_back(&typeid(system.getVirtualSite(i)));

            // The group lists the site first, then the particles it is computed from.
            vector<int> particles;
            particles.push_back(i);
            for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
                particles.push_back(system.getVirtualSite(i).getParticle(j));
            siteParticles.push_back(particles);

            // Collect the weights for the site types that are recognized here.  Any other
            // type of site is recorded with an empty weight list.
            vector<double> weights;
            const TwoParticleAverageSite* twoSite = dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i));
            const ThreeParticleAverageSite* threeSite = (twoSite != NULL ? NULL : dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)));
            const OutOfPlaneSite* planeSite = (twoSite != NULL || threeSite != NULL ? NULL : dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)));
            if (twoSite != NULL) {
                // A two particle average.
                weights.push_back(twoSite->getWeight(0));
                weights.push_back(twoSite->getWeight(1));
            }
            else if (threeSite != NULL) {
                // A three particle average.
                weights.push_back(threeSite->getWeight(0));
                weights.push_back(threeSite->getWeight(1));
                weights.push_back(threeSite->getWeight(2));
            }
            else if (planeSite != NULL) {
                // An out of plane site.
                weights.push_back(planeSite->getWeight12());
                weights.push_back(planeSite->getWeight13());
                weights.push_back(planeSite->getWeightCross());
            }
            siteWeights.push_back(weights);
        }
    }
    /**
     * Get the number of particle groups, which is one per virtual site.
     */
    int getNumParticleGroups() {
        return siteTypes.size();
    }
    /**
     * Get the particles in a group: the virtual site followed by the particles it depends on.
     */
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        particles = siteParticles[index];
    }
    /**
     * Decide whether two virtual sites have the same type and the same weights.
     */
    bool areGroupsIdentical(int group1, int group2) {
        // Compare the type_info objects rather than their addresses: two distinct
        // type_info objects are allowed to describe the same type.
        if (!(*siteTypes[group1] == *siteTypes[group2]))
            return false;
        // Vector equality checks the lengths and then every weight in turn.
        return siteWeights[group1] == siteWeights[group2];
    }
private:
    vector<const type_info*> siteTypes;
    vector<vector<int> > siteParticles;
    vector<vector<double> > siteWeights;
};
/**
 * Identify the molecules in the System and sort them into groups of identical molecules.
 *
 * Atoms are joined into a molecule when they share a constraint or appear together in a
 * particle group of any registered CudaForceInfo.  The list of molecules is only built
 * while moleculeGroups is empty; the grouping of molecules into moleculeGroups is
 * rebuilt on every call.
 */
void CudaContext::findMoleculeGroups() {
    // The first time this is called, we need to identify all the molecules in the system.

    if (moleculeGroups.size() == 0) {
        // Add a ForceInfo that makes sure reordering doesn't break virtual sites.

        addForce(new VirtualSiteInfo(system));

        // First make a list of every other atom to which each atom is connected by a constraint or force group.

        vector<vector<int> > atomBonds(system.getNumParticles());
        for (int i = 0; i < system.getNumConstraints(); i++) {
            int particle1, particle2;
            double distance;
            system.getConstraintParameters(i, particle1, particle2, distance);
            atomBonds[particle1].push_back(particle2);
            atomBonds[particle2].push_back(particle1);
        }
        for (int i = 0; i < (int) forces.size(); i++) {
            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
                vector<int> particles;
                forces[i]->getParticlesInGroup(j, particles);
                for (int k = 0; k < (int) particles.size(); k++)
                    for (int m = 0; m < (int) particles.size(); m++)
                        if (k != m)
                            atomBonds[particles[k]].push_back(particles[m]);
            }
        }

        // Now tag atoms by which molecule they belong to.

        vector<int> atomMolecule(numAtoms, -1);
        int numMolecules = 0;
        for (int i = 0; i < numAtoms; i++)
            if (atomMolecule[i] == -1)
                tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
        vector<vector<int> > atomIndices(numMolecules);
        for (int i = 0; i < numAtoms; i++)
            atomIndices[atomMolecule[i]].push_back(i);

        // Construct a description of each molecule.

        molecules.resize(numMolecules);
        for (int i = 0; i < numMolecules; i++) {
            molecules[i].atoms = atomIndices[i];
            molecules[i].groups.resize(forces.size());
        }
        for (int i = 0; i < system.getNumConstraints(); i++) {
            int particle1, particle2;
            double distance;
            system.getConstraintParameters(i, particle1, particle2, distance);
            molecules[atomMolecule[particle1]].constraints.push_back(i);
        }
        for (int i = 0; i < (int) forces.size(); i++)
            for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
                vector<int> particles;
                forces[i]->getParticlesInGroup(j, particles);

                // A group is filed under the molecule of its first particle.  A group with no
                // particles belongs to no molecule, so skip it instead of reading particles[0].

                if (!particles.empty())
                    molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
            }
    }

    // Sort them into groups of identical molecules.

    vector<Molecule> uniqueMolecules;
    vector<vector<int> > moleculeInstances;
    vector<vector<int> > moleculeOffsets;
    for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
        Molecule& mol = molecules[molIndex];

        // See if it is identical to another molecule.

        bool isNew = true;
        for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
            Molecule& mol2 = uniqueMolecules[j];
            bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());

            // See if the atoms are identical.  Every atom of one molecule must sit at the same
            // fixed index offset from the matching atom of the other.

            int atomOffset = mol2.atoms[0]-mol.atoms[0];
            for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
                if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
                    identical = false;

                // Stop asking the forces as soon as a mismatch is known.

                for (int k = 0; k < (int) forces.size() && identical; k++)
                    if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
                        identical = false;
            }

            // See if the constraints are identical.

            for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
                int c1particle1, c1particle2, c2particle1, c2particle2;
                double distance1, distance2;
                system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
                system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
                if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
                    identical = false;
            }

            // See if the force groups are identical.

            for (int i = 0; i < (int) forces.size() && identical; i++) {
                if (mol.groups[i].size() != mol2.groups[i].size())
                    identical = false;
                for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
                    if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
                        identical = false;
            }
            if (identical) {
                moleculeInstances[j].push_back(molIndex);
                moleculeOffsets[j].push_back(mol.atoms[0]);
                isNew = false;
            }
        }
        if (isNew) {
            // No existing group matched, so this molecule starts a new one.

            uniqueMolecules.push_back(mol);
            moleculeInstances.push_back(vector<int>());
            moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
            moleculeOffsets.push_back(vector<int>());
            moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
        }
    }

    // Store each group with its atom indices expressed relative to the molecule's first atom.

    moleculeGroups.resize(moleculeInstances.size());
    for (int i = 0; i < (int) moleculeInstances.size(); i++)
    {
        moleculeGroups[i].instances = moleculeInstances[i];
        moleculeGroups[i].offsets = moleculeOffsets[i];
        vector<int>& atoms = uniqueMolecules[i].atoms;
        moleculeGroups[i].atoms.resize(atoms.size());
        for (int j = 0; j < (int) atoms.size(); j++)
            moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
    }
}
/**
 * Flag the stored molecule information as out of date by setting moleculesInvalid.
 * initialize() clears the flag again after it has called findMoleculeGroups().
 */
void CudaContext::invalidateMolecules() {
moleculesInvalid = true;
}
//void OpenCLContext::validateMolecules() {
// moleculesInvalid = false;
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
......
......@@ -72,11 +72,11 @@ public:
CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData);
~CudaContext();
// /**
// * This is called to initialize internal data structures after all Forces in the system
// * have been initialized.
// */
// void initialize();
/**
* This is called to initialize internal data structures after all Forces in the system
* have been initialized.
*/
void initialize();
/**
* Add a CudaForce to this context.
*/
......@@ -123,12 +123,12 @@ public:
CudaArray& getVelm() {
// velm is a pointer member that the constructor initializes to NULL and that is
// dereferenced here without a check - presumably callers only use this accessor
// after the array has been created; TODO confirm.
return *velm;
}
// /**
// * Get the array which contains the force on each atom.
// */
// CudaArray<mm_float4>& getForce() {
// return *force;
// }
/**
* Get the array which contains the force on each atom (represented as a long3 in 64 bit fixed point).
*/
CudaArray& getForce() {
return *force;
}
// /**
// * Get the array which contains the buffers in which forces are computed.
// */
......@@ -184,36 +184,41 @@ public:
* omitted, a default set of options will be used
*/
CUmodule createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL);
// /**
// * Execute a kernel.
// *
// * @param kernel the kernel to execute
// * @param workUnits the maximum number of work units that should be used
// * @param blockSize the size of each thread block to use
// */
// void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<float>& array);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<mm_float4>& array);
// /**
// * Set all elements of an array to 0.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void clearBuffer(cl::Memory& memory, int size);
// /**
// * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void addAutoclearBuffer(cl::Memory& memory, int size);
/**
* Get a kernel from a CUDA module.
*
* @param module the module to get the kernel from
* @param name the name of the kernel to get
*/
CUfunction getKernel(CUmodule& module, const std::string& name);
/**
* Execute a kernel.
*
* @param kernel the kernel to execute
* @param arguments an array of pointers to the kernel arguments
* @param workUnits the maximum number of threads that should be used
* @param blockSize the size of each thread block to use
* @param sharedSize the amount of dynamic shared memory to allocate for the kernel, in bytes
*/
void executeKernel(CUfunction kernel, void** arguments, int workUnits, int blockSize = -1, unsigned int sharedSize = 0);
/**
* Set all elements of an array to 0.
*/
void clearBuffer(CudaArray& array);
/**
* Set all elements of an array to 0.
*
* @param memory the memory to clear
* @param size the number of 4-byte elements in the buffer
*/
void clearBuffer(CUdeviceptr memory, int size);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the memory to clear
* @param size the number of float/double elements in the buffer
*/
void addAutoclearBuffer(CUdeviceptr memory, int size);
// /**
// * Clear all buffers that have been registered with addAutoclearBuffer().
// */
......@@ -230,108 +235,110 @@ public:
// * Sum the buffers containing forces.
// */
// void reduceForces();
// /**
// * Get the current simulation time.
// */
// double getTime() {
// return time;
// }
// /**
// * Set the current simulation time.
// */
// void setTime(double t) {
// time = t;
// }
// /**
// * Get the number of integration steps that have been taken.
// */
// int getStepCount() {
// return stepCount;
// }
// /**
// * Set the number of integration steps that have been taken.
// */
// void setStepCount(int steps) {
// stepCount = steps;
// }
// /**
// * Get the number of times forces or energy has been computed.
// */
// int getComputeForceCount() {
// return computeForceCount;
// }
// /**
// * Set the number of times forces or energy has been computed.
// */
// void setComputeForceCount(int count) {
// computeForceCount = count;
// }
// /**
// * Get the number of atoms.
// */
// int getNumAtoms() const {
// return numAtoms;
// }
// /**
// * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
// * most arrays with one element per atom.
// */
// int getPaddedNumAtoms() const {
// return paddedNumAtoms;
// }
// /**
// * Get the number of blocks of TileSize atoms.
// */
// int getNumAtomBlocks() const {
// return numAtomBlocks;
// }
// /**
// * Get the standard number of thread blocks to use when executing kernels.
// */
// int getNumThreadBlocks() const {
// return numThreadBlocks;
// }
// /**
// * Get the number of force buffers.
// */
// int getNumForceBuffers() const {
// return numForceBuffers;
// }
// /**
// * Get the SIMD width of the device being used.
// */
// int getSIMDWidth() const {
// return simdWidth;
// }
// /**
// * Get whether the device being used supports 64 bit atomic operations on global memory.
// */
// bool getSupports64BitGlobalAtomics() {
// return supports64BitGlobalAtomics;
// }
// /**
// * Get whether the device being used supports double precision math.
// */
// bool getSupportsDoublePrecision() {
// return supportsDoublePrecision;
// }
/**
* Get the current simulation time.
*/
double getTime() {
return time;
}
/**
* Set the current simulation time.
*
* @param t the new value to store as the simulation time
*/
void setTime(double t) {
time = t;
}
/**
* Get the number of integration steps that have been taken.
*/
int getStepCount() {
return stepCount;
}
/**
* Set the number of integration steps that have been taken.
*
* @param steps the new value to store in the step counter
*/
void setStepCount(int steps) {
stepCount = steps;
}
/**
* Get the number of times forces or energy has been computed.
*/
int getComputeForceCount() {
return computeForceCount;
}
/**
* Set the number of times forces or energy has been computed.
*
* @param count the new value to store in the force/energy computation counter
*/
void setComputeForceCount(int count) {
computeForceCount = count;
}
/**
* Get the number of atoms.
*
* @return the value of the numAtoms member
*/
int getNumAtoms() const {
return numAtoms;
}
/**
* Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
* most arrays with one element per atom.
*
* @return the value of the paddedNumAtoms member
*/
int getPaddedNumAtoms() const {
return paddedNumAtoms;
}
/**
* Get the number of blocks of TileSize atoms.
*
* @return the value of the numAtomBlocks member
*/
int getNumAtomBlocks() const {
return numAtomBlocks;
}
/**
* Get the standard number of thread blocks to use when executing kernels.
*
* @return the value of the numThreadBlocks member
*/
int getNumThreadBlocks() const {
return numThreadBlocks;
}
/**
* Get whether double precision is being used.
*/
bool getUseDoublePrecision() {
return useDoublePrecision;
}
/**
* Get whether accumulation is being done in double precision.
*/
bool getAccumulateInDouble() {
return accumulateInDouble;
}
/**
* Convert a number to a string in a format suitable for including in a kernel.
* This takes into account whether the context uses single or double precision.
*/
std::string doubleToString(double value);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
std::string intToString(int value);
/**
* Convert a CUDA result code to the corresponding string description.
*/
std::string getErrorString(CUresult result);
// /**
// * Get the size of the periodic box.
// */
// mm_float4 getPeriodicBoxSize() const {
// float4 getPeriodicBoxSize() const {
// return periodicBoxSize;
// }
// /**
// * Set the size of the periodic box.
// */
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// periodicBoxSize = make_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = make_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// }
// /**
// * Get the inverse of the size of the periodic box.
// */
// mm_float4 getInvPeriodicBoxSize() const {
// float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize;
// }
// /**
......@@ -352,66 +359,66 @@ public:
// CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded;
// }
// /**
// * Get the thread used by this context for executing parallel computations.
// */
// WorkThread& getWorkThread() {
// return *thread;
// }
// /**
// * Get whether atoms were reordered during the most recent force/energy computation.
// */
// bool getAtomsWereReordered() const {
// return atomsWereReordered;
// }
// /**
// * Set whether atoms were reordered during the most recent force/energy computation.
// */
// void setAtomsWereReordered(bool wereReordered) {
// atomsWereReordered = wereReordered;
// }
// /**
// * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
// * together in the arrays.
// *
// * @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
// */
// void reorderAtoms(bool enforcePeriodic);
// /**
// * Add a listener that should be called whenever atoms get reordered. The CudaContext
// * assumes ownership of the object, and deletes it when the context itself is deleted.
// */
// void addReorderListener(ReorderListener* listener);
// /**
// * Get the list of ReorderListeners.
// */
// std::vector<ReorderListener*>& getReorderListeners() {
// return reorderListeners;
// }
// /**
// * Mark that the current molecule definitions (and hence the atom order) may be invalid.
// * This should be called whenever force field parameters change. It will cause the definitions
// * and order to be revalidated the next to reorderAtoms() is called.
// */
// void invalidateMolecules();
// /**
// * Get whether the current molecule definitions are valid.
// */
// bool getMoleculesAreInvalid() {
// return moleculesInvalid;
// }
/**
* Get the thread used by this context for executing parallel computations.
*
* The thread member is a pointer that the constructor initializes to NULL and that is
* dereferenced here without a check - presumably it is always created before this
* accessor is used; TODO confirm.
*/
WorkThread& getWorkThread() {
return *thread;
}
/**
* Get whether atoms were reordered during the most recent force/energy computation.
*
* @return the value of the atomsWereReordered flag
*/
bool getAtomsWereReordered() const {
return atomsWereReordered;
}
/**
* Set whether atoms were reordered during the most recent force/energy computation.
*
* @param wereReordered the new value to store in the atomsWereReordered flag
*/
void setAtomsWereReordered(bool wereReordered) {
atomsWereReordered = wereReordered;
}
/**
* Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
* together in the arrays.
*
* @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
*/
void reorderAtoms(bool enforcePeriodic);
/**
* Add a listener that should be called whenever atoms get reordered. The CudaContext
* assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void addReorderListener(ReorderListener* listener);
/**
* Get the list of ReorderListeners.
*
* @return a non-const reference to the internal reorderListeners vector, so changes
*         made through it affect this context's own list
*/
std::vector<ReorderListener*>& getReorderListeners() {
return reorderListeners;
}
/**
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
* This should be called whenever force field parameters change. It will cause the definitions
* and order to be revalidated the next time reorderAtoms() is called.
*/
void invalidateMolecules();
/**
* Get whether the current molecule definitions are valid.
*/
bool getMoleculesAreInvalid() {
return moleculesInvalid;
}
private:
struct Molecule;
struct MoleculeGroup;
class VirtualSiteInfo;
// void findMoleculeGroups();
// static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
// /**
// * Ensure that all molecules marked as "identical" really are identical. This should be
// * called whenever force field parameters change. If necessary, it will rebuild the list
// * of molecules and resort the atoms.
// */
// void validateMolecules();
void findMoleculeGroups();
static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
/**
* Ensure that all molecules marked as "identical" really are identical. This should be
* called whenever force field parameters change. If necessary, it will rebuild the list
* of molecules and resort the atoms.
*/
void validateMolecules();
static bool hasInitializedCuda;
const System& system;
double time;
......@@ -424,8 +431,6 @@ private:
int paddedNumAtoms;
int numAtomBlocks;
int numThreadBlocks;
// int numForceBuffers;
// int simdWidth;
bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
std::string compiler, tempDir, gpuArchitecture;
float4 periodicBoxSize;
......@@ -446,15 +451,15 @@ private:
std::vector<Molecule> molecules;
std::vector<MoleculeGroup> moleculeGroups;
std::vector<int4> posCellOffsets;
void* pinnedBuffer;
CudaArray* posq;
CudaArray* velm;
// CudaArray<mm_float4>* force;
// CudaArray<mm_float4>* forceBuffers;
// CudaArray<cl_long>* longForceBuffer;
// CudaArray<cl_float>* energyBuffer;
// CudaArray<cl_int>* atomIndex;
// std::vector<cl::Memory*> autoclearBuffers;
// std::vector<int> autoclearBufferSizes;
CudaArray* force;
CudaArray* energyBuffer;
CudaArray* atomIndexDevice;
std::vector<int> atomIndex;
std::vector<CUdeviceptr> autoclearBuffers;
std::vector<int> autoclearBufferSizes;
std::vector<ReorderListener*> reorderListeners;
// CudaIntegrationUtilities* integration;
// CudaBondedUtilities* bonded;
......
......@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
device << contexts[i]->getDeviceIndex();
}
propertyValues[CudaPlatform::CudaDeviceIndex()] = device.str();
propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
......@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() {
}
/**
 * Call initialize() on every context held by this PlatformData, in index order.
 *
 * @param system not used by this function
 */
void CudaPlatform::PlatformData::initializeContexts(const System& system) {
    int index = 0;
    while (index < (int) contexts.size()) {
        contexts[index]->initialize();
        ++index;
    }
}
/**
 * Call flush() on the work thread of every context held by this PlatformData,
 * in index order.
 */
void CudaPlatform::PlatformData::syncContexts() {
    int index = 0;
    while (index < (int) contexts.size()) {
        contexts[index]->getWorkThread().flush();
        ++index;
    }
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaSort.h"
#include "CudaKernelSources.h"
#include <map>
using namespace OpenMM;
using namespace std;
/**
 * Create a CudaSort: build the sorting kernels specialized for the given trait, choose
 * thread block sizes that fit the device limits, and allocate the workspace arrays.
 *
 * @param context the context in which to perform calculations
 * @param trait   a heap-allocated SortTrait; this object takes ownership of it (it is
 *                deleted by the destructor, and also if this constructor throws)
 * @param length  the length of the arrays this object will be used to sort
 * @throws OpenMMException if a device attribute cannot be queried, or if a single
 *                thread block cannot hold at least two data elements in shared memory
 */
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
    try {
        // Create kernels.  The trait supplies the text substituted for each placeholder
        // in the kernel source.

        map<string, string> replacements;
        replacements["DATA_TYPE"] = trait->getDataType();
        replacements["KEY_TYPE"] = trait->getKeyType();
        replacements["SORT_KEY"] = trait->getSortKey();
        replacements["MIN_KEY"] = trait->getMinKey();
        replacements["MAX_KEY"] = trait->getMaxKey();
        replacements["MAX_VALUE"] = trait->getMaxValue();
        CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
        computeRangeKernel = context.getKernel(module, "computeRange");
        assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
        computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
        copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
        sortBucketsKernel = context.getKernel(module, "sortBuckets");

        // Work out the thread block sizes for various kernels.  Start from the largest
        // power of two that does not exceed the device's maximum block size.

        int maxBlockSize;
        CUresult result = cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
        if (result != CUDA_SUCCESS)
            throw OpenMMException("Error querying the maximum block size: "+context.getErrorString(result));
        for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
            ;
        positionsKernelSize = rangeKernelSize;
        sortKernelSize = rangeKernelSize/2;
        if (rangeKernelSize > length)
            rangeKernelSize = length;
        int maxSharedMem;
        result = cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
        if (result != CUDA_SUCCESS)
            throw OpenMMException("Error querying the maximum shared memory per block: "+context.getErrorString(result));
        unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);

        // Shrink the sort block by halving rather than clamping: the shared memory bitonic
        // sort in sortBuckets pairs thread i with thread i^j, which is only valid when the
        // block size is a power of two.

        while (sortKernelSize > maxLocalBuffer)
            sortKernelSize /= 2;
        if (sortKernelSize < 2)
            throw OpenMMException("CudaSort: the data type is too large to sort in shared memory");
        unsigned int targetBucketSize = sortKernelSize/2; // at least 1, since sortKernelSize >= 2
        unsigned int numBuckets = length/targetBucketSize;
        if (numBuckets < 1)
            numBuckets = 1;
        if (positionsKernelSize > numBuckets)
            positionsKernelSize = numBuckets;

        // Create workspace arrays.

        dataRange = new CudaArray(2, trait->getKeySize(), "sortDataRange");
        bucketOffset = CudaArray::create<uint1>(numBuckets, "bucketOffset");
        bucketOfElement = CudaArray::create<uint1>(length, "bucketOfElement");
        offsetInBucket = CudaArray::create<uint1>(length, "offsetInBucket");
        buckets = new CudaArray(length, trait->getDataSize(), "buckets");
    }
    catch (...) {
        // The destructor does not run for a partially constructed object, so release
        // everything acquired so far (deleting NULL is a no-op) before rethrowing.

        delete dataRange;
        delete bucketOfElement;
        delete offsetInBucket;
        delete bucketOffset;
        delete buckets;
        delete trait;
        throw;
    }
}
/**
 * Release the trait (owned by this object) and all workspace arrays.
 */
CudaSort::~CudaSort() {
    // Applying delete to a NULL pointer has no effect, so members that were never
    // allocated need no explicit check.
    delete trait;
    delete dataRange;
    delete bucketOfElement;
    delete offsetInBucket;
    delete bucketOffset;
    delete buckets;
}
/**
 * Sort an array in place by launching the five sorting kernels in sequence.
 *
 * @param data the array to sort; its size must equal the length passed to the
 *             constructor and its element size must equal trait->getDataSize()
 * @throws OpenMMException if the size or element size of data does not match
 */
void CudaSort::sort(CudaArray& data) {
    if (!(data.getSize() == bucketOfElement->getSize() && data.getElementSize() == trait->getDataSize()))
        throw OpenMMException("CudaSort called with different data size");
    if (data.getSize() == 0)
        return;

    // Kernel arguments are passed as pointers to the values, so keep named
    // references/locals whose addresses can be taken.

    unsigned int numElements = data.getSize();
    unsigned int numBuckets = bucketOffset->getSize();
    CUdeviceptr& dataPtr = data.getDevicePointer();
    CUdeviceptr& rangePtr = dataRange->getDevicePointer();
    CUdeviceptr& bucketOffsetPtr = bucketOffset->getDevicePointer();
    CUdeviceptr& bucketOfElementPtr = bucketOfElement->getDevicePointer();
    CUdeviceptr& offsetInBucketPtr = offsetInBucket->getDevicePointer();
    CUdeviceptr& bucketsPtr = buckets->getDevicePointer();

    // Step 1: find the minimum and maximum key (single thread block).

    void* rangeArgs[] = {&dataPtr, &numElements, &rangePtr};
    context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());

    // Step 2: zero the bucket counters, then assign every element to a bucket.

    context.clearBuffer(*bucketOffset);
    void* elementsArgs[] = {&dataPtr, &numElements, &numBuckets, &rangePtr,
            &bucketOffsetPtr, &bucketOfElementPtr, &offsetInBucketPtr};
    context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());

    // Step 3: turn the bucket sizes into bucket positions (single thread block).

    void* computeArgs[] = {&numBuckets, &bucketOffsetPtr};
    context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));

    // Step 4: scatter the elements into their buckets.

    void* copyArgs[] = {&dataPtr, &bucketsPtr, &numElements, &bucketOffsetPtr,
            &bucketOfElementPtr, &offsetInBucketPtr};
    context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());

    // Step 5: sort each bucket, writing the result back into data.

    void* sortArgs[] = {&dataPtr, &bucketsPtr, &numBuckets, &bucketOffsetPtr};
    context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
#ifndef __OPENMM_CUDASORT_H__
#define __OPENMM_CUDASORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "openmm/internal/windowsExport.h"
#include "CudaContext.h"
namespace OpenMM {
/**
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The sorting behavior is specified by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* class SortTrait : public CudaSort::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
* const char* getKeyType() const {return "float";}
* const char* getMinKey() const {return "-MAXFLOAT";}
* const char* getMaxKey() const {return "MAXFLOAT";}
* const char* getMaxValue() const {return "MAXFLOAT";}
* const char* getSortKey() const {return "value";}
* };
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in shared memory when possible, in global memory otherwise). This is similar to
* the algorithm described in
*
* Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng. "An Efficient
* Sorting Algorithm with CUDA" Journal of the Chinese Institute of Engineers, 32(7),
* pp. 915-921 (2009)
*
* but with many modifications and simplifications. In particular, this algorithm
* involves much less communication between host and device, which is critical to get
* good performance with the array sizes we typically work with (10,000 to 100,000
* elements).
*/
class OPENMM_EXPORT CudaSort {
public:
class SortTrait;
/**
* Create a CudaSort object for sorting data of a particular type.
*
* @param context the context in which to perform calculations
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the CudaSort is deleted.
* @param length the length of the arrays this object will be used to sort
*/
CudaSort(CudaContext& context, SortTrait* trait, unsigned int length);
~CudaSort();
/**
* Sort an array.
*/
void sort(CudaArray& data);
private:
CudaContext& context;
SortTrait* trait;
CudaArray* dataRange;
CudaArray* bucketOfElement;
CudaArray* offsetInBucket;
CudaArray* bucketOffset;
CudaArray* buckets;
CUfunction computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
};
/**
 * A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
 * The strings it returns are substituted into the kernel source by CudaSort's constructor.
 */
class CudaSort::SortTrait {
public:
    /**
     * CudaSort deletes its trait through a SortTrait pointer, so the destructor must be
     * virtual for the subclass object to be destroyed correctly.
     */
    virtual ~SortTrait() {
    }
    /**
     * Get the size of each data value in bytes.
     */
    virtual int getDataSize() const = 0;
    /**
     * Get the size of each key value in bytes.
     */
    virtual int getKeySize() const = 0;
    /**
     * Get the data type of the values to sort.
     */
    virtual const char* getDataType() const = 0;
    /**
     * Get the data type of the sorting key.
     */
    virtual const char* getKeyType() const = 0;
    /**
     * Get the minimum value a key can take.
     */
    virtual const char* getMinKey() const = 0;
    /**
     * Get the maximum value a key can take.
     */
    virtual const char* getMaxKey() const = 0;
    /**
     * Get a value whose key is guaranteed to equal getMaxKey().
     */
    virtual const char* getMaxValue() const = 0;
    /**
     * Get the CUDA code to select the key from the data value.
     */
    virtual const char* getSortKey() const = 0;
};
} // namespace OpenMM
#endif // __OPENMM_CUDASORT_H__
// Extract the sorting key from a data element.  DATA_TYPE, KEY_TYPE and SORT_KEY are
// placeholders, not real identifiers: the host replaces them with the strings supplied
// by the SortTrait before this source is compiled.  SORT_KEY is expected to be an
// expression written in terms of the parameter name "value".
__device__ KEY_TYPE getValue(DATA_TYPE value) {
return SORT_KEY;
}
extern "C" {
/**
 * Calculate the minimum and maximum key in the array to be sorted, storing them in
 * range[0] and range[1] respectively.
 *
 * This kernel must be executed as a single thread block (elements are distributed over
 * threadIdx.x only), with blockDim.x*sizeof(KEY_TYPE) bytes of dynamic shared memory.
 *
 * @param data    the array whose key range is computed
 * @param length  the number of elements in data
 * @param range   output array of two keys: the minimum, then the maximum
 */
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range) {
    extern __shared__ KEY_TYPE rangeBuffer[];
    KEY_TYPE minimum = MAX_KEY;
    KEY_TYPE maximum = MIN_KEY;

    // Each thread calculates the range of a subset of values.

    for (unsigned int index = threadIdx.x; index < length; index += blockDim.x) {
        KEY_TYPE value = getValue(data[index]);
        minimum = min(minimum, value);
        maximum = max(maximum, value);
    }

    // Reduce the per-thread minimums with a tree reduction; the result ends up in element 0.

    rangeBuffer[threadIdx.x] = minimum;
    __syncthreads();
    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
        if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
            rangeBuffer[threadIdx.x] = min(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
        __syncthreads();
    }

    // Only thread 0 reads the reduced value.  It is also the only thread that overwrites
    // element 0 below, so no other thread can race with this read and no extra barrier
    // is needed before the buffer is reused.

    if (threadIdx.x == 0)
        range[0] = rangeBuffer[0];

    // Reduce the per-thread maximums the same way.

    rangeBuffer[threadIdx.x] = maximum;
    __syncthreads();
    for (unsigned int step = 1; step < blockDim.x; step *= 2) {
        if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
            rangeBuffer[threadIdx.x] = max(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
        __syncthreads();
    }
    if (threadIdx.x == 0)
        range[1] = rangeBuffer[0];
}
/**
 * Assign elements to buckets.  The key range [range[0], range[1]] is divided into
 * numBuckets intervals of equal width, and each element goes into the bucket whose
 * interval contains its key (converted to float).
 *
 * @param data             the elements being sorted
 * @param length           the number of elements in data
 * @param numBuckets       the number of buckets
 * @param range            the minimum (range[0]) and maximum (range[1]) key
 * @param bucketOffset     per-bucket counters, incremented atomically; the caller must
 *                         zero them beforehand, and on return each holds its bucket's size
 * @param bucketOfElement  output: the bucket each element was assigned to
 * @param offsetInBucket   output: the position of each element within its bucket
 */
__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
        unsigned int* bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
    float minValue = (float) (range[0]);
    float maxValue = (float) (range[1]);
    float bucketWidth = (maxValue-minValue)/numBuckets;
    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
        float key = (float) getValue(data[index]);

        // When every key is identical the bucket width is zero.  Put everything in
        // bucket 0 explicitly instead of dividing by zero and converting the result.

        unsigned int bucketIndex = 0;
        if (bucketWidth > 0.0f)
            bucketIndex = min((unsigned int) ((key-minValue)/bucketWidth), numBuckets-1);

        // atomicAdd returns the counter's previous value, which gives this element a
        // unique slot within its bucket.

        offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1u);
        bucketOfElement[index] = bucketIndex;
    }
}
/**
 * Sum the bucket sizes to compute the position of each bucket.  On entry bucketOffset[i]
 * holds the size of bucket i; on exit it holds the sum of the sizes of buckets 0 through i
 * (that is, the index one past the end of bucket i).
 *
 * This kernel must be executed as a single thread block, with blockDim.x*sizeof(unsigned int)
 * bytes of dynamic shared memory.  Buckets are processed in chunks of blockDim.x.
 *
 * @param numBuckets    the number of buckets
 * @param bucketOffset  the bucket sizes on entry, the running totals on exit
 */
__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
    extern __shared__ unsigned int posBuffer[];
    unsigned int globalOffset = 0;
    for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
        // Load the bucket sizes into shared memory.

        unsigned int globalIndex = startBucket+threadIdx.x;
        posBuffer[threadIdx.x] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
        __syncthreads();

        // Perform a parallel prefix sum.

        for (unsigned int step = 1; step < blockDim.x; step *= 2) {
            unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
            __syncthreads();
            posBuffer[threadIdx.x] += add;
            __syncthreads();
        }

        // Write the results back to global memory.

        if (globalIndex < numBuckets)
            bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;
        globalOffset += posBuffer[blockDim.x-1];

        // Every thread has to finish reading the chunk total above before the last thread
        // overwrites posBuffer[blockDim.x-1] with the next chunk's data.  The loop
        // condition is the same for all threads, so this barrier is reached uniformly.

        __syncthreads();
    }
}
/**
 * Copy the input data into the buckets for sorting.  Each element is written to the
 * start position of its bucket plus its offset within that bucket.
 *
 * @param data             the elements being sorted
 * @param buckets          output array receiving the elements grouped by bucket
 * @param length           the number of elements in data
 * @param bucketOffset     for each bucket, the index one past its last element
 * @param bucketOfElement  the bucket each element belongs to
 * @param offsetInBucket   the position of each element within its bucket
 */
__global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* __restrict__ buckets, unsigned int length, const unsigned int* __restrict__ bucketOffset, const unsigned int* __restrict__ bucketOfElement, const unsigned int* __restrict__ offsetInBucket) {
    unsigned int element = blockDim.x*blockIdx.x+threadIdx.x;
    while (element < length) {
        DATA_TYPE value = data[element];
        unsigned int bucket = bucketOfElement[element];

        // A bucket starts where the previous one ends; the first bucket starts at 0.

        unsigned int bucketStart = 0;
        if (bucket != 0)
            bucketStart = bucketOffset[bucket-1];
        buckets[bucketStart+offsetInBucket[element]] = value;
        element += blockDim.x*gridDim.x;
    }
}
/**
* Sort the data in each bucket, writing the sorted elements to the output array.
*
* Each thread block processes whole buckets (bucket index = blockIdx.x, advancing by
* gridDim.x).  A bucket that fits in one block (length <= blockDim.x) is sorted in
* shared memory; a larger bucket is copied to the output array and sorted there.
* The kernel uses dynamic shared memory indexed by threadIdx.x, so it needs room for
* blockDim.x elements of DATA_TYPE.
*
* NOTE(review): the shared memory path pairs thread i with thread i^j for j < blockDim.x,
* which looks like it requires blockDim.x to be a power of two - confirm that the host
* always launches it that way.
*
* data is the output array (same layout as buckets); buckets holds the elements grouped
* by bucket; bucketOffset[i] is the index one past the end of bucket i.
*/
__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) {
extern __shared__ DATA_TYPE dataBuffer[];
for (unsigned int index = blockIdx.x; index < numBuckets; index += gridDim.x) {
// Bucket i occupies [bucketOffset[i-1], bucketOffset[i]), with bucket 0 starting at 0.
unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
unsigned int endIndex = bucketOffset[index];
unsigned int length = endIndex-startIndex;
// length depends only on the bucket index, so all threads of the block take the same branch.
if (length <= blockDim.x) {
// Load the data into shared memory, padding unused slots with MAX_VALUE.
if (threadIdx.x < length)
dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
else
dataBuffer[threadIdx.x] = MAX_VALUE;
__syncthreads();
// Perform a bitonic sort in shared memory.
for (unsigned int k = 2; k <= blockDim.x; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
// Each pair (i, i^j) is handled once, by the thread with the lower index.
int ixj = threadIdx.x^j;
if (ixj > threadIdx.x) {
DATA_TYPE value1 = dataBuffer[threadIdx.x];
DATA_TYPE value2 = dataBuffer[ixj];
// The sort direction of this pair is selected by bit k of the thread index.
bool ascending = (threadIdx.x&k) == 0;
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
// Swap the two elements if they are out of order for this direction.
if (lowKey > highKey) {
dataBuffer[threadIdx.x] = value2;
dataBuffer[ixj] = value1;
}
}
__syncthreads();
}
}
// Write the data to the sorted array (only the real elements, not the padding).
if (threadIdx.x < length)
data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
}
else {
// Copy the bucket data over to the output array.
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
data[startIndex+i] = buckets[startIndex+i];
__threadfence_block();
__syncthreads();
// Perform a bitonic sort in global memory.  Each thread handles several positions
// (i = threadIdx.x, advancing by blockDim.x), and pairs whose partner index falls
// outside the bucket are skipped.
for (unsigned int k = 2; k < 2*length; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
int ixj = i^j;
if (ixj > i && ixj < length) {
DATA_TYPE value1 = data[startIndex+i];
DATA_TYPE value2 = data[startIndex+ixj];
bool ascending = ((i&k) == 0);
// The direction is additionally flipped once for every higher bit of i (from k*2 up
// to the range covering length) that is zero.
for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending);
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
data[startIndex+i] = value2;
data[startIndex+ixj] = value1;
}
}
}
// Make this pass's writes visible to the whole block before the next pass reads them.
__threadfence_block();
__syncthreads();
}
}
}
}
}
}
\ No newline at end of file
extern "C" {
/**
* This is called by the various functions below to clear a buffer.
*/
......@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest
buffer[index] = sum;
}
}
}
\ No newline at end of file
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of sorting.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "../src/CudaArray.h"
#include "../src/CudaContext.h"
#include "../src/CudaSort.h"
#include "sfmt/SFMT.h"
#include "openmm/System.h"
#include <iostream>
#include <cmath>
#include <set>
using namespace OpenMM;
using namespace std;
/**
 * Sort trait used by this test. It describes both the data elements and the
 * sort keys as "float" and sorts each element by its own value.
 *
 * NOTE(review): the returned strings are presumably substituted into kernel
 * source by CudaSort, and the sizes are presumably in bytes -- confirm against
 * CudaSort::SortTrait.
 */
class SortTrait : public CudaSort::SortTrait {
    // All members keep the class-default (private) access of the original.

    /** Size of one data element. */
    int getDataSize() const {
        return 4;
    }

    /** Size of one sort key. */
    int getKeySize() const {
        return 4;
    }

    /** Type name of a data element. */
    const char* getDataType() const {
        return "float";
    }

    /** Type name of a sort key. */
    const char* getKeyType() const {
        return "float";
    }

    /** Expression for the smallest key. */
    const char* getMinKey() const {
        return "-MAXFLOAT";
    }

    /** Expression for the largest key. */
    const char* getMaxKey() const {
        return "MAXFLOAT";
    }

    /** Expression for the largest data value. */
    const char* getMaxValue() const {
        return "MAXFLOAT";
    }

    /** Expression giving the key of an element named "value": the element itself. */
    const char* getSortKey() const {
        return "value";
    }
};
/**
 * Upload the given values into a CudaArray, sort them with CudaSort, download
 * the result, and assert that (a) the result is in non-decreasing order and
 * (b) it holds exactly the same multiset of values as the input.
 *
 * @param array  the values to sort (taken by value; the input is not modified)
 */
void verifySorting(vector<float> array) {
    // Build a minimal one-particle System so that a CudaContext can be obtained.
    System system;
    system.addParticle(0.0);
    CudaPlatform platform;
    CudaPlatform::PlatformData platformData(
            system, "", "true", "single",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()),
            platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();

    // Copy the input into a device array and sort it.
    CudaArray data(array.size(), 4, "sortData");
    data.upload(array);
    CudaSort sort(context, new SortTrait(), array.size());
    sort.sort(data);

    // Fetch the sorted values back.
    vector<float> sorted;
    data.download(sorted);

    // Each element must be no smaller than its predecessor.
    for (size_t next = 1; next < sorted.size(); ++next)
        ASSERT(sorted[next-1] <= sorted[next]);

    // The output must be a permutation of the input: compare as multisets.
    multiset<float> expectedValues;
    expectedValues.insert(array.begin(), array.end());
    multiset<float> actualValues;
    actualValues.insert(sorted.begin(), sorted.end());
    ASSERT(expectedValues == actualValues);
}
void testUniformValues()
{
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) genrand_real2(sfmt);
verifySorting(array);
}
void testLogValues()
{
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
}
/**
 * Run both sorting tests. Prints "Done" and returns 0 when they pass; prints
 * the exception message and returns 1 if either throws a std::exception.
 */
int main() {
    int status = 0;
    try {
        testUniformValues();
        testLogValues();
        cout << "Done" << endl;
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;
        status = 1;
    }
    return status;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment