Commit 3e16cab9 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing to implement new CUDA platform

parent abb8cb4b
...@@ -83,7 +83,7 @@ public: ...@@ -83,7 +83,7 @@ public:
/** /**
* Get a pointer to the device memory. * Get a pointer to the device memory.
*/ */
CUdeviceptr getDevicePointer() { CUdeviceptr& getDevicePointer() {
return pointer; return pointer;
} }
/** /**
......
This diff is collapsed.
...@@ -72,11 +72,11 @@ public: ...@@ -72,11 +72,11 @@ public:
CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision, CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const std::string& precision,
const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData); const std::string& compiler, const std::string& tempDir, CudaPlatform::PlatformData& platformData);
~CudaContext(); ~CudaContext();
// /** /**
// * This is called to initialize internal data structures after all Forces in the system * This is called to initialize internal data structures after all Forces in the system
// * have been initialized. * have been initialized.
// */ */
// void initialize(); void initialize();
/** /**
* Add a CudaForce to this context. * Add a CudaForce to this context.
*/ */
...@@ -123,12 +123,12 @@ public: ...@@ -123,12 +123,12 @@ public:
CudaArray& getVelm() { CudaArray& getVelm() {
return *velm; return *velm;
} }
// /** /**
// * Get the array which contains the force on each atom. * Get the array which contains the force on each atom (respresented as a long3 in 64 bit fixed point).
// */ */
// CudaArray<mm_float4>& getForce() { CudaArray& getForce() {
// return *force; return *force;
// } }
// /** // /**
// * Get the array which contains the buffers in which forces are computed. // * Get the array which contains the buffers in which forces are computed.
// */ // */
...@@ -184,36 +184,41 @@ public: ...@@ -184,36 +184,41 @@ public:
* omitted, a default set of options will be used * omitted, a default set of options will be used
*/ */
CUmodule createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL); CUmodule createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL);
// /** /**
// * Execute a kernel. * Get a kernel from a CUDA module.
// * *
// * @param kernel the kernel to execute * @param module the module to get the kernel from
// * @param workUnits the maximum number of work units that should be used * @param name the name of the kernel to get
// * @param blockSize the size of each thread block to use */
// */ CUfunction getKernel(CUmodule& module, const std::string& name);
// void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1); /**
// /** * Execute a kernel.
// * Set all elements of an array to 0. *
// */ * @param kernel the kernel to execute
// void clearBuffer(CudaArray<float>& array); * @param arguments an array of pointers to the kernel arguments
// /** * @param threads the maximum number of threads that should be used
// * Set all elements of an array to 0. * @param blockSize the size of each thread block to use
// */ * @param sharedSize the amount of dynamic shared memory to allocated for the kernel, in bytes
// void clearBuffer(CudaArray<mm_float4>& array); */
// /** void executeKernel(CUfunction kernel, void** arguments, int workUnits, int blockSize = -1, unsigned int sharedSize = 0);
// * Set all elements of an array to 0. /**
// * * Set all elements of an array to 0.
// * @param memory the Memory to clear */
// * @param size the number of float elements in the buffer void clearBuffer(CudaArray& array);
// */ /**
// void clearBuffer(cl::Memory& memory, int size); * Set all elements of an array to 0.
// /** *
// * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation. * @param memory the memory to clear
// * * @param size the number of 4-byte elements in the buffer
// * @param memory the Memory to clear */
// * @param size the number of float elements in the buffer void clearBuffer(CUdeviceptr memory, int size);
// */ /**
// void addAutoclearBuffer(cl::Memory& memory, int size); * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the memory to clear
* @param size the number of float/double elements in the buffer
*/
void addAutoclearBuffer(CUdeviceptr memory, int size);
// /** // /**
// * Clear all buffers that have been registered with addAutoclearBuffer(). // * Clear all buffers that have been registered with addAutoclearBuffer().
// */ // */
...@@ -230,108 +235,110 @@ public: ...@@ -230,108 +235,110 @@ public:
// * Sum the buffesr containing forces. // * Sum the buffesr containing forces.
// */ // */
// void reduceForces(); // void reduceForces();
// /** /**
// * Get the current simulation time. * Get the current simulation time.
// */ */
// double getTime() { double getTime() {
// return time; return time;
// } }
// /** /**
// * Set the current simulation time. * Set the current simulation time.
// */ */
// void setTime(double t) { void setTime(double t) {
// time = t; time = t;
// } }
// /** /**
// * Get the number of integration steps that have been taken. * Get the number of integration steps that have been taken.
// */ */
// int getStepCount() { int getStepCount() {
// return stepCount; return stepCount;
// } }
// /** /**
// * Set the number of integration steps that have been taken. * Set the number of integration steps that have been taken.
// */ */
// void setStepCount(int steps) { void setStepCount(int steps) {
// stepCount = steps; stepCount = steps;
// } }
// /** /**
// * Get the number of times forces or energy has been computed. * Get the number of times forces or energy has been computed.
// */ */
// int getComputeForceCount() { int getComputeForceCount() {
// return computeForceCount; return computeForceCount;
// } }
// /** /**
// * Set the number of times forces or energy has been computed. * Set the number of times forces or energy has been computed.
// */ */
// void setComputeForceCount(int count) { void setComputeForceCount(int count) {
// computeForceCount = count; computeForceCount = count;
// } }
// /** /**
// * Get the number of atoms. * Get the number of atoms.
// */ */
// int getNumAtoms() const { int getNumAtoms() const {
// return numAtoms; return numAtoms;
// } }
// /** /**
// * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
// * most arrays with one element per atom. * most arrays with one element per atom.
// */ */
// int getPaddedNumAtoms() const { int getPaddedNumAtoms() const {
// return paddedNumAtoms; return paddedNumAtoms;
// } }
// /** /**
// * Get the number of blocks of TileSize atoms. * Get the number of blocks of TileSize atoms.
// */ */
// int getNumAtomBlocks() const { int getNumAtomBlocks() const {
// return numAtomBlocks; return numAtomBlocks;
// } }
// /** /**
// * Get the standard number of thread blocks to use when executing kernels. * Get the standard number of thread blocks to use when executing kernels.
// */ */
// int getNumThreadBlocks() const { int getNumThreadBlocks() const {
// return numThreadBlocks; return numThreadBlocks;
// } }
// /** /**
// * Get the number of force buffers. * Get whether double precision is being used.
// */ */
// int getNumForceBuffers() const { bool getUseDoublePrecision() {
// return numForceBuffers; return useDoublePrecision;
// } }
// /** /**
// * Get the SIMD width of the device being used. * Get whether accumulation is being done in double precision.
// */ */
// int getSIMDWidth() const { bool getAccumulateInDouble() {
// return simdWidth; return accumulateInDouble;
// } }
// /** /**
// * Get whether the device being used supports 64 bit atomic operations on global memory. * Convert a number to a string in a format suitable for including in a kernel.
// */ * This takes into account whether the context uses single or double precision.
// bool getSupports64BitGlobalAtomics() { */
// return supports64BitGlobalAtomics; std::string doubleToString(double value);
// } /**
// /** * Convert a number to a string in a format suitable for including in a kernel.
// * Get whether the device being used supports double precision math. */
// */ std::string intToString(int value);
// bool getSupportsDoublePrecision() { /**
// return supportsDoublePrecision; * Convert a CUDA result code to the corresponding string description.
// } */
std::string getErrorString(CUresult result);
// /** // /**
// * Get the size of the periodic box. // * Get the size of the periodic box.
// */ // */
// mm_float4 getPeriodicBoxSize() const { // float4 getPeriodicBoxSize() const {
// return periodicBoxSize; // return periodicBoxSize;
// } // }
// /** // /**
// * Set the size of the periodic box. // * Set the size of the periodic box.
// */ // */
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) { // void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// periodicBoxSize = mm_float4((float) xsize, (float) ysize, (float) zsize, 0); // periodicBoxSize = make_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = mm_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0); // invPeriodicBoxSize = make_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// } // }
// /** // /**
// * Get the inverse of the size of the periodic box. // * Get the inverse of the size of the periodic box.
// */ // */
// mm_float4 getInvPeriodicBoxSize() const { // float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize; // return invPeriodicBoxSize;
// } // }
// /** // /**
...@@ -352,66 +359,66 @@ public: ...@@ -352,66 +359,66 @@ public:
// CudaNonbondedUtilities& getNonbondedUtilities() { // CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded; // return *nonbonded;
// } // }
// /** /**
// * Get the thread used by this context for executing parallel computations. * Get the thread used by this context for executing parallel computations.
// */ */
// WorkThread& getWorkThread() { WorkThread& getWorkThread() {
// return *thread; return *thread;
// } }
// /** /**
// * Get whether atoms were reordered during the most recent force/energy computation. * Get whether atoms were reordered during the most recent force/energy computation.
// */ */
// bool getAtomsWereReordered() const { bool getAtomsWereReordered() const {
// return atomsWereReordered; return atomsWereReordered;
// } }
// /** /**
// * Set whether atoms were reordered during the most recent force/energy computation. * Set whether atoms were reordered during the most recent force/energy computation.
// */ */
// void setAtomsWereReordered(bool wereReordered) { void setAtomsWereReordered(bool wereReordered) {
// atomsWereReordered = wereReordered; atomsWereReordered = wereReordered;
// } }
// /** /**
// * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close * Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
// * together in the arrays. * together in the arrays.
// * *
// * @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions * @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
// */ */
// void reorderAtoms(bool enforcePeriodic); void reorderAtoms(bool enforcePeriodic);
// /** /**
// * Add a listener that should be called whenever atoms get reordered. The CudaContext * Add a listener that should be called whenever atoms get reordered. The CudaContext
// * assumes ownership of the object, and deletes it when the context itself is deleted. * assumes ownership of the object, and deletes it when the context itself is deleted.
// */ */
// void addReorderListener(ReorderListener* listener); void addReorderListener(ReorderListener* listener);
// /** /**
// * Get the list of ReorderListeners. * Get the list of ReorderListeners.
// */ */
// std::vector<ReorderListener*>& getReorderListeners() { std::vector<ReorderListener*>& getReorderListeners() {
// return reorderListeners; return reorderListeners;
// } }
// /** /**
// * Mark that the current molecule definitions (and hence the atom order) may be invalid. * Mark that the current molecule definitions (and hence the atom order) may be invalid.
// * This should be called whenever force field parameters change. It will cause the definitions * This should be called whenever force field parameters change. It will cause the definitions
// * and order to be revalidated the next to reorderAtoms() is called. * and order to be revalidated the next to reorderAtoms() is called.
// */ */
// void invalidateMolecules(); void invalidateMolecules();
// /** /**
// * Get whether the current molecule definitions are valid. * Get whether the current molecule definitions are valid.
// */ */
// bool getMoleculesAreInvalid() { bool getMoleculesAreInvalid() {
// return moleculesInvalid; return moleculesInvalid;
// } }
private: private:
struct Molecule; struct Molecule;
struct MoleculeGroup; struct MoleculeGroup;
class VirtualSiteInfo; class VirtualSiteInfo;
// void findMoleculeGroups(); void findMoleculeGroups();
// static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds); static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
// /** /**
// * Ensure that all molecules marked as "identical" really are identical. This should be * Ensure that all molecules marked as "identical" really are identical. This should be
// * called whenever force field parameters change. If necessary, it will rebuild the list * called whenever force field parameters change. If necessary, it will rebuild the list
// * of molecules and resort the atoms. * of molecules and resort the atoms.
// */ */
// void validateMolecules(); void validateMolecules();
static bool hasInitializedCuda; static bool hasInitializedCuda;
const System& system; const System& system;
double time; double time;
...@@ -424,8 +431,6 @@ private: ...@@ -424,8 +431,6 @@ private:
int paddedNumAtoms; int paddedNumAtoms;
int numAtomBlocks; int numAtomBlocks;
int numThreadBlocks; int numThreadBlocks;
// int numForceBuffers;
// int simdWidth;
bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid; bool useBlockingSync, useDoublePrecision, accumulateInDouble, contextIsValid, atomsWereReordered, moleculesInvalid;
std::string compiler, tempDir, gpuArchitecture; std::string compiler, tempDir, gpuArchitecture;
float4 periodicBoxSize; float4 periodicBoxSize;
...@@ -446,15 +451,15 @@ private: ...@@ -446,15 +451,15 @@ private:
std::vector<Molecule> molecules; std::vector<Molecule> molecules;
std::vector<MoleculeGroup> moleculeGroups; std::vector<MoleculeGroup> moleculeGroups;
std::vector<int4> posCellOffsets; std::vector<int4> posCellOffsets;
void* pinnedBuffer;
CudaArray* posq; CudaArray* posq;
CudaArray* velm; CudaArray* velm;
// CudaArray<mm_float4>* force; CudaArray* force;
// CudaArray<mm_float4>* forceBuffers; CudaArray* energyBuffer;
// CudaArray<cl_long>* longForceBuffer; CudaArray* atomIndexDevice;
// CudaArray<cl_float>* energyBuffer; std::vector<int> atomIndex;
// CudaArray<cl_int>* atomIndex; std::vector<CUdeviceptr> autoclearBuffers;
// std::vector<cl::Memory*> autoclearBuffers; std::vector<int> autoclearBufferSizes;
// std::vector<int> autoclearBufferSizes;
std::vector<ReorderListener*> reorderListeners; std::vector<ReorderListener*> reorderListeners;
// CudaIntegrationUtilities* integration; // CudaIntegrationUtilities* integration;
// CudaBondedUtilities* bonded; // CudaBondedUtilities* bonded;
......
...@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev ...@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
device << contexts[i]->getDeviceIndex(); device << contexts[i]->getDeviceIndex();
} }
propertyValues[CudaPlatform::CudaDeviceIndex()] = device.str(); propertyValues[CudaPlatform::CudaDeviceIndex()] = device.str();
propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty; propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty; propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty; propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
...@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() { ...@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() {
} }
void CudaPlatform::PlatformData::initializeContexts(const System& system) { void CudaPlatform::PlatformData::initializeContexts(const System& system) {
// for (int i = 0; i < (int) contexts.size(); i++) for (int i = 0; i < (int) contexts.size(); i++)
// contexts[i]->initialize(); contexts[i]->initialize();
} }
void CudaPlatform::PlatformData::syncContexts() { void CudaPlatform::PlatformData::syncContexts() {
// for (int i = 0; i < (int) contexts.size(); i++) for (int i = 0; i < (int) contexts.size(); i++)
// contexts[i]->getWorkThread().flush(); contexts[i]->getWorkThread().flush();
} }
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaSort.h"
#include "CudaKernelSources.h"
#include <map>
using namespace OpenMM;
using namespace std;
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
// Create kernels.
map<string, string> replacements;
replacements["DATA_TYPE"] = trait->getDataType();
replacements["KEY_TYPE"] = trait->getKeyType();
replacements["SORT_KEY"] = trait->getSortKey();
replacements["MIN_KEY"] = trait->getMinKey();
replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue();
CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
computeRangeKernel = context.getKernel(module, "computeRange");
assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
sortBucketsKernel = context.getKernel(module, "sortBuckets");
// Work out the work group sizes for various kernels.
int maxBlockSize;
cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
;
positionsKernelSize = rangeKernelSize;
sortKernelSize = rangeKernelSize/2;
if (rangeKernelSize > length)
rangeKernelSize = length;
int maxSharedMem;
cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
if (sortKernelSize > maxLocalBuffer)
sortKernelSize = maxLocalBuffer;
unsigned int targetBucketSize = sortKernelSize/2;
unsigned int numBuckets = length/targetBucketSize;
if (numBuckets < 1)
numBuckets = 1;
if (positionsKernelSize > numBuckets)
positionsKernelSize = numBuckets;
// Create workspace arrays.
dataRange = new CudaArray(2, trait->getKeySize(), "sortDataRange");
bucketOffset = CudaArray::create<uint1>(numBuckets, "bucketOffset");
bucketOfElement = CudaArray::create<uint1>(length, "bucketOfElement");
offsetInBucket = CudaArray::create<uint1>(length, "offsetInBucket");
buckets = new CudaArray(length, trait->getDataSize(), "buckets");
}
CudaSort::~CudaSort() {
delete trait;
if (dataRange != NULL)
delete dataRange;
if (bucketOfElement != NULL)
delete bucketOfElement;
if (offsetInBucket != NULL)
delete offsetInBucket;
if (bucketOffset != NULL)
delete bucketOffset;
if (buckets != NULL)
delete buckets;
}
void CudaSort::sort(CudaArray& data) {
if (data.getSize() != bucketOfElement->getSize() || data.getElementSize() != trait->getDataSize())
throw OpenMMException("CudaSort called with different data size");
if (data.getSize() == 0)
return;
// Compute the range of data values.
unsigned int dataSize = data.getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataSize, &dataRange->getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets.
unsigned int numBuckets = bucketOffset->getSize();
context.clearBuffer(*bucketOffset);
void* elementsArgs[] = {&data.getDevicePointer(), &dataSize, &numBuckets, &dataRange->getDevicePointer(),
&bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());
// Compute the position of each bucket.
void* computeArgs[] = {&numBuckets, &bucketOffset->getDevicePointer()};
context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
// Copy the data into the buckets.
void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataSize, &bucketOffset->getDevicePointer(),
&bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
// Sort each bucket.
void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
#ifndef __OPENMM_CUDASORT_H__
#define __OPENMM_CUDASORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "openmm/internal/windowsExport.h"
#include "CudaContext.h"
namespace OpenMM {
/**
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The sorting behavior is specified by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* class SortTrait : public CudaSort::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
* const char* getKeyType() const {return "float";}
* const char* getMinKey() const {return "-MAXFLOAT";}
* const char* getMaxKey() const {return "MAXFLOAT";}
* const char* getMaxValue() const {return "MAXFLOAT";}
* const char* getSortKey() const {return "value";}
* };
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in local memory when possible, in global memory otherwise). This is similar to
* the algorithm described in
*
* Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng. "An Efficient
* Sorting Algorithm with CUDA" Journal of the Chinese Institute of Engineers, 32(7),
* pp. 915-921 (2009)
*
* but with many modifications and simplifications. In particular, this algorithm
* involves much less communication between host and device, which is critical to get
* good performance with the array sizes we typically work with (10,000 to 100,000
* elements).
*/
class OPENMM_EXPORT CudaSort {
public:
class SortTrait;
/**
* Create a CudaSort object for sorting data of a particular type.
*
* @param context the context in which to perform calculations
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the CudaSort is deleted.
* @param length the length of the arrays this object will be used to sort
*/
CudaSort(CudaContext& context, SortTrait* trait, unsigned int length);
~CudaSort();
/**
* Sort an array.
*/
void sort(CudaArray& data);
private:
CudaContext& context;
SortTrait* trait;
CudaArray* dataRange;
CudaArray* bucketOfElement;
CudaArray* offsetInBucket;
CudaArray* bucketOffset;
CudaArray* buckets;
CUfunction computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
};
/**
* A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
*/
class CudaSort::SortTrait {
public:
/**
* Get the size of each data value in bytes.
*/
virtual int getDataSize() const = 0;
/**
* Get the size of each key value in bytes.
*/
virtual int getKeySize() const = 0;
/**
* Get the data type of the values to sort.
*/
virtual const char* getDataType() const = 0;
/**
* Get the data type of the sorting key.
*/
virtual const char* getKeyType() const = 0;
/**
* Get the minimum value a key can take.
*/
virtual const char* getMinKey() const = 0;
/**
* Get the maximum value a key can take.
*/
virtual const char* getMaxKey() const = 0;
/**
* Get a value whose key is guaranteed to equal getMaxKey().
*/
virtual const char* getMaxValue() const = 0;
/**
* Get the CUDA code to select the key from the data value.
*/
virtual const char* getSortKey() const = 0;
};
} // namespace OpenMM
#endif // __OPENMM_CUDASORT_H__
__device__ KEY_TYPE getValue(DATA_TYPE value) {
return SORT_KEY;
}
extern "C" {
/**
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
*/
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range) {
extern __shared__ KEY_TYPE rangeBuffer[];
KEY_TYPE minimum = MAX_KEY;
KEY_TYPE maximum = MIN_KEY;
// Each thread calculates the range of a subset of values.
for (unsigned int index = threadIdx.x; index < length; index += blockDim.x) {
KEY_TYPE value = getValue(data[index]);
minimum = min(minimum, value);
maximum = max(maximum, value);
}
// Now reduce them.
rangeBuffer[threadIdx.x] = minimum;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
rangeBuffer[threadIdx.x] = min(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
__syncthreads();
}
minimum = rangeBuffer[0];
rangeBuffer[threadIdx.x] = maximum;
__syncthreads();
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
if (threadIdx.x+step < blockDim.x && threadIdx.x%(2*step) == 0)
rangeBuffer[threadIdx.x] = max(rangeBuffer[threadIdx.x], rangeBuffer[threadIdx.x+step]);
__syncthreads();
}
maximum = rangeBuffer[0];
if (threadIdx.x == 0) {
range[0] = minimum;
range[1] = maximum;
}
}
/**
* Assign elements to buckets.
*/
__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
unsigned int* bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
float minValue = (float) (range[0]);
float maxValue = (float) (range[1]);
float bucketWidth = (maxValue-minValue)/numBuckets;
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
float key = (float) getValue(data[index]);
unsigned int bucketIndex = min((unsigned int) ((key-minValue)/bucketWidth), numBuckets-1);
offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
bucketOfElement[index] = bucketIndex;
}
}
/**
* Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group.
*/
__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
extern __shared__ unsigned int posBuffer[];
unsigned int globalOffset = 0;
for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
// Load the bucket sizes into local memory.
unsigned int globalIndex = startBucket+threadIdx.x;
posBuffer[threadIdx.x] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
__syncthreads();
// Perform a parallel prefix sum.
for (unsigned int step = 1; step < blockDim.x; step *= 2) {
unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
__syncthreads();
posBuffer[threadIdx.x] += add;
__syncthreads();
}
// Write the results back to global memory.
if (globalIndex < numBuckets)
bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;
globalOffset += posBuffer[blockDim.x-1];
}
}
/**
* Copy the input data into the buckets for sorting.
*/
__global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* __restrict__ buckets, unsigned int length, const unsigned int* __restrict__ bucketOffset, const unsigned int* __restrict__ bucketOfElement, const unsigned int* __restrict__ offsetInBucket) {
for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += blockDim.x*gridDim.x) {
DATA_TYPE element = data[index];
unsigned int bucketIndex = bucketOfElement[index];
unsigned int offset = (bucketIndex == 0 ? 0 : bucketOffset[bucketIndex-1]);
buckets[offset+offsetInBucket[index]] = element;
}
}
/**
* Sort the data in each bucket.
*/
__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) {
extern __shared__ DATA_TYPE dataBuffer[];
for (unsigned int index = blockIdx.x; index < numBuckets; index += gridDim.x) {
unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
unsigned int endIndex = bucketOffset[index];
unsigned int length = endIndex-startIndex;
if (length <= blockDim.x) {
// Load the data into local memory.
if (threadIdx.x < length)
dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
else
dataBuffer[threadIdx.x] = MAX_VALUE;
__syncthreads();
// Perform a bitonic sort in local memory.
for (unsigned int k = 2; k <= blockDim.x; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
int ixj = threadIdx.x^j;
if (ixj > threadIdx.x) {
DATA_TYPE value1 = dataBuffer[threadIdx.x];
DATA_TYPE value2 = dataBuffer[ixj];
bool ascending = (threadIdx.x&k) == 0;
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
dataBuffer[threadIdx.x] = value2;
dataBuffer[ixj] = value1;
}
}
__syncthreads();
}
}
// Write the data to the sorted array.
if (threadIdx.x < length)
data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
}
else {
// Copy the bucket data over to the output array.
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
data[startIndex+i] = buckets[startIndex+i];
__threadfence_block();
__syncthreads();
// Perform a bitonic sort in global memory.
for (unsigned int k = 2; k < 2*length; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
int ixj = i^j;
if (ixj > i && ixj < length) {
DATA_TYPE value1 = data[startIndex+i];
DATA_TYPE value2 = data[startIndex+ixj];
bool ascending = ((i&k) == 0);
for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending);
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
data[startIndex+i] = value2;
data[startIndex+ixj] = value1;
}
}
}
__threadfence_block();
__syncthreads();
}
}
}
}
}
}
\ No newline at end of file
extern "C" {
/** /**
* This is called by the various functions below to clear a buffer. * This is called by the various functions below to clear a buffer.
*/ */
...@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest ...@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest
buffer[index] = sum; buffer[index] = sum;
} }
} }
}
\ No newline at end of file
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of sorting.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "../src/CudaArray.h"
#include "../src/CudaContext.h"
#include "../src/CudaSort.h"
#include "sfmt/SFMT.h"
#include "openmm/System.h"
#include <iostream>
#include <cmath>
#include <set>
using namespace OpenMM;
using namespace std;
class SortTrait : public CudaSort::SortTrait {
int getDataSize() const {return 4;}
int getKeySize() const {return 4;}
const char* getDataType() const {return "float";}
const char* getKeyType() const {return "float";}
const char* getMinKey() const {return "-MAXFLOAT";}
const char* getMaxKey() const {return "MAXFLOAT";}
const char* getMaxValue() const {return "MAXFLOAT";}
const char* getSortKey() const {return "value";}
};
void verifySorting(vector<float> array) {
// Sort the array.
System system;
system.addParticle(0.0);
CudaPlatform platform;
CudaPlatform::PlatformData platformData(system, "", "true", "single",
platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()), platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
CudaContext& context = *platformData.contexts[0];
context.initialize();
CudaArray data(array.size(), 4, "sortData");
data.upload(array);
CudaSort sort(context, new SortTrait(), array.size());
sort.sort(data);
vector<float> sorted;
data.download(sorted);
// Verify that it is in sorted order.
for (int i = 1; i < (int) sorted.size(); i++)
ASSERT(sorted[i-1] <= sorted[i]);
// Make sure the sorted array contains the same values as the original one.
multiset<float> elements1(array.begin(), array.end());
multiset<float> elements2(sorted.begin(), sorted.end());
ASSERT(elements1 == elements2);
}
void testUniformValues()
{
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) genrand_real2(sfmt);
verifySorting(array);
}
void testLogValues()
{
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(10000);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
}
int main() {
try {
testUniformValues();
testLogValues();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
return 1;
}
cout << "Done" << endl;
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment