Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
3e16cab9
Commit
3e16cab9
authored
Jun 05, 2012
by
Peter Eastman
Browse files
Continuing to implement new CUDA platform
parent
abb8cb4b
Changes
9
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
1158 additions
and
509 deletions
+1158
-509
platforms/cuda2/src/CudaArray.h
platforms/cuda2/src/CudaArray.h
+1
-1
platforms/cuda2/src/CudaContext.cpp
platforms/cuda2/src/CudaContext.cpp
+367
-309
platforms/cuda2/src/CudaContext.h
platforms/cuda2/src/CudaContext.h
+200
-195
platforms/cuda2/src/CudaPlatform.cpp
platforms/cuda2/src/CudaPlatform.cpp
+5
-4
platforms/cuda2/src/CudaSort.cpp
platforms/cuda2/src/CudaSort.cpp
+132
-0
platforms/cuda2/src/CudaSort.h
platforms/cuda2/src/CudaSort.h
+141
-0
platforms/cuda2/src/kernels/sort.cu
platforms/cuda2/src/kernels/sort.cu
+186
-0
platforms/cuda2/src/kernels/utilities.cu
platforms/cuda2/src/kernels/utilities.cu
+4
-0
platforms/cuda2/tests/TestCudaSort.cpp
platforms/cuda2/tests/TestCudaSort.cpp
+122
-0
No files found.
platforms/cuda2/src/CudaArray.h
View file @
3e16cab9
...
...
@@ -83,7 +83,7 @@ public:
/**
* Get a pointer to the device memory.
*/
CUdeviceptr
getDevicePointer
()
{
CUdeviceptr
&
getDevicePointer
()
{
return
pointer
;
}
/**
...
...
platforms/cuda2/src/CudaContext.cpp
View file @
3e16cab9
This diff is collapsed.
Click to expand it.
platforms/cuda2/src/CudaContext.h
View file @
3e16cab9
...
...
@@ -72,11 +72,11 @@ public:
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
std
::
string
&
precision
,
const
std
::
string
&
compiler
,
const
std
::
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
);
~
CudaContext
();
//
/**
//
* This is called to initialize internal data structures after all Forces in the system
//
* have been initialized.
//
*/
//
void initialize();
/**
* This is called to initialize internal data structures after all Forces in the system
* have been initialized.
*/
void
initialize
();
/**
* Add a CudaForce to this context.
*/
...
...
@@ -123,12 +123,12 @@ public:
CudaArray
&
getVelm
()
{
return
*
velm
;
}
//
/**
//
* Get the array which contains the force on each atom.
//
*/
//
CudaArray
<mm_float4>
& getForce() {
//
return *force;
//
}
/**
* Get the array which contains the force on each atom
(respresented as a long3 in 64 bit fixed point)
.
*/
CudaArray
&
getForce
()
{
return
*
force
;
}
// /**
// * Get the array which contains the buffers in which forces are computed.
// */
...
...
@@ -184,36 +184,41 @@ public:
* omitted, a default set of options will be used
*/
CUmodule
createModule
(
const
std
::
string
source
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
defines
,
const
char
*
optimizationFlags
=
NULL
);
// /**
// * Execute a kernel.
// *
// * @param kernel the kernel to execute
// * @param workUnits the maximum number of work units that should be used
// * @param blockSize the size of each thread block to use
// */
// void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<float>& array);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<mm_float4>& array);
// /**
// * Set all elements of an array to 0.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void clearBuffer(cl::Memory& memory, int size);
// /**
// * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void addAutoclearBuffer(cl::Memory& memory, int size);
/**
* Get a kernel from a CUDA module.
*
* @param module the module to get the kernel from
* @param name the name of the kernel to get
*/
CUfunction
getKernel
(
CUmodule
&
module
,
const
std
::
string
&
name
);
/**
* Execute a kernel.
*
* @param kernel the kernel to execute
* @param arguments an array of pointers to the kernel arguments
* @param threads the maximum number of threads that should be used
* @param blockSize the size of each thread block to use
* @param sharedSize the amount of dynamic shared memory to allocated for the kernel, in bytes
*/
void
executeKernel
(
CUfunction
kernel
,
void
**
arguments
,
int
workUnits
,
int
blockSize
=
-
1
,
unsigned
int
sharedSize
=
0
);
/**
* Set all elements of an array to 0.
*/
void
clearBuffer
(
CudaArray
&
array
);
/**
* Set all elements of an array to 0.
*
* @param memory the memory to clear
* @param size the number of 4-byte elements in the buffer
*/
void
clearBuffer
(
CUdeviceptr
memory
,
int
size
);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the memory to clear
* @param size the number of float/double elements in the buffer
*/
void
addAutoclearBuffer
(
CUdeviceptr
memory
,
int
size
);
// /**
// * Clear all buffers that have been registered with addAutoclearBuffer().
// */
...
...
@@ -230,108 +235,110 @@ public:
// * Sum the buffesr containing forces.
// */
// void reduceForces();
// /**
// * Get the current simulation time.
// */
// double getTime() {
// return time;
// }
// /**
// * Set the current simulation time.
// */
// void setTime(double t) {
// time = t;
// }
// /**
// * Get the number of integration steps that have been taken.
// */
// int getStepCount() {
// return stepCount;
// }
// /**
// * Set the number of integration steps that have been taken.
// */
// void setStepCount(int steps) {
// stepCount = steps;
// }
// /**
// * Get the number of times forces or energy has been computed.
// */
// int getComputeForceCount() {
// return computeForceCount;
// }
// /**
// * Set the number of times forces or energy has been computed.
// */
// void setComputeForceCount(int count) {
// computeForceCount = count;
// }
// /**
// * Get the number of atoms.
// */
// int getNumAtoms() const {
// return numAtoms;
// }
// /**
// * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
// * most arrays with one element per atom.
// */
// int getPaddedNumAtoms() const {
// return paddedNumAtoms;
// }
// /**
// * Get the number of blocks of TileSize atoms.
// */
// int getNumAtomBlocks() const {
// return numAtomBlocks;
// }
// /**
// * Get the standard number of thread blocks to use when executing kernels.
// */
// int getNumThreadBlocks() const {
// return numThreadBlocks;
// }
// /**
// * Get the number of force buffers.
// */
// int getNumForceBuffers() const {
// return numForceBuffers;
// }
// /**
// * Get the SIMD width of the device being used.
// */
// int getSIMDWidth() const {
// return simdWidth;
// }
// /**
// * Get whether the device being used supports 64 bit atomic operations on global memory.
// */
// bool getSupports64BitGlobalAtomics() {
// return supports64BitGlobalAtomics;
// }
// /**
// * Get whether the device being used supports double precision math.
// */
// bool getSupportsDoublePrecision() {
// return supportsDoublePrecision;
// }
/**
* Get the current simulation time.
*/
double
getTime
()
{
return
time
;
}
/**
* Set the current simulation time.
*/
void
setTime
(
double
t
)
{
time
=
t
;
}
/**
* Get the number of integration steps that have been taken.
*/
int
getStepCount
()
{
return
stepCount
;
}
/**
* Set the number of integration steps that have been taken.
*/
void
setStepCount
(
int
steps
)
{
stepCount
=
steps
;
}
/**
* Get the number of times forces or energy has been computed.
*/
int
getComputeForceCount
()
{
return
computeForceCount
;
}
/**
* Set the number of times forces or energy has been computed.
*/
void
setComputeForceCount
(
int
count
)
{
computeForceCount
=
count
;
}
/**
* Get the number of atoms.
*/
int
getNumAtoms
()
const
{
return
numAtoms
;
}
/**
* Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
* most arrays with one element per atom.
*/
int
getPaddedNumAtoms
()
const
{
return
paddedNumAtoms
;
}
/**
* Get the number of blocks of TileSize atoms.
*/
int
getNumAtomBlocks
()
const
{
return
numAtomBlocks
;
}
/**
* Get the standard number of thread blocks to use when executing kernels.
*/
int
getNumThreadBlocks
()
const
{
return
numThreadBlocks
;
}
/**
* Get whether double precision is being used.
*/
bool
getUseDoublePrecision
()
{
return
useDoublePrecision
;
}
/**
* Get whether accumulation is being done in double precision.
*/
bool
getAccumulateInDouble
()
{
return
accumulateInDouble
;
}
/**
* Convert a number to a string in a format suitable for including in a kernel.
* This takes into account whether the context uses single or double precision.
*/
std
::
string
doubleToString
(
double
value
);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
std
::
string
intToString
(
int
value
);
/**
* Convert a CUDA result code to the corresponding string description.
*/
std
::
string
getErrorString
(
CUresult
result
);
// /**
// * Get the size of the periodic box.
// */
//
mm_
float4 getPeriodicBoxSize() const {
// float4 getPeriodicBoxSize() const {
// return periodicBoxSize;
// }
// /**
// * Set the size of the periodic box.
// */
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// periodicBoxSize = m
m
_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = m
m
_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// periodicBoxSize = m
ake
_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = m
ake
_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// }
// /**
// * Get the inverse of the size of the periodic box.
// */
//
mm_
float4 getInvPeriodicBoxSize() const {
// float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize;
// }
// /**
...
...
@@ -352,66 +359,66 @@ public:
// CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded;
// }
//
/**
//
* Get the thread used by this context for executing parallel computations.
//
*/
//
WorkThread& getWorkThread() {
//
return *thread;
//
}
//
/**
//
* Get whether atoms were reordered during the most recent force/energy computation.
//
*/
//
bool getAtomsWereReordered() const {
//
return atomsWereReordered;
//
}
//
/**
//
* Set whether atoms were reordered during the most recent force/energy computation.
//
*/
//
void setAtomsWereReordered(bool wereReordered) {
//
atomsWereReordered = wereReordered;
//
}
//
/**
//
* Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
//
* together in the arrays.
//
*
//
* @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
//
*/
//
void reorderAtoms(bool enforcePeriodic);
//
/**
//
* Add a listener that should be called whenever atoms get reordered. The CudaContext
//
* assumes ownership of the object, and deletes it when the context itself is deleted.
//
*/
//
void addReorderListener(ReorderListener* listener);
//
/**
//
* Get the list of ReorderListeners.
//
*/
//
std::vector<ReorderListener*>& getReorderListeners() {
//
return reorderListeners;
//
}
//
/**
//
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
//
* This should be called whenever force field parameters change. It will cause the definitions
//
* and order to be revalidated the next to reorderAtoms() is called.
//
*/
//
void invalidateMolecules();
//
/**
//
* Get whether the current molecule definitions are valid.
//
*/
//
bool getMoleculesAreInvalid() {
//
return moleculesInvalid;
//
}
/**
* Get the thread used by this context for executing parallel computations.
*/
WorkThread
&
getWorkThread
()
{
return
*
thread
;
}
/**
* Get whether atoms were reordered during the most recent force/energy computation.
*/
bool
getAtomsWereReordered
()
const
{
return
atomsWereReordered
;
}
/**
* Set whether atoms were reordered during the most recent force/energy computation.
*/
void
setAtomsWereReordered
(
bool
wereReordered
)
{
atomsWereReordered
=
wereReordered
;
}
/**
* Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
* together in the arrays.
*
* @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
*/
void
reorderAtoms
(
bool
enforcePeriodic
);
/**
* Add a listener that should be called whenever atoms get reordered. The CudaContext
* assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void
addReorderListener
(
ReorderListener
*
listener
);
/**
* Get the list of ReorderListeners.
*/
std
::
vector
<
ReorderListener
*>&
getReorderListeners
()
{
return
reorderListeners
;
}
/**
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
* This should be called whenever force field parameters change. It will cause the definitions
* and order to be revalidated the next to reorderAtoms() is called.
*/
void
invalidateMolecules
();
/**
* Get whether the current molecule definitions are valid.
*/
bool
getMoleculesAreInvalid
()
{
return
moleculesInvalid
;
}
private:
struct
Molecule
;
struct
MoleculeGroup
;
class
VirtualSiteInfo
;
//
void findMoleculeGroups();
//
static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
//
/**
//
* Ensure that all molecules marked as "identical" really are identical. This should be
//
* called whenever force field parameters change. If necessary, it will rebuild the list
//
* of molecules and resort the atoms.
//
*/
//
void validateMolecules();
void
findMoleculeGroups
();
static
void
tagAtomsInMolecule
(
int
atom
,
int
molecule
,
std
::
vector
<
int
>&
atomMolecule
,
std
::
vector
<
std
::
vector
<
int
>
>&
atomBonds
);
/**
* Ensure that all molecules marked as "identical" really are identical. This should be
* called whenever force field parameters change. If necessary, it will rebuild the list
* of molecules and resort the atoms.
*/
void
validateMolecules
();
static
bool
hasInitializedCuda
;
const
System
&
system
;
double
time
;
...
...
@@ -424,8 +431,6 @@ private:
int
paddedNumAtoms
;
int
numAtomBlocks
;
int
numThreadBlocks
;
// int numForceBuffers;
// int simdWidth;
bool
useBlockingSync
,
useDoublePrecision
,
accumulateInDouble
,
contextIsValid
,
atomsWereReordered
,
moleculesInvalid
;
std
::
string
compiler
,
tempDir
,
gpuArchitecture
;
float4
periodicBoxSize
;
...
...
@@ -446,15 +451,15 @@ private:
std
::
vector
<
Molecule
>
molecules
;
std
::
vector
<
MoleculeGroup
>
moleculeGroups
;
std
::
vector
<
int4
>
posCellOffsets
;
void
*
pinnedBuffer
;
CudaArray
*
posq
;
CudaArray
*
velm
;
// CudaArray<mm_float4>* force;
// CudaArray<mm_float4>* forceBuffers;
// CudaArray<cl_long>* longForceBuffer;
// CudaArray<cl_float>* energyBuffer;
// CudaArray<cl_int>* atomIndex;
// std::vector<cl::Memory*> autoclearBuffers;
// std::vector<int> autoclearBufferSizes;
CudaArray
*
force
;
CudaArray
*
energyBuffer
;
CudaArray
*
atomIndexDevice
;
std
::
vector
<
int
>
atomIndex
;
std
::
vector
<
CUdeviceptr
>
autoclearBuffers
;
std
::
vector
<
int
>
autoclearBufferSizes
;
std
::
vector
<
ReorderListener
*>
reorderListeners
;
// CudaIntegrationUtilities* integration;
// CudaBondedUtilities* bonded;
...
...
platforms/cuda2/src/CudaPlatform.cpp
View file @
3e16cab9
...
...
@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
device
<<
contexts
[
i
]
->
getDeviceIndex
();
}
propertyValues
[
CudaPlatform
::
CudaDeviceIndex
()]
=
device
.
str
();
propertyValues
[
CudaPlatform
::
CudaUseBlockingSync
()]
=
blocking
?
"true"
:
"false"
;
propertyValues
[
CudaPlatform
::
CudaPrecision
()]
=
precisionProperty
;
propertyValues
[
CudaPlatform
::
CudaCompiler
()]
=
compilerProperty
;
propertyValues
[
CudaPlatform
::
CudaTempDirectory
()]
=
tempProperty
;
...
...
@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() {
}
void
CudaPlatform
::
PlatformData
::
initializeContexts
(
const
System
&
system
)
{
//
for (int i = 0; i < (int) contexts.size(); i++)
//
contexts[i]->initialize();
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
contexts
[
i
]
->
initialize
();
}
void
CudaPlatform
::
PlatformData
::
syncContexts
()
{
//
for (int i = 0; i < (int) contexts.size(); i++)
//
contexts[i]->getWorkThread().flush();
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
contexts
[
i
]
->
getWorkThread
().
flush
();
}
platforms/cuda2/src/CudaSort.cpp
0 → 100644
View file @
3e16cab9
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaSort.h"
#include "CudaKernelSources.h"
#include <map>
using
namespace
OpenMM
;
using
namespace
std
;
CudaSort
::
CudaSort
(
CudaContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
)
:
context
(
context
),
trait
(
trait
),
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
// Create kernels.
map
<
string
,
string
>
replacements
;
replacements
[
"DATA_TYPE"
]
=
trait
->
getDataType
();
replacements
[
"KEY_TYPE"
]
=
trait
->
getKeyType
();
replacements
[
"SORT_KEY"
]
=
trait
->
getSortKey
();
replacements
[
"MIN_KEY"
]
=
trait
->
getMinKey
();
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
CUmodule
module
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
sort
,
replacements
));
computeRangeKernel
=
context
.
getKernel
(
module
,
"computeRange"
);
assignElementsKernel
=
context
.
getKernel
(
module
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
context
.
getKernel
(
module
,
"computeBucketPositions"
);
copyToBucketsKernel
=
context
.
getKernel
(
module
,
"copyDataToBuckets"
);
sortBucketsKernel
=
context
.
getKernel
(
module
,
"sortBuckets"
);
// Work out the work group sizes for various kernels.
int
maxBlockSize
;
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
rangeKernelSize
;
sortKernelSize
=
rangeKernelSize
/
2
;
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
if
(
sortKernelSize
>
maxLocalBuffer
)
sortKernelSize
=
maxLocalBuffer
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
unsigned
int
numBuckets
=
length
/
targetBucketSize
;
if
(
numBuckets
<
1
)
numBuckets
=
1
;
if
(
positionsKernelSize
>
numBuckets
)
positionsKernelSize
=
numBuckets
;
// Create workspace arrays.
dataRange
=
new
CudaArray
(
2
,
trait
->
getKeySize
(),
"sortDataRange"
);
bucketOffset
=
CudaArray
::
create
<
uint1
>
(
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
CudaArray
::
create
<
uint1
>
(
length
,
"bucketOfElement"
);
offsetInBucket
=
CudaArray
::
create
<
uint1
>
(
length
,
"offsetInBucket"
);
buckets
=
new
CudaArray
(
length
,
trait
->
getDataSize
(),
"buckets"
);
}
CudaSort
::~
CudaSort
()
{
delete
trait
;
if
(
dataRange
!=
NULL
)
delete
dataRange
;
if
(
bucketOfElement
!=
NULL
)
delete
bucketOfElement
;
if
(
offsetInBucket
!=
NULL
)
delete
offsetInBucket
;
if
(
bucketOffset
!=
NULL
)
delete
bucketOffset
;
if
(
buckets
!=
NULL
)
delete
buckets
;
}
void
CudaSort
::
sort
(
CudaArray
&
data
)
{
if
(
data
.
getSize
()
!=
bucketOfElement
->
getSize
()
||
data
.
getElementSize
()
!=
trait
->
getDataSize
())
throw
OpenMMException
(
"CudaSort called with different data size"
);
if
(
data
.
getSize
()
==
0
)
return
;
// Compute the range of data values.
unsigned
int
dataSize
=
data
.
getSize
();
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataSize
,
&
dataRange
->
getDevicePointer
()};
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
// Assign array elements to buckets.
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
*
bucketOffset
);
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataSize
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
// Compute the position of each bucket.
void
*
computeArgs
[]
=
{
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
computeBucketPositionsKernel
,
computeArgs
,
positionsKernelSize
,
positionsKernelSize
,
positionsKernelSize
*
sizeof
(
int
));
// Copy the data into the buckets.
void
*
copyArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
dataSize
,
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
copyToBucketsKernel
,
copyArgs
,
data
.
getSize
());
// Sort each bucket.
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
sortBucketsKernel
,
sortArgs
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
,
sortKernelSize
*
trait
->
getDataSize
());
}
platforms/cuda2/src/CudaSort.h
0 → 100644
View file @
3e16cab9
#ifndef __OPENMM_CUDASORT_H__
#define __OPENMM_CUDASORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "openmm/internal/windowsExport.h"
#include "CudaContext.h"
namespace
OpenMM
{
/**
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The sorting behavior is specified by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* class SortTrait : public CudaSort::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
* const char* getKeyType() const {return "float";}
* const char* getMinKey() const {return "-MAXFLOAT";}
* const char* getMaxKey() const {return "MAXFLOAT";}
* const char* getMaxValue() const {return "MAXFLOAT";}
* const char* getSortKey() const {return "value";}
* };
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in local memory when possible, in global memory otherwise). This is similar to
* the algorithm described in
*
* Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng. "An Efficient
* Sorting Algorithm with CUDA" Journal of the Chinese Institute of Engineers, 32(7),
* pp. 915-921 (2009)
*
* but with many modifications and simplifications. In particular, this algorithm
* involves much less communication between host and device, which is critical to get
* good performance with the array sizes we typically work with (10,000 to 100,000
* elements).
*/
class
OPENMM_EXPORT
CudaSort
{
public:
class
SortTrait
;
/**
* Create a CudaSort object for sorting data of a particular type.
*
* @param context the context in which to perform calculations
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the CudaSort is deleted.
* @param length the length of the arrays this object will be used to sort
*/
CudaSort
(
CudaContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
);
~
CudaSort
();
/**
* Sort an array.
*/
void
sort
(
CudaArray
&
data
);
private:
CudaContext
&
context
;
SortTrait
*
trait
;
CudaArray
*
dataRange
;
CudaArray
*
bucketOfElement
;
CudaArray
*
offsetInBucket
;
CudaArray
*
bucketOffset
;
CudaArray
*
buckets
;
CUfunction
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
};
/**
* A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
*/
class
CudaSort
::
SortTrait
{
public:
/**
* Get the size of each data value in bytes.
*/
virtual
int
getDataSize
()
const
=
0
;
/**
* Get the size of each key value in bytes.
*/
virtual
int
getKeySize
()
const
=
0
;
/**
* Get the data type of the values to sort.
*/
virtual
const
char
*
getDataType
()
const
=
0
;
/**
* Get the data type of the sorting key.
*/
virtual
const
char
*
getKeyType
()
const
=
0
;
/**
* Get the minimum value a key can take.
*/
virtual
const
char
*
getMinKey
()
const
=
0
;
/**
* Get the maximum value a key can take.
*/
virtual
const
char
*
getMaxKey
()
const
=
0
;
/**
* Get a value whose key is guaranteed to equal getMaxKey().
*/
virtual
const
char
*
getMaxValue
()
const
=
0
;
/**
* Get the CUDA code to select the key from the data value.
*/
virtual
const
char
*
getSortKey
()
const
=
0
;
};
}
// namespace OpenMM
#endif // __OPENMM_CUDASORT_H__
platforms/cuda2/src/kernels/sort.cu
0 → 100644
View file @
3e16cab9
__device__
KEY_TYPE
getValue
(
DATA_TYPE
value
)
{
return
SORT_KEY
;
}
extern
"C"
{
/**
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
*/
__global__
void
computeRange
(
const
DATA_TYPE
*
__restrict__
data
,
unsigned
int
length
,
KEY_TYPE
*
__restrict__
range
)
{
extern
__shared__
KEY_TYPE
rangeBuffer
[];
KEY_TYPE
minimum
=
MAX_KEY
;
KEY_TYPE
maximum
=
MIN_KEY
;
// Each thread calculates the range of a subset of values.
for
(
unsigned
int
index
=
threadIdx
.
x
;
index
<
length
;
index
+=
blockDim
.
x
)
{
KEY_TYPE
value
=
getValue
(
data
[
index
]);
minimum
=
min
(
minimum
,
value
);
maximum
=
max
(
maximum
,
value
);
}
// Now reduce them.
rangeBuffer
[
threadIdx
.
x
]
=
minimum
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
rangeBuffer
[
threadIdx
.
x
]
=
min
(
rangeBuffer
[
threadIdx
.
x
],
rangeBuffer
[
threadIdx
.
x
+
step
]);
__syncthreads
();
}
minimum
=
rangeBuffer
[
0
];
rangeBuffer
[
threadIdx
.
x
]
=
maximum
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
rangeBuffer
[
threadIdx
.
x
]
=
max
(
rangeBuffer
[
threadIdx
.
x
],
rangeBuffer
[
threadIdx
.
x
+
step
]);
__syncthreads
();
}
maximum
=
rangeBuffer
[
0
];
if
(
threadIdx
.
x
==
0
)
{
range
[
0
]
=
minimum
;
range
[
1
]
=
maximum
;
}
}
/**
* Assign elements to buckets.
*/
__global__
void
assignElementsToBuckets
(
const
DATA_TYPE
*
__restrict__
data
,
unsigned
int
length
,
unsigned
int
numBuckets
,
const
KEY_TYPE
*
__restrict__
range
,
unsigned
int
*
bucketOffset
,
unsigned
int
*
__restrict__
bucketOfElement
,
unsigned
int
*
__restrict__
offsetInBucket
)
{
float
minValue
=
(
float
)
(
range
[
0
]);
float
maxValue
=
(
float
)
(
range
[
1
]);
float
bucketWidth
=
(
maxValue
-
minValue
)
/
numBuckets
;
for
(
unsigned
int
index
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
index
<
length
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
float
key
=
(
float
)
getValue
(
data
[
index
]);
unsigned
int
bucketIndex
=
min
((
unsigned
int
)
((
key
-
minValue
)
/
bucketWidth
),
numBuckets
-
1
);
offsetInBucket
[
index
]
=
atomicAdd
(
&
bucketOffset
[
bucketIndex
],
1
);
bucketOfElement
[
index
]
=
bucketIndex
;
}
}
/**
* Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group.
*/
__global__
void
computeBucketPositions
(
unsigned
int
numBuckets
,
unsigned
int
*
__restrict__
bucketOffset
)
{
extern
__shared__
unsigned
int
posBuffer
[];
unsigned
int
globalOffset
=
0
;
for
(
unsigned
int
startBucket
=
0
;
startBucket
<
numBuckets
;
startBucket
+=
blockDim
.
x
)
{
// Load the bucket sizes into local memory.
unsigned
int
globalIndex
=
startBucket
+
threadIdx
.
x
;
posBuffer
[
threadIdx
.
x
]
=
(
globalIndex
<
numBuckets
?
bucketOffset
[
globalIndex
]
:
0
);
__syncthreads
();
// Perform a parallel prefix sum.
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
unsigned
int
add
=
(
threadIdx
.
x
>=
step
?
posBuffer
[
threadIdx
.
x
-
step
]
:
0
);
__syncthreads
();
posBuffer
[
threadIdx
.
x
]
+=
add
;
__syncthreads
();
}
// Write the results back to global memory.
if
(
globalIndex
<
numBuckets
)
bucketOffset
[
globalIndex
]
=
posBuffer
[
threadIdx
.
x
]
+
globalOffset
;
globalOffset
+=
posBuffer
[
blockDim
.
x
-
1
];
}
}
/**
* Copy the input data into the buckets for sorting.
*/
__global__
void
copyDataToBuckets
(
const
DATA_TYPE
*
__restrict__
data
,
DATA_TYPE
*
__restrict__
buckets
,
unsigned
int
length
,
const
unsigned
int
*
__restrict__
bucketOffset
,
const
unsigned
int
*
__restrict__
bucketOfElement
,
const
unsigned
int
*
__restrict__
offsetInBucket
)
{
for
(
unsigned
int
index
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
index
<
length
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
DATA_TYPE
element
=
data
[
index
];
unsigned
int
bucketIndex
=
bucketOfElement
[
index
];
unsigned
int
offset
=
(
bucketIndex
==
0
?
0
:
bucketOffset
[
bucketIndex
-
1
]);
buckets
[
offset
+
offsetInBucket
[
index
]]
=
element
;
}
}
/**
* Sort the data in each bucket.
*/
__global__
void
sortBuckets
(
DATA_TYPE
*
__restrict__
data
,
const
DATA_TYPE
*
__restrict__
buckets
,
unsigned
int
numBuckets
,
const
unsigned
int
*
__restrict__
bucketOffset
)
{
extern
__shared__
DATA_TYPE
dataBuffer
[];
for
(
unsigned
int
index
=
blockIdx
.
x
;
index
<
numBuckets
;
index
+=
gridDim
.
x
)
{
unsigned
int
startIndex
=
(
index
==
0
?
0
:
bucketOffset
[
index
-
1
]);
unsigned
int
endIndex
=
bucketOffset
[
index
];
unsigned
int
length
=
endIndex
-
startIndex
;
if
(
length
<=
blockDim
.
x
)
{
// Load the data into local memory.
if
(
threadIdx
.
x
<
length
)
dataBuffer
[
threadIdx
.
x
]
=
buckets
[
startIndex
+
threadIdx
.
x
];
else
dataBuffer
[
threadIdx
.
x
]
=
MAX_VALUE
;
__syncthreads
();
// Perform a bitonic sort in local memory.
for
(
unsigned
int
k
=
2
;
k
<=
blockDim
.
x
;
k
*=
2
)
{
for
(
unsigned
int
j
=
k
/
2
;
j
>
0
;
j
/=
2
)
{
int
ixj
=
threadIdx
.
x
^
j
;
if
(
ixj
>
threadIdx
.
x
)
{
DATA_TYPE
value1
=
dataBuffer
[
threadIdx
.
x
];
DATA_TYPE
value2
=
dataBuffer
[
ixj
];
bool
ascending
=
(
threadIdx
.
x
&
k
)
==
0
;
KEY_TYPE
lowKey
=
(
ascending
?
getValue
(
value1
)
:
getValue
(
value2
));
KEY_TYPE
highKey
=
(
ascending
?
getValue
(
value2
)
:
getValue
(
value1
));
if
(
lowKey
>
highKey
)
{
dataBuffer
[
threadIdx
.
x
]
=
value2
;
dataBuffer
[
ixj
]
=
value1
;
}
}
__syncthreads
();
}
}
// Write the data to the sorted array.
if
(
threadIdx
.
x
<
length
)
data
[
startIndex
+
threadIdx
.
x
]
=
dataBuffer
[
threadIdx
.
x
];
}
else
{
// Copy the bucket data over to the output array.
for
(
unsigned
int
i
=
threadIdx
.
x
;
i
<
length
;
i
+=
blockDim
.
x
)
data
[
startIndex
+
i
]
=
buckets
[
startIndex
+
i
];
__threadfence_block
();
__syncthreads
();
// Perform a bitonic sort in global memory.
for
(
unsigned
int
k
=
2
;
k
<
2
*
length
;
k
*=
2
)
{
for
(
unsigned
int
j
=
k
/
2
;
j
>
0
;
j
/=
2
)
{
for
(
unsigned
int
i
=
threadIdx
.
x
;
i
<
length
;
i
+=
blockDim
.
x
)
{
int
ixj
=
i
^
j
;
if
(
ixj
>
i
&&
ixj
<
length
)
{
DATA_TYPE
value1
=
data
[
startIndex
+
i
];
DATA_TYPE
value2
=
data
[
startIndex
+
ixj
];
bool
ascending
=
((
i
&
k
)
==
0
);
for
(
unsigned
int
mask
=
k
*
2
;
mask
<
2
*
length
;
mask
*=
2
)
ascending
=
((
i
&
mask
)
==
0
?
!
ascending
:
ascending
);
KEY_TYPE
lowKey
=
(
ascending
?
getValue
(
value1
)
:
getValue
(
value2
));
KEY_TYPE
highKey
=
(
ascending
?
getValue
(
value2
)
:
getValue
(
value1
));
if
(
lowKey
>
highKey
)
{
data
[
startIndex
+
i
]
=
value2
;
data
[
startIndex
+
ixj
]
=
value1
;
}
}
}
__threadfence_block
();
__syncthreads
();
}
}
}
}
}
}
\ No newline at end of file
platforms/cuda2/src/kernels/utilities.cu
View file @
3e16cab9
extern
"C"
{
/**
* This is called by the various functions below to clear a buffer.
*/
...
...
@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest
buffer
[
index
]
=
sum
;
}
}
}
\ No newline at end of file
platforms/cuda2/tests/TestCudaSort.cpp
0 → 100644
View file @
3e16cab9
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of sorting.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "../src/CudaArray.h"
#include "../src/CudaContext.h"
#include "../src/CudaSort.h"
#include "sfmt/SFMT.h"
#include "openmm/System.h"
#include <iostream>
#include <cmath>
#include <set>
using
namespace
OpenMM
;
using
namespace
std
;
class
SortTrait
:
public
CudaSort
::
SortTrait
{
int
getDataSize
()
const
{
return
4
;}
int
getKeySize
()
const
{
return
4
;}
const
char
*
getDataType
()
const
{
return
"float"
;}
const
char
*
getKeyType
()
const
{
return
"float"
;}
const
char
*
getMinKey
()
const
{
return
"-MAXFLOAT"
;}
const
char
*
getMaxKey
()
const
{
return
"MAXFLOAT"
;}
const
char
*
getMaxValue
()
const
{
return
"MAXFLOAT"
;}
const
char
*
getSortKey
()
const
{
return
"value"
;}
};
void
verifySorting
(
vector
<
float
>
array
)
{
// Sort the array.
System
system
;
system
.
addParticle
(
0.0
);
CudaPlatform
platform
;
CudaPlatform
::
PlatformData
platformData
(
system
,
""
,
"true"
,
"single"
,
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaCompiler
()),
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaTempDirectory
()));
CudaContext
&
context
=
*
platformData
.
contexts
[
0
];
context
.
initialize
();
CudaArray
data
(
array
.
size
(),
4
,
"sortData"
);
data
.
upload
(
array
);
CudaSort
sort
(
context
,
new
SortTrait
(),
array
.
size
());
sort
.
sort
(
data
);
vector
<
float
>
sorted
;
data
.
download
(
sorted
);
// Verify that it is in sorted order.
for
(
int
i
=
1
;
i
<
(
int
)
sorted
.
size
();
i
++
)
ASSERT
(
sorted
[
i
-
1
]
<=
sorted
[
i
]);
// Make sure the sorted array contains the same values as the original one.
multiset
<
float
>
elements1
(
array
.
begin
(),
array
.
end
());
multiset
<
float
>
elements2
(
sorted
.
begin
(),
sorted
.
end
());
ASSERT
(
elements1
==
elements2
);
}
void
testUniformValues
()
{
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
vector
<
float
>
array
(
10000
);
for
(
int
i
=
0
;
i
<
(
int
)
array
.
size
();
i
++
)
array
[
i
]
=
(
float
)
genrand_real2
(
sfmt
);
verifySorting
(
array
);
}
void
testLogValues
()
{
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
vector
<
float
>
array
(
10000
);
for
(
int
i
=
0
;
i
<
(
int
)
array
.
size
();
i
++
)
array
[
i
]
=
(
float
)
log
(
genrand_real2
(
sfmt
));
verifySorting
(
array
);
}
int
main
()
{
try
{
testUniformValues
();
testLogValues
();
}
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
return
1
;
}
cout
<<
"Done"
<<
endl
;
return
0
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment