Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
3e16cab9
"vscode:/vscode.git/clone" did not exist on "817fd3bced1aa84ecbd9bf540212f57a4372dcd7"
Commit
3e16cab9
authored
Jun 05, 2012
by
Peter Eastman
Browse files
Continuing to implement new CUDA platform
parent
abb8cb4b
Changes
9
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
1158 additions
and
509 deletions
+1158
-509
platforms/cuda2/src/CudaArray.h
platforms/cuda2/src/CudaArray.h
+1
-1
platforms/cuda2/src/CudaContext.cpp
platforms/cuda2/src/CudaContext.cpp
+367
-309
platforms/cuda2/src/CudaContext.h
platforms/cuda2/src/CudaContext.h
+200
-195
platforms/cuda2/src/CudaPlatform.cpp
platforms/cuda2/src/CudaPlatform.cpp
+5
-4
platforms/cuda2/src/CudaSort.cpp
platforms/cuda2/src/CudaSort.cpp
+132
-0
platforms/cuda2/src/CudaSort.h
platforms/cuda2/src/CudaSort.h
+141
-0
platforms/cuda2/src/kernels/sort.cu
platforms/cuda2/src/kernels/sort.cu
+186
-0
platforms/cuda2/src/kernels/utilities.cu
platforms/cuda2/src/kernels/utilities.cu
+4
-0
platforms/cuda2/tests/TestCudaSort.cpp
platforms/cuda2/tests/TestCudaSort.cpp
+122
-0
No files found.
platforms/cuda2/src/CudaArray.h
View file @
3e16cab9
...
...
@@ -83,7 +83,7 @@ public:
/**
* Get a pointer to the device memory.
*/
CUdeviceptr
getDevicePointer
()
{
CUdeviceptr
&
getDevicePointer
()
{
return
pointer
;
}
/**
...
...
platforms/cuda2/src/CudaContext.cpp
View file @
3e16cab9
This diff is collapsed.
Click to expand it.
platforms/cuda2/src/CudaContext.h
View file @
3e16cab9
...
...
@@ -72,11 +72,11 @@ public:
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
std
::
string
&
precision
,
const
std
::
string
&
compiler
,
const
std
::
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
);
~
CudaContext
();
//
/**
//
* This is called to initialize internal data structures after all Forces in the system
//
* have been initialized.
//
*/
//
void initialize();
/**
* This is called to initialize internal data structures after all Forces in the system
* have been initialized.
*/
void
initialize
();
/**
* Add a CudaForce to this context.
*/
...
...
@@ -123,12 +123,12 @@ public:
CudaArray
&
getVelm
()
{
return
*
velm
;
}
//
/**
//
* Get the array which contains the force on each atom.
//
*/
//
CudaArray
<mm_float4>
& getForce() {
//
return *force;
//
}
/**
* Get the array which contains the force on each atom
(respresented as a long3 in 64 bit fixed point)
.
*/
CudaArray
&
getForce
()
{
return
*
force
;
}
// /**
// * Get the array which contains the buffers in which forces are computed.
// */
...
...
@@ -184,36 +184,41 @@ public:
* omitted, a default set of options will be used
*/
CUmodule
createModule
(
const
std
::
string
source
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
defines
,
const
char
*
optimizationFlags
=
NULL
);
// /**
// * Execute a kernel.
// *
// * @param kernel the kernel to execute
// * @param workUnits the maximum number of work units that should be used
// * @param blockSize the size of each thread block to use
// */
// void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<float>& array);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<mm_float4>& array);
// /**
// * Set all elements of an array to 0.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void clearBuffer(cl::Memory& memory, int size);
// /**
// * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void addAutoclearBuffer(cl::Memory& memory, int size);
/**
* Get a kernel from a CUDA module.
*
* @param module the module to get the kernel from
* @param name the name of the kernel to get
*/
CUfunction
getKernel
(
CUmodule
&
module
,
const
std
::
string
&
name
);
/**
* Execute a kernel.
*
* @param kernel the kernel to execute
* @param arguments an array of pointers to the kernel arguments
* @param threads the maximum number of threads that should be used
* @param blockSize the size of each thread block to use
* @param sharedSize the amount of dynamic shared memory to allocated for the kernel, in bytes
*/
void
executeKernel
(
CUfunction
kernel
,
void
**
arguments
,
int
workUnits
,
int
blockSize
=
-
1
,
unsigned
int
sharedSize
=
0
);
/**
* Set all elements of an array to 0.
*/
void
clearBuffer
(
CudaArray
&
array
);
/**
* Set all elements of an array to 0.
*
* @param memory the memory to clear
* @param size the number of 4-byte elements in the buffer
*/
void
clearBuffer
(
CUdeviceptr
memory
,
int
size
);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the memory to clear
* @param size the number of float/double elements in the buffer
*/
void
addAutoclearBuffer
(
CUdeviceptr
memory
,
int
size
);
// /**
// * Clear all buffers that have been registered with addAutoclearBuffer().
// */
...
...
@@ -230,108 +235,110 @@ public:
// * Sum the buffesr containing forces.
// */
// void reduceForces();
// /**
// * Get the current simulation time.
// */
// double getTime() {
// return time;
// }
// /**
// * Set the current simulation time.
// */
// void setTime(double t) {
// time = t;
// }
// /**
// * Get the number of integration steps that have been taken.
// */
// int getStepCount() {
// return stepCount;
// }
// /**
// * Set the number of integration steps that have been taken.
// */
// void setStepCount(int steps) {
// stepCount = steps;
// }
// /**
// * Get the number of times forces or energy has been computed.
// */
// int getComputeForceCount() {
// return computeForceCount;
// }
// /**
// * Set the number of times forces or energy has been computed.
// */
// void setComputeForceCount(int count) {
// computeForceCount = count;
// }
// /**
// * Get the number of atoms.
// */
// int getNumAtoms() const {
// return numAtoms;
// }
// /**
// * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
// * most arrays with one element per atom.
// */
// int getPaddedNumAtoms() const {
// return paddedNumAtoms;
// }
// /**
// * Get the number of blocks of TileSize atoms.
// */
// int getNumAtomBlocks() const {
// return numAtomBlocks;
// }
// /**
// * Get the standard number of thread blocks to use when executing kernels.
// */
// int getNumThreadBlocks() const {
// return numThreadBlocks;
// }
// /**
// * Get the number of force buffers.
// */
// int getNumForceBuffers() const {
// return numForceBuffers;
// }
// /**
// * Get the SIMD width of the device being used.
// */
// int getSIMDWidth() const {
// return simdWidth;
// }
// /**
// * Get whether the device being used supports 64 bit atomic operations on global memory.
// */
// bool getSupports64BitGlobalAtomics() {
// return supports64BitGlobalAtomics;
// }
// /**
// * Get whether the device being used supports double precision math.
// */
// bool getSupportsDoublePrecision() {
// return supportsDoublePrecision;
// }
/**
* Get the current simulation time.
*/
double
getTime
()
{
return
time
;
}
/**
* Set the current simulation time.
*/
void
setTime
(
double
t
)
{
time
=
t
;
}
/**
* Get the number of integration steps that have been taken.
*/
int
getStepCount
()
{
return
stepCount
;
}
/**
* Set the number of integration steps that have been taken.
*/
void
setStepCount
(
int
steps
)
{
stepCount
=
steps
;
}
/**
* Get the number of times forces or energy has been computed.
*/
int
getComputeForceCount
()
{
return
computeForceCount
;
}
/**
* Set the number of times forces or energy has been computed.
*/
void
setComputeForceCount
(
int
count
)
{
computeForceCount
=
count
;
}
/**
* Get the number of atoms.
*/
int
getNumAtoms
()
const
{
return
numAtoms
;
}
/**
* Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
* most arrays with one element per atom.
*/
int
getPaddedNumAtoms
()
const
{
return
paddedNumAtoms
;
}
/**
* Get the number of blocks of TileSize atoms.
*/
int
getNumAtomBlocks
()
const
{
return
numAtomBlocks
;
}
/**
* Get the standard number of thread blocks to use when executing kernels.
*/
int
getNumThreadBlocks
()
const
{
return
numThreadBlocks
;
}
/**
* Get whether double precision is being used.
*/
bool
getUseDoublePrecision
()
{
return
useDoublePrecision
;
}
/**
* Get whether accumulation is being done in double precision.
*/
bool
getAccumulateInDouble
()
{
return
accumulateInDouble
;
}
/**
* Convert a number to a string in a format suitable for including in a kernel.
* This takes into account whether the context uses single or double precision.
*/
std
::
string
doubleToString
(
double
value
);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
std
::
string
intToString
(
int
value
);
/**
* Convert a CUDA result code to the corresponding string description.
*/
std
::
string
getErrorString
(
CUresult
result
);
// /**
// * Get the size of the periodic box.
// */
//
mm_
float4 getPeriodicBoxSize() const {
// float4 getPeriodicBoxSize() const {
// return periodicBoxSize;
// }
// /**
// * Set the size of the periodic box.
// */
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// periodicBoxSize = m
m
_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = m
m
_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// periodicBoxSize = m
ake
_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = m
ake
_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// }
// /**
// * Get the inverse of the size of the periodic box.
// */
//
mm_
float4 getInvPeriodicBoxSize() const {
// float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize;
// }
// /**
...
...
@@ -352,66 +359,66 @@ public:
// CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded;
// }
//
/**
//
* Get the thread used by this context for executing parallel computations.
//
*/
//
WorkThread& getWorkThread() {
//
return *thread;
//
}
//
/**
//
* Get whether atoms were reordered during the most recent force/energy computation.
//
*/
//
bool getAtomsWereReordered() const {
//
return atomsWereReordered;
//
}
//
/**
//
* Set whether atoms were reordered during the most recent force/energy computation.
//
*/
//
void setAtomsWereReordered(bool wereReordered) {
//
atomsWereReordered = wereReordered;
//
}
//
/**
//
* Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
//
* together in the arrays.
//
*
//
* @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
//
*/
//
void reorderAtoms(bool enforcePeriodic);
//
/**
//
* Add a listener that should be called whenever atoms get reordered. The CudaContext
//
* assumes ownership of the object, and deletes it when the context itself is deleted.
//
*/
//
void addReorderListener(ReorderListener* listener);
//
/**
//
* Get the list of ReorderListeners.
//
*/
//
std::vector<ReorderListener*>& getReorderListeners() {
//
return reorderListeners;
//
}
//
/**
//
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
//
* This should be called whenever force field parameters change. It will cause the definitions
//
* and order to be revalidated the next to reorderAtoms() is called.
//
*/
//
void invalidateMolecules();
//
/**
//
* Get whether the current molecule definitions are valid.
//
*/
//
bool getMoleculesAreInvalid() {
//
return moleculesInvalid;
//
}
/**
* Get the thread used by this context for executing parallel computations.
*/
WorkThread
&
getWorkThread
()
{
return
*
thread
;
}
/**
* Get whether atoms were reordered during the most recent force/energy computation.
*/
bool
getAtomsWereReordered
()
const
{
return
atomsWereReordered
;
}
/**
* Set whether atoms were reordered during the most recent force/energy computation.
*/
void
setAtomsWereReordered
(
bool
wereReordered
)
{
atomsWereReordered
=
wereReordered
;
}
/**
* Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
* together in the arrays.
*
* @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
*/
void
reorderAtoms
(
bool
enforcePeriodic
);
/**
* Add a listener that should be called whenever atoms get reordered. The CudaContext
* assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void
addReorderListener
(
ReorderListener
*
listener
);
/**
* Get the list of ReorderListeners.
*/
std
::
vector
<
ReorderListener
*>&
getReorderListeners
()
{
return
reorderListeners
;
}
/**
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
* This should be called whenever force field parameters change. It will cause the definitions
* and order to be revalidated the next to reorderAtoms() is called.
*/
void
invalidateMolecules
();
/**
* Get whether the current molecule definitions are valid.
*/
bool
getMoleculesAreInvalid
()
{
return
moleculesInvalid
;
}
private:
struct
Molecule
;
struct
MoleculeGroup
;
class
VirtualSiteInfo
;
//
void findMoleculeGroups();
//
static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
//
/**
//
* Ensure that all molecules marked as "identical" really are identical. This should be
//
* called whenever force field parameters change. If necessary, it will rebuild the list
//
* of molecules and resort the atoms.
//
*/
//
void validateMolecules();
void
findMoleculeGroups
();
static
void
tagAtomsInMolecule
(
int
atom
,
int
molecule
,
std
::
vector
<
int
>&
atomMolecule
,
std
::
vector
<
std
::
vector
<
int
>
>&
atomBonds
);
/**
* Ensure that all molecules marked as "identical" really are identical. This should be
* called whenever force field parameters change. If necessary, it will rebuild the list
* of molecules and resort the atoms.
*/
void
validateMolecules
();
static
bool
hasInitializedCuda
;
const
System
&
system
;
double
time
;
...
...
@@ -424,8 +431,6 @@ private:
int
paddedNumAtoms
;
int
numAtomBlocks
;
int
numThreadBlocks
;
// int numForceBuffers;
// int simdWidth;
bool
useBlockingSync
,
useDoublePrecision
,
accumulateInDouble
,
contextIsValid
,
atomsWereReordered
,
moleculesInvalid
;
std
::
string
compiler
,
tempDir
,
gpuArchitecture
;
float4
periodicBoxSize
;
...
...
@@ -446,15 +451,15 @@ private:
std
::
vector
<
Molecule
>
molecules
;
std
::
vector
<
MoleculeGroup
>
moleculeGroups
;
std
::
vector
<
int4
>
posCellOffsets
;
void
*
pinnedBuffer
;
CudaArray
*
posq
;
CudaArray
*
velm
;
// CudaArray<mm_float4>* force;
// CudaArray<mm_float4>* forceBuffers;
// CudaArray<cl_long>* longForceBuffer;
// CudaArray<cl_float>* energyBuffer;
// CudaArray<cl_int>* atomIndex;
// std::vector<cl::Memory*> autoclearBuffers;
// std::vector<int> autoclearBufferSizes;
CudaArray
*
force
;
CudaArray
*
energyBuffer
;
CudaArray
*
atomIndexDevice
;
std
::
vector
<
int
>
atomIndex
;
std
::
vector
<
CUdeviceptr
>
autoclearBuffers
;
std
::
vector
<
int
>
autoclearBufferSizes
;
std
::
vector
<
ReorderListener
*>
reorderListeners
;
// CudaIntegrationUtilities* integration;
// CudaBondedUtilities* bonded;
...
...
platforms/cuda2/src/CudaPlatform.cpp
View file @
3e16cab9
...
...
@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
device
<<
contexts
[
i
]
->
getDeviceIndex
();
}
propertyValues
[
CudaPlatform
::
CudaDeviceIndex
()]
=
device
.
str
();
propertyValues
[
CudaPlatform
::
CudaUseBlockingSync
()]
=
blocking
?
"true"
:
"false"
;
propertyValues
[
CudaPlatform
::
CudaPrecision
()]
=
precisionProperty
;
propertyValues
[
CudaPlatform
::
CudaCompiler
()]
=
compilerProperty
;
propertyValues
[
CudaPlatform
::
CudaTempDirectory
()]
=
tempProperty
;
...
...
@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() {
}
void
CudaPlatform
::
PlatformData
::
initializeContexts
(
const
System
&
system
)
{
//
for (int i = 0; i < (int) contexts.size(); i++)
//
contexts[i]->initialize();
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
contexts
[
i
]
->
initialize
();
}
void
CudaPlatform
::
PlatformData
::
syncContexts
()
{
//
for (int i = 0; i < (int) contexts.size(); i++)
//
contexts[i]->getWorkThread().flush();
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
contexts
[
i
]
->
getWorkThread
().
flush
();
}
platforms/cuda2/src/CudaSort.cpp
0 → 100644
View file @
3e16cab9
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaSort.h"
#include "CudaKernelSources.h"
#include <map>
using
namespace
OpenMM
;
using
namespace
std
;
CudaSort
::
CudaSort
(
CudaContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
)
:
context
(
context
),
trait
(
trait
),
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
// Create kernels.
map
<
string
,
string
>
replacements
;
replacements
[
"DATA_TYPE"
]
=
trait
->
getDataType
();
replacements
[
"KEY_TYPE"
]
=
trait
->
getKeyType
();
replacements
[
"SORT_KEY"
]
=
trait
->
getSortKey
();
replacements
[
"MIN_KEY"
]
=
trait
->
getMinKey
();
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
CUmodule
module
=
context
.
createModule
(
context
.
replaceStrings
(
CudaKernelSources
::
sort
,
replacements
));
computeRangeKernel
=
context
.
getKernel
(
module
,
"computeRange"
);
assignElementsKernel
=
context
.
getKernel
(
module
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
context
.
getKernel
(
module
,
"computeBucketPositions"
);
copyToBucketsKernel
=
context
.
getKernel
(
module
,
"copyDataToBuckets"
);
sortBucketsKernel
=
context
.
getKernel
(
module
,
"sortBuckets"
);
// Work out the work group sizes for various kernels.
int
maxBlockSize
;
cuDeviceGetAttribute
(
&
maxBlockSize
,
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X
,
context
.
getDevice
());
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxBlockSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
rangeKernelSize
;
sortKernelSize
=
rangeKernelSize
/
2
;
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
int
maxSharedMem
;
cuDeviceGetAttribute
(
&
maxSharedMem
,
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
,
context
.
getDevice
());
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
if
(
sortKernelSize
>
maxLocalBuffer
)
sortKernelSize
=
maxLocalBuffer
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
unsigned
int
numBuckets
=
length
/
targetBucketSize
;
if
(
numBuckets
<
1
)
numBuckets
=
1
;
if
(
positionsKernelSize
>
numBuckets
)
positionsKernelSize
=
numBuckets
;
// Create workspace arrays.
dataRange
=
new
CudaArray
(
2
,
trait
->
getKeySize
(),
"sortDataRange"
);
bucketOffset
=
CudaArray
::
create
<
uint1
>
(
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
CudaArray
::
create
<
uint1
>
(
length
,
"bucketOfElement"
);
offsetInBucket
=
CudaArray
::
create
<
uint1
>
(
length
,
"offsetInBucket"
);
buckets
=
new
CudaArray
(
length
,
trait
->
getDataSize
(),
"buckets"
);
}
CudaSort
::~
CudaSort
()
{
delete
trait
;
if
(
dataRange
!=
NULL
)
delete
dataRange
;
if
(
bucketOfElement
!=
NULL
)
delete
bucketOfElement
;
if
(
offsetInBucket
!=
NULL
)
delete
offsetInBucket
;
if
(
bucketOffset
!=
NULL
)
delete
bucketOffset
;
if
(
buckets
!=
NULL
)
delete
buckets
;
}
void
CudaSort
::
sort
(
CudaArray
&
data
)
{
if
(
data
.
getSize
()
!=
bucketOfElement
->
getSize
()
||
data
.
getElementSize
()
!=
trait
->
getDataSize
())
throw
OpenMMException
(
"CudaSort called with different data size"
);
if
(
data
.
getSize
()
==
0
)
return
;
// Compute the range of data values.
unsigned
int
dataSize
=
data
.
getSize
();
void
*
rangeArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataSize
,
&
dataRange
->
getDevicePointer
()};
context
.
executeKernel
(
computeRangeKernel
,
rangeArgs
,
rangeKernelSize
,
rangeKernelSize
,
rangeKernelSize
*
trait
->
getKeySize
());
// Assign array elements to buckets.
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
*
bucketOffset
);
void
*
elementsArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
dataSize
,
&
numBuckets
,
&
dataRange
->
getDevicePointer
(),
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
assignElementsKernel
,
elementsArgs
,
data
.
getSize
());
// Compute the position of each bucket.
void
*
computeArgs
[]
=
{
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
computeBucketPositionsKernel
,
computeArgs
,
positionsKernelSize
,
positionsKernelSize
,
positionsKernelSize
*
sizeof
(
int
));
// Copy the data into the buckets.
void
*
copyArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
dataSize
,
&
bucketOffset
->
getDevicePointer
(),
&
bucketOfElement
->
getDevicePointer
(),
&
offsetInBucket
->
getDevicePointer
()};
context
.
executeKernel
(
copyToBucketsKernel
,
copyArgs
,
data
.
getSize
());
// Sort each bucket.
void
*
sortArgs
[]
=
{
&
data
.
getDevicePointer
(),
&
buckets
->
getDevicePointer
(),
&
numBuckets
,
&
bucketOffset
->
getDevicePointer
()};
context
.
executeKernel
(
sortBucketsKernel
,
sortArgs
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
,
sortKernelSize
*
trait
->
getDataSize
());
}
platforms/cuda2/src/CudaSort.h
0 → 100644
View file @
3e16cab9
#ifndef __OPENMM_CUDASORT_H__
#define __OPENMM_CUDASORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "openmm/internal/windowsExport.h"
#include "CudaContext.h"
namespace
OpenMM
{
/**
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The sorting behavior is specified by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* class SortTrait : public CudaSort::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
* const char* getKeyType() const {return "float";}
* const char* getMinKey() const {return "-MAXFLOAT";}
* const char* getMaxKey() const {return "MAXFLOAT";}
* const char* getMaxValue() const {return "MAXFLOAT";}
* const char* getSortKey() const {return "value";}
* };
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in local memory when possible, in global memory otherwise). This is similar to
* the algorithm described in
*
* Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng. "An Efficient
* Sorting Algorithm with CUDA" Journal of the Chinese Institute of Engineers, 32(7),
* pp. 915-921 (2009)
*
* but with many modifications and simplifications. In particular, this algorithm
* involves much less communication between host and device, which is critical to get
* good performance with the array sizes we typically work with (10,000 to 100,000
* elements).
*/
class
OPENMM_EXPORT
CudaSort
{
public:
class
SortTrait
;
/**
* Create a CudaSort object for sorting data of a particular type.
*
* @param context the context in which to perform calculations
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the CudaSort is deleted.
* @param length the length of the arrays this object will be used to sort
*/
CudaSort
(
CudaContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
);
~
CudaSort
();
/**
* Sort an array.
*/
void
sort
(
CudaArray
&
data
);
private:
CudaContext
&
context
;
SortTrait
*
trait
;
CudaArray
*
dataRange
;
CudaArray
*
bucketOfElement
;
CudaArray
*
offsetInBucket
;
CudaArray
*
bucketOffset
;
CudaArray
*
buckets
;
CUfunction
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
};
/**
* A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
*/
class
CudaSort
::
SortTrait
{
public:
/**
* Get the size of each data value in bytes.
*/
virtual
int
getDataSize
()
const
=
0
;
/**
* Get the size of each key value in bytes.
*/
virtual
int
getKeySize
()
const
=
0
;
/**
* Get the data type of the values to sort.
*/
virtual
const
char
*
getDataType
()
const
=
0
;
/**
* Get the data type of the sorting key.
*/
virtual
const
char
*
getKeyType
()
const
=
0
;
/**
* Get the minimum value a key can take.
*/
virtual
const
char
*
getMinKey
()
const
=
0
;
/**
* Get the maximum value a key can take.
*/
virtual
const
char
*
getMaxKey
()
const
=
0
;
/**
* Get a value whose key is guaranteed to equal getMaxKey().
*/
virtual
const
char
*
getMaxValue
()
const
=
0
;
/**
* Get the CUDA code to select the key from the data value.
*/
virtual
const
char
*
getSortKey
()
const
=
0
;
};
}
// namespace OpenMM
#endif // __OPENMM_CUDASORT_H__
platforms/cuda2/src/kernels/sort.cu
0 → 100644
View file @
3e16cab9
__device__
KEY_TYPE
getValue
(
DATA_TYPE
value
)
{
return
SORT_KEY
;
}
extern
"C"
{
/**
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
*/
__global__
void
computeRange
(
const
DATA_TYPE
*
__restrict__
data
,
unsigned
int
length
,
KEY_TYPE
*
__restrict__
range
)
{
extern
__shared__
KEY_TYPE
rangeBuffer
[];
KEY_TYPE
minimum
=
MAX_KEY
;
KEY_TYPE
maximum
=
MIN_KEY
;
// Each thread calculates the range of a subset of values.
for
(
unsigned
int
index
=
threadIdx
.
x
;
index
<
length
;
index
+=
blockDim
.
x
)
{
KEY_TYPE
value
=
getValue
(
data
[
index
]);
minimum
=
min
(
minimum
,
value
);
maximum
=
max
(
maximum
,
value
);
}
// Now reduce them.
rangeBuffer
[
threadIdx
.
x
]
=
minimum
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
rangeBuffer
[
threadIdx
.
x
]
=
min
(
rangeBuffer
[
threadIdx
.
x
],
rangeBuffer
[
threadIdx
.
x
+
step
]);
__syncthreads
();
}
minimum
=
rangeBuffer
[
0
];
rangeBuffer
[
threadIdx
.
x
]
=
maximum
;
__syncthreads
();
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
if
(
threadIdx
.
x
+
step
<
blockDim
.
x
&&
threadIdx
.
x
%
(
2
*
step
)
==
0
)
rangeBuffer
[
threadIdx
.
x
]
=
max
(
rangeBuffer
[
threadIdx
.
x
],
rangeBuffer
[
threadIdx
.
x
+
step
]);
__syncthreads
();
}
maximum
=
rangeBuffer
[
0
];
if
(
threadIdx
.
x
==
0
)
{
range
[
0
]
=
minimum
;
range
[
1
]
=
maximum
;
}
}
/**
* Assign elements to buckets.
*/
__global__
void
assignElementsToBuckets
(
const
DATA_TYPE
*
__restrict__
data
,
unsigned
int
length
,
unsigned
int
numBuckets
,
const
KEY_TYPE
*
__restrict__
range
,
unsigned
int
*
bucketOffset
,
unsigned
int
*
__restrict__
bucketOfElement
,
unsigned
int
*
__restrict__
offsetInBucket
)
{
float
minValue
=
(
float
)
(
range
[
0
]);
float
maxValue
=
(
float
)
(
range
[
1
]);
float
bucketWidth
=
(
maxValue
-
minValue
)
/
numBuckets
;
for
(
unsigned
int
index
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
index
<
length
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
float
key
=
(
float
)
getValue
(
data
[
index
]);
unsigned
int
bucketIndex
=
min
((
unsigned
int
)
((
key
-
minValue
)
/
bucketWidth
),
numBuckets
-
1
);
offsetInBucket
[
index
]
=
atomicAdd
(
&
bucketOffset
[
bucketIndex
],
1
);
bucketOfElement
[
index
]
=
bucketIndex
;
}
}
/**
* Sum the bucket sizes to compute the start position of each bucket. This kernel
* is executed as a single work group.
*/
__global__
void
computeBucketPositions
(
unsigned
int
numBuckets
,
unsigned
int
*
__restrict__
bucketOffset
)
{
extern
__shared__
unsigned
int
posBuffer
[];
unsigned
int
globalOffset
=
0
;
for
(
unsigned
int
startBucket
=
0
;
startBucket
<
numBuckets
;
startBucket
+=
blockDim
.
x
)
{
// Load the bucket sizes into local memory.
unsigned
int
globalIndex
=
startBucket
+
threadIdx
.
x
;
posBuffer
[
threadIdx
.
x
]
=
(
globalIndex
<
numBuckets
?
bucketOffset
[
globalIndex
]
:
0
);
__syncthreads
();
// Perform a parallel prefix sum.
for
(
unsigned
int
step
=
1
;
step
<
blockDim
.
x
;
step
*=
2
)
{
unsigned
int
add
=
(
threadIdx
.
x
>=
step
?
posBuffer
[
threadIdx
.
x
-
step
]
:
0
);
__syncthreads
();
posBuffer
[
threadIdx
.
x
]
+=
add
;
__syncthreads
();
}
// Write the results back to global memory.
if
(
globalIndex
<
numBuckets
)
bucketOffset
[
globalIndex
]
=
posBuffer
[
threadIdx
.
x
]
+
globalOffset
;
globalOffset
+=
posBuffer
[
blockDim
.
x
-
1
];
}
}
/**
* Copy the input data into the buckets for sorting.
*/
__global__
void
copyDataToBuckets
(
const
DATA_TYPE
*
__restrict__
data
,
DATA_TYPE
*
__restrict__
buckets
,
unsigned
int
length
,
const
unsigned
int
*
__restrict__
bucketOffset
,
const
unsigned
int
*
__restrict__
bucketOfElement
,
const
unsigned
int
*
__restrict__
offsetInBucket
)
{
for
(
unsigned
int
index
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
index
<
length
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
DATA_TYPE
element
=
data
[
index
];
unsigned
int
bucketIndex
=
bucketOfElement
[
index
];
unsigned
int
offset
=
(
bucketIndex
==
0
?
0
:
bucketOffset
[
bucketIndex
-
1
]);
buckets
[
offset
+
offsetInBucket
[
index
]]
=
element
;
}
}
/**
* Sort the data in each bucket.
*/
__global__
void
sortBuckets
(
DATA_TYPE
*
__restrict__
data
,
const
DATA_TYPE
*
__restrict__
buckets
,
unsigned
int
numBuckets
,
const
unsigned
int
*
__restrict__
bucketOffset
)
{
extern
__shared__
DATA_TYPE
dataBuffer
[];
for
(
unsigned
int
index
=
blockIdx
.
x
;
index
<
numBuckets
;
index
+=
gridDim
.
x
)
{
unsigned
int
startIndex
=
(
index
==
0
?
0
:
bucketOffset
[
index
-
1
]);
unsigned
int
endIndex
=
bucketOffset
[
index
];
unsigned
int
length
=
endIndex
-
startIndex
;
if
(
length
<=
blockDim
.
x
)
{
// Load the data into local memory.
if
(
threadIdx
.
x
<
length
)
dataBuffer
[
threadIdx
.
x
]
=
buckets
[
startIndex
+
threadIdx
.
x
];
else
dataBuffer
[
threadIdx
.
x
]
=
MAX_VALUE
;
__syncthreads
();
// Perform a bitonic sort in local memory.
for
(
unsigned
int
k
=
2
;
k
<=
blockDim
.
x
;
k
*=
2
)
{
for
(
unsigned
int
j
=
k
/
2
;
j
>
0
;
j
/=
2
)
{
int
ixj
=
threadIdx
.
x
^
j
;
if
(
ixj
>
threadIdx
.
x
)
{
DATA_TYPE
value1
=
dataBuffer
[
threadIdx
.
x
];
DATA_TYPE
value2
=
dataBuffer
[
ixj
];
bool
ascending
=
(
threadIdx
.
x
&
k
)
==
0
;
KEY_TYPE
lowKey
=
(
ascending
?
getValue
(
value1
)
:
getValue
(
value2
));
KEY_TYPE
highKey
=
(
ascending
?
getValue
(
value2
)
:
getValue
(
value1
));
if
(
lowKey
>
highKey
)
{
dataBuffer
[
threadIdx
.
x
]
=
value2
;
dataBuffer
[
ixj
]
=
value1
;
}
}
__syncthreads
();
}
}
// Write the data to the sorted array.
if
(
threadIdx
.
x
<
length
)
data
[
startIndex
+
threadIdx
.
x
]
=
dataBuffer
[
threadIdx
.
x
];
}
else
{
// Copy the bucket data over to the output array.
for
(
unsigned
int
i
=
threadIdx
.
x
;
i
<
length
;
i
+=
blockDim
.
x
)
data
[
startIndex
+
i
]
=
buckets
[
startIndex
+
i
];
__threadfence_block
();
__syncthreads
();
// Perform a bitonic sort in global memory.
for
(
unsigned
int
k
=
2
;
k
<
2
*
length
;
k
*=
2
)
{
for
(
unsigned
int
j
=
k
/
2
;
j
>
0
;
j
/=
2
)
{
for
(
unsigned
int
i
=
threadIdx
.
x
;
i
<
length
;
i
+=
blockDim
.
x
)
{
int
ixj
=
i
^
j
;
if
(
ixj
>
i
&&
ixj
<
length
)
{
DATA_TYPE
value1
=
data
[
startIndex
+
i
];
DATA_TYPE
value2
=
data
[
startIndex
+
ixj
];
bool
ascending
=
((
i
&
k
)
==
0
);
for
(
unsigned
int
mask
=
k
*
2
;
mask
<
2
*
length
;
mask
*=
2
)
ascending
=
((
i
&
mask
)
==
0
?
!
ascending
:
ascending
);
KEY_TYPE
lowKey
=
(
ascending
?
getValue
(
value1
)
:
getValue
(
value2
));
KEY_TYPE
highKey
=
(
ascending
?
getValue
(
value2
)
:
getValue
(
value1
));
if
(
lowKey
>
highKey
)
{
data
[
startIndex
+
i
]
=
value2
;
data
[
startIndex
+
ixj
]
=
value1
;
}
}
}
__threadfence_block
();
__syncthreads
();
}
}
}
}
}
}
\ No newline at end of file
platforms/cuda2/src/kernels/utilities.cu
View file @
3e16cab9
extern
"C"
{
/**
* This is called by the various functions below to clear a buffer.
*/
...
...
@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest
buffer
[
index
]
=
sum
;
}
}
}
\ No newline at end of file
platforms/cuda2/tests/TestCudaSort.cpp
0 → 100644
View file @
3e16cab9
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of sorting.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "../src/CudaArray.h"
#include "../src/CudaContext.h"
#include "../src/CudaSort.h"
#include "sfmt/SFMT.h"
#include "openmm/System.h"
#include <iostream>
#include <cmath>
#include <set>
using
namespace
OpenMM
;
using
namespace
std
;
class
SortTrait
:
public
CudaSort
::
SortTrait
{
int
getDataSize
()
const
{
return
4
;}
int
getKeySize
()
const
{
return
4
;}
const
char
*
getDataType
()
const
{
return
"float"
;}
const
char
*
getKeyType
()
const
{
return
"float"
;}
const
char
*
getMinKey
()
const
{
return
"-MAXFLOAT"
;}
const
char
*
getMaxKey
()
const
{
return
"MAXFLOAT"
;}
const
char
*
getMaxValue
()
const
{
return
"MAXFLOAT"
;}
const
char
*
getSortKey
()
const
{
return
"value"
;}
};
void
verifySorting
(
vector
<
float
>
array
)
{
// Sort the array.
System
system
;
system
.
addParticle
(
0.0
);
CudaPlatform
platform
;
CudaPlatform
::
PlatformData
platformData
(
system
,
""
,
"true"
,
"single"
,
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaCompiler
()),
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaTempDirectory
()));
CudaContext
&
context
=
*
platformData
.
contexts
[
0
];
context
.
initialize
();
CudaArray
data
(
array
.
size
(),
4
,
"sortData"
);
data
.
upload
(
array
);
CudaSort
sort
(
context
,
new
SortTrait
(),
array
.
size
());
sort
.
sort
(
data
);
vector
<
float
>
sorted
;
data
.
download
(
sorted
);
// Verify that it is in sorted order.
for
(
int
i
=
1
;
i
<
(
int
)
sorted
.
size
();
i
++
)
ASSERT
(
sorted
[
i
-
1
]
<=
sorted
[
i
]);
// Make sure the sorted array contains the same values as the original one.
multiset
<
float
>
elements1
(
array
.
begin
(),
array
.
end
());
multiset
<
float
>
elements2
(
sorted
.
begin
(),
sorted
.
end
());
ASSERT
(
elements1
==
elements2
);
}
void
testUniformValues
()
{
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
vector
<
float
>
array
(
10000
);
for
(
int
i
=
0
;
i
<
(
int
)
array
.
size
();
i
++
)
array
[
i
]
=
(
float
)
genrand_real2
(
sfmt
);
verifySorting
(
array
);
}
void
testLogValues
()
{
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
vector
<
float
>
array
(
10000
);
for
(
int
i
=
0
;
i
<
(
int
)
array
.
size
();
i
++
)
array
[
i
]
=
(
float
)
log
(
genrand_real2
(
sfmt
));
verifySorting
(
array
);
}
int
main
()
{
try
{
testUniformValues
();
testLogValues
();
}
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
return
1
;
}
cout
<<
"Done"
<<
endl
;
return
0
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment