Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
3e16cab9
Commit
3e16cab9
authored
Jun 05, 2012
by
Peter Eastman
Browse files
Continuing to implement new CUDA platform
parent
abb8cb4b
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
1158 additions
and
509 deletions
+1158
-509
platforms/cuda2/src/CudaArray.h
platforms/cuda2/src/CudaArray.h
+1
-1
platforms/cuda2/src/CudaContext.cpp
platforms/cuda2/src/CudaContext.cpp
+367
-309
platforms/cuda2/src/CudaContext.h
platforms/cuda2/src/CudaContext.h
+200
-195
platforms/cuda2/src/CudaPlatform.cpp
platforms/cuda2/src/CudaPlatform.cpp
+5
-4
platforms/cuda2/src/CudaSort.cpp
platforms/cuda2/src/CudaSort.cpp
+132
-0
platforms/cuda2/src/CudaSort.h
platforms/cuda2/src/CudaSort.h
+141
-0
platforms/cuda2/src/kernels/sort.cu
platforms/cuda2/src/kernels/sort.cu
+186
-0
platforms/cuda2/src/kernels/utilities.cu
platforms/cuda2/src/kernels/utilities.cu
+4
-0
platforms/cuda2/tests/TestCudaSort.cpp
platforms/cuda2/tests/TestCudaSort.cpp
+122
-0
No files found.
platforms/cuda2/src/CudaArray.h
View file @
3e16cab9
...
...
@@ -83,7 +83,7 @@ public:
/**
* Get a pointer to the device memory.
*/
CUdeviceptr
getDevicePointer
()
{
CUdeviceptr
&
getDevicePointer
()
{
return
pointer
;
}
/**
...
...
platforms/cuda2/src/CudaContext.cpp
View file @
3e16cab9
...
...
@@ -31,7 +31,6 @@
#include "CudaContext.h"
#include "CudaArray.h"
//#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaForceInfo.h"
//#include "CudaIntegrationUtilities.h"
#include "CudaKernelSources.h"
...
...
@@ -53,7 +52,7 @@
#define CHECK_RESULT2(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<prefix<<": "<<result<<" ("<<__FILE__<<":
"<<__LINE__
<<")"
; \
m<<prefix<<": "<<
getErrorString(
result
)
<<" (
"<<result<<")"<<" at
"<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
...
...
@@ -66,7 +65,7 @@ bool CudaContext::hasInitializedCuda = false;
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
const
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
)
:
system
(
system
),
compiler
(
compiler
),
time
(
0.0
),
platformData
(
platformData
),
stepCount
(
0
),
computeForceCount
(
0
),
contextIsValid
(
false
),
atomsWereReordered
(
false
),
posq
(
NULL
),
time
(
0.0
),
platformData
(
platformData
),
stepCount
(
0
),
computeForceCount
(
0
),
contextIsValid
(
false
),
atomsWereReordered
(
false
),
pinnedBuffer
(
NULL
),
posq
(
NULL
),
velm
(
NULL
),
/*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL),*/
thread
(
NULL
)
{
if
(
!
hasInitializedCuda
)
{
...
...
@@ -88,7 +87,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
else
throw
OpenMMException
(
"Illegal value for CudaPrecision: "
+
precision
);
#ifdef WIN32
this
->
tempDir
=
tempDir
+
"
\"
;
this
->
tempDir
=
tempDir
+
"
\
\
"
;
#else
this
->
tempDir
=
tempDir
+
"/"
;
#endif
...
...
@@ -114,6 +113,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
deviceIndex
=
i
;
bestSpeed
=
speed
;
bestCompute
=
major
;
gpuArchitecture
=
intToString
(
major
)
+
intToString
(
minor
);
}
}
}
...
...
@@ -121,37 +121,47 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
throw
OpenMMException
(
"No compatible CUDA device is available"
);
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
deviceIndex
));
this
->
deviceIndex
=
deviceIndex
;
int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
gpuArchitecture = CudaExpressionUtilities::intToString(major)+CudaExpressionUtilities::intToString(minor);
compilationDefines["
WORK_GROUP_SIZE
"] = CudaExpressionUtilities::intToString(ThreadBlockSize);
compilationDefines
[
"WORK_GROUP_SIZE"
]
=
intToString
(
ThreadBlockSize
);
defaultOptimizationOptions
=
"--use_fast_math"
;
int numThreadBlocksPerComputeUnit = 6;
CHECK_RESULT(cuCtxCreate(&context, 0, device));
unsigned
int
flags
=
CU_CTX_MAP_HOST
;
if
(
useBlockingSync
)
flags
+=
CU_CTX_SCHED_BLOCKING_SYNC
;
else
flags
+=
CU_CTX_SCHED_SPIN
;
CHECK_RESULT
(
cuCtxCreate
(
&
context
,
flags
,
device
));
contextIsValid
=
true
;
numAtoms
=
system
.
getNumParticles
();
paddedNumAtoms
=
TileSize
*
((
numAtoms
+
TileSize
-
1
)
/
TileSize
);
numAtomBlocks
=
(
paddedNumAtoms
+
(
TileSize
-
1
))
/
TileSize
;
int
multiprocessors
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
device
));
int
numThreadBlocksPerComputeUnit
=
6
;
numThreadBlocks
=
numThreadBlocksPerComputeUnit
*
multiprocessors
;
// bonded = new CudaBondedUtilities(*this);
// nonbonded = new CudaNonbondedUtilities(*this);
posq = CudaArray::create<float4>(paddedNumAtoms, "
posq
");
velm = CudaArray::create<float4>(paddedNumAtoms, "
velm
");
if
(
useDoublePrecision
)
{
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
paddedNumAtoms
*
sizeof
(
double4
),
0
));
posq
=
CudaArray
::
create
<
double4
>
(
paddedNumAtoms
,
"posq"
);
velm
=
CudaArray
::
create
<
double4
>
(
paddedNumAtoms
,
"velm"
);
}
else
{
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
paddedNumAtoms
*
sizeof
(
float4
),
0
));
posq
=
CudaArray
::
create
<
float4
>
(
paddedNumAtoms
,
"posq"
);
velm
=
CudaArray
::
create
<
float4
>
(
paddedNumAtoms
,
"velm"
);
}
posCellOffsets
.
resize
(
paddedNumAtoms
,
make_int4
(
0
,
0
,
0
,
0
));
// Create utility kernels that are used in multiple places.
CUmodule
utilities
=
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
utilities
);
cuModuleGetFunction(&
clearBufferKernel
,
utilities, "
clearBuffer
");
cuModuleGetFunction(&
clearTwoBuffersKernel
,
utilities, "
clearTwoBuffers
");
cuModuleGetFunction(&
clearThreeBuffersKernel
,
utilities, "
clearThreeBuffers
");
cuModuleGetFunction(&
clearFourBuffersKernel
,
utilities, "
clearFourBuffers
");
cuModuleGetFunction(&
clearFiveBuffersKernel
,
utilities, "
clearFiveBuffers
");
cuModuleGetFunction(&
clearSixBuffersKernel
,
utilities, "
clearSixBuffers
");
cuModuleGetFunction(&
reduceFloat4Kernel
,
utilities, "
reduceFloat4Buffer
");
cuModuleGetFunction(&
reduceForcesKernel
,
utilities, "
reduceForces
");
clearBufferKernel
=
getKernel
(
utilities
,
"clearBuffer"
);
clearTwoBuffersKernel
=
getKernel
(
utilities
,
"clearTwoBuffers"
);
clearThreeBuffersKernel
=
getKernel
(
utilities
,
"clearThreeBuffers"
);
clearFourBuffersKernel
=
getKernel
(
utilities
,
"clearFourBuffers"
);
clearFiveBuffersKernel
=
getKernel
(
utilities
,
"clearFiveBuffers"
);
clearSixBuffersKernel
=
getKernel
(
utilities
,
"clearSixBuffers"
);
reduceFloat4Kernel
=
getKernel
(
utilities
,
"reduceFloat4Buffer"
);
reduceForcesKernel
=
getKernel
(
utilities
,
"reduceForces"
);
// Set defines based on the requested precision.
...
...
@@ -175,6 +185,8 @@ CudaContext::~CudaContext() {
delete
forces
[
i
];
for
(
int
i
=
0
;
i
<
(
int
)
reorderListeners
.
size
();
i
++
)
delete
reorderListeners
[
i
];
if
(
pinnedBuffer
!=
NULL
)
cuMemFreeHost
(
pinnedBuffer
);
if
(
posq
!=
NULL
)
delete
posq
;
if
(
velm
!=
NULL
)
...
...
@@ -202,38 +214,29 @@ CudaContext::~CudaContext() {
CHECK_RESULT
(
cuCtxDestroy
(
context
));
}
//void CudaContext::initialize() {
// for (int i = 0; i < numAtoms; i++) {
// double mass = system.getParticleMass(i);
// (*velm)[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
// }
// velm->upload();
void
CudaContext
::
initialize
()
{
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
double
mass
=
system
.
getParticleMass
(
i
);
if
(
useDoublePrecision
)
((
double4
*
)
pinnedBuffer
)[
i
]
=
make_double4
(
0.0
,
0.0
,
0.0
,
mass
==
0.0
?
0.0
:
1.0
/
mass
);
else
((
float4
*
)
pinnedBuffer
)[
i
]
=
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
mass
==
0.0
?
0.0
f
:
(
float
)
(
1.0
/
mass
));
}
velm
->
upload
(
pinnedBuffer
);
// bonded->initialize(system);
// numForceBuffers = platformData.contexts.size();
// numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
// for (int i = 0; i < (int) forces.size(); i++)
// numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
// forceBuffers = new CudaArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "
forceBuffers
", false);
// if (supports64BitGlobalAtomics) {
// longForceBuffer = new CudaArray<cl_long>(*this, 3*paddedNumAtoms, "
longForceBuffer
", false);
// reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
// reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
// reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
// reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
// addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
// }
// addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
// force = new CudaArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "
force
", true);
// energyBuffer = new CudaArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "
energyBuffer
", true);
// addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
// atomIndex = new CudaArray<cl_int>(*this, paddedNumAtoms, "
atomIndex
", true);
// for (int i = 0; i < paddedNumAtoms; ++i)
// (*atomIndex)[i] = i;
// atomIndex->upload();
// findMoleculeGroups();
// moleculesInvalid = false;
force
=
CudaArray
::
create
<
long3
>
(
paddedNumAtoms
,
"force"
);
addAutoclearBuffer
(
force
->
getDevicePointer
(),
force
->
getSize
()
*
6
);
energyBuffer
=
CudaArray
::
create
<
float
>
(
numThreadBlocks
*
ThreadBlockSize
,
"energyBuffer"
);
addAutoclearBuffer
(
energyBuffer
->
getDevicePointer
(),
energyBuffer
->
getSize
());
atomIndexDevice
=
CudaArray
::
create
<
int
>
(
paddedNumAtoms
,
"atomIndex"
);
atomIndex
.
resize
(
paddedNumAtoms
);
for
(
int
i
=
0
;
i
<
paddedNumAtoms
;
++
i
)
atomIndex
[
i
]
=
i
;
atomIndexDevice
->
upload
(
atomIndex
);
findMoleculeGroups
();
moleculesInvalid
=
false
;
// nonbonded->initialize(system);
//
}
}
void
CudaContext
::
addForce
(
CudaForceInfo
*
force
)
{
forces
.
push_back
(
force
);
...
...
@@ -315,7 +318,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUresult
result
=
cuModuleLoad
(
&
module
,
outputFile
.
c_str
());
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
m
;
m
<<
"Error loading CUDA module: "
<<
result
;
m
<<
"Error loading CUDA module: "
<<
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
throw
OpenMMException
(
m
.
str
());
}
remove
(
inputFile
.
c_str
());
...
...
@@ -329,52 +332,109 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
remove
(
logFile
.
c_str
());
throw
;
}
//
// // Get length before using c_str() to avoid length() call invalidating the c_str() value.
// string src_string = src.str();
// ::size_t src_length = src_string.length();
// cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
// cl::Program program(context, sources);
// try {
// program.build(vector<cl::Device>(1, device), options.c_str());
// } catch (cl::Error err) {
// throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
// }
}
//
//void CudaContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
// if (blockSize == -1)
// blockSize = ThreadBlockSize;
// int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
// try {
// queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
// }
// catch (cl::Error err) {
// stringstream str;
// str<<"Error invoking kernel "<<kernel.getInfo<CL_KERNEL_FUNCTION_NAME>()<<": "<<err.what()<<" ("<<err.err()<<")";
// throw OpenMMException(str.str());
// }
//}
//
//void CudaContext::clearBuffer(CudaArray<float>& array) {
// clearBuffer(array.getDeviceBuffer(), array.getSize());
//}
//
//void CudaContext::clearBuffer(CudaArray<mm_float4>& array) {
// clearBuffer(array.getDeviceBuffer(), array.getSize()*4);
//}
//
//void CudaContext::clearBuffer(cl::Memory& memory, int size) {
// clearBufferKernel.setArg<cl::Memory>(0, memory);
// clearBufferKernel.setArg<cl_int>(1, size);
// executeKernel(clearBufferKernel, size, 128);
//}
//
//void CudaContext::addAutoclearBuffer(cl::Memory& memory, int size) {
// autoclearBuffers.push_back(&memory);
// autoclearBufferSizes.push_back(size);
//}
//
/**
 * Look up a kernel function by name in a loaded module.
 *
 * @param module  the module to search
 * @param name    the name of the kernel function
 * @return the handle reported by cuModuleGetFunction()
 * @throws OpenMMException if cuModuleGetFunction() does not return CUDA_SUCCESS;
 *         the message contains the kernel name, the symbolic error name and the
 *         numeric error code
 */
CUfunction CudaContext::getKernel(CUmodule& module, const string& name) {
    CUfunction kernel;
    const CUresult status = cuModuleGetFunction(&kernel, module, name.c_str());
    if (status == CUDA_SUCCESS)
        return kernel;

    // The lookup failed: report which kernel was requested and why.
    std::stringstream message;
    message << "Error creating kernel " << name << ": " << getErrorString(status) << " (" << status << ")";
    throw OpenMMException(message.str());
}
/**
 * Convert a floating point value to text in scientific notation.
 *
 * The stream precision is 16 when useDoublePrecision is set and 8 otherwise.
 * When useDoublePrecision is not set, an "f" suffix is appended to the number.
 *
 * @param value  the number to format
 * @return the formatted text
 */
string CudaContext::doubleToString(double value) {
    stringstream text;
    const int digits = (useDoublePrecision ? 16 : 8);
    text.precision(digits);
    text << scientific << value;
    if (useDoublePrecision)
        return text.str();
    text << "f";
    return text.str();
}
/**
 * Convert an integer to its text representation using default stream formatting.
 *
 * @param value  the number to format
 * @return the formatted text
 */
string CudaContext::intToString(int value) {
    stringstream digits;
    digits << value;
    return digits.str();
}
/**
 * Translate a CUDA driver result code into the name of its enumerator.
 *
 * @param result  the result code to translate
 * @return the enumerator name (for example "CUDA_ERROR_OUT_OF_MEMORY"), or
 *         "Invalid error code" if the value is not one of the codes listed below
 */
std::string CudaContext::getErrorString(CUresult result) {
    // Each entry expands to "case X: return "X";".  The # operator stringizes the
    // argument exactly as spelled, so the returned text always matches the name
    // written in the list.
#define OPENMM_CUDA_RESULT_NAME(code) case code: return #code;
    switch (result) {
        OPENMM_CUDA_RESULT_NAME(CUDA_SUCCESS)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_INVALID_VALUE)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_OUT_OF_MEMORY)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_NOT_INITIALIZED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_DEINITIALIZED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_PROFILER_DISABLED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_PROFILER_NOT_INITIALIZED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_PROFILER_ALREADY_STARTED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_PROFILER_ALREADY_STOPPED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_NO_DEVICE)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_INVALID_DEVICE)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_INVALID_IMAGE)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_INVALID_CONTEXT)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_CONTEXT_ALREADY_CURRENT)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_MAP_FAILED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_UNMAP_FAILED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_ARRAY_IS_MAPPED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_ALREADY_MAPPED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_NO_BINARY_FOR_GPU)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_ALREADY_ACQUIRED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_NOT_MAPPED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_NOT_MAPPED_AS_ARRAY)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_NOT_MAPPED_AS_POINTER)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_ECC_UNCORRECTABLE)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_UNSUPPORTED_LIMIT)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_CONTEXT_ALREADY_IN_USE)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_INVALID_SOURCE)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_FILE_NOT_FOUND)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_OPERATING_SYSTEM)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_INVALID_HANDLE)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_NOT_FOUND)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_NOT_READY)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_LAUNCH_FAILED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_LAUNCH_TIMEOUT)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_PEER_ACCESS_NOT_ENABLED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_CONTEXT_IS_DESTROYED)
        OPENMM_CUDA_RESULT_NAME(CUDA_ERROR_UNKNOWN)
    }
#undef OPENMM_CUDA_RESULT_NAME
    return "Invalid error code";
}
/**
 * Launch a kernel on a one dimensional grid.
 *
 * The number of blocks is the number needed to cover the requested threads,
 * capped at numThreadBlocks.
 *
 * @param kernel      the kernel to launch
 * @param arguments   array of pointers to the kernel arguments, passed through to cuLaunchKernel()
 * @param threads     the number of threads requested
 * @param blockSize   the number of threads per block, or -1 to use ThreadBlockSize
 * @param sharedSize  the dynamic shared memory size passed to cuLaunchKernel()
 * @throws OpenMMException if cuLaunchKernel() does not return CUDA_SUCCESS
 */
void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
    int threadsPerBlock = blockSize;
    if (threadsPerBlock == -1)
        threadsPerBlock = ThreadBlockSize;

    // Round up so a partially filled final block is still counted, then cap the grid.
    const int blocksNeeded = (threads+threadsPerBlock-1)/threadsPerBlock;
    const int gridSize = std::min(blocksNeeded, numThreadBlocks);

    const CUresult status = cuLaunchKernel(kernel, gridSize, 1, 1, threadsPerBlock, 1, 1, sharedSize, 0, arguments, NULL);
    if (status == CUDA_SUCCESS)
        return;

    stringstream message;
    message << "Error invoking kernel: " << getErrorString(status) << " (" << status << ")";
    throw OpenMMException(message.str());
}
/**
 * Clear an entire array by forwarding to clearBuffer(CUdeviceptr, int).
 *
 * The size passed on is getSize()*getElementSize()/4.
 * NOTE(review): presumably getElementSize() is in bytes, which would make this a
 * count of 4 byte words - confirm against CudaArray.
 *
 * @param array  the array to clear
 */
void CudaContext::clearBuffer(CudaArray& array) {
    const int numWords = array.getSize()*array.getElementSize()/4;
    clearBuffer(array.getDevicePointer(), numWords);
}
/**
 * Run clearBufferKernel on a region of device memory.
 *
 * @param memory  the device pointer passed to the kernel as its first argument
 * @param size    the size passed to the kernel as its second argument, and also
 *                used as the number of threads requested
 */
void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
    // Kernel arguments are passed as an array of pointers to the argument values.
    void* kernelArgs[2];
    kernelArgs[0] = &memory;
    kernelArgs[1] = &size;
    executeKernel(clearBufferKernel, kernelArgs, size, 128);
}
/**
 * Register a buffer in the autoclear lists.
 *
 * The pointer and its size are appended to autoclearBuffers and
 * autoclearBufferSizes respectively, so the two lists stay index aligned.
 *
 * @param memory  the device pointer to record
 * @param size    the size to record for that buffer
 */
void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
    autoclearBuffers.push_back(memory);
    autoclearBufferSizes.push_back(size);
}
//void CudaContext::clearAutoclearBuffers() {
// int base = 0;
// int total = autoclearBufferSizes.size();
...
...
@@ -454,219 +514,217 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
// executeKernel(reduceFloat4Kernel, bufferSize, 128);
//}
//
//void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
// // Recursively tag atoms as belonging to a particular molecule.
//
// atomMolecule[atom] = molecule;
// for (int i = 0; i < (int) atomBonds[atom].size(); i++)
// if (atomMolecule[atomBonds[atom][i]] == -1)
// tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
//}
//
///**
// * This class ensures that atom reordering doesn't break virtual sites.
// */
//class CudaContext::VirtualSiteInfo : public CudaForceInfo {
//public:
// VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
// for (int i = 0; i < system.getNumParticles(); i++) {
// if (system.isVirtualSite(i)) {
// siteTypes.push_back(&typeid(system.getVirtualSite(i)));
// vector<int> particles;
// particles.push_back(i);
// for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
// particles.push_back(system.getVirtualSite(i).getParticle(j));
// siteParticles.push_back(particles);
// vector<double> weights;
// if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A two particle average.
//
// const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1));
// }
// else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A three particle average.
//
// const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1));
// weights.push_back(site.getWeight(2));
// }
// else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
// // An out of plane site.
//
// const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight12());
// weights.push_back(site.getWeight13());
// weights.push_back(site.getWeightCross());
// }
// siteWeights.push_back(weights);
// }
// }
// }
// int getNumParticleGroups() {
// return siteTypes.size();
// }
// void getParticlesInGroup(int index, std::vector<int>& particles) {
// particles = siteParticles[index];
// }
// bool areGroupsIdentical(int group1, int group2) {
// if (siteTypes[group1] != siteTypes[group2])
// return false;
// int numParticles = siteWeights[group1].size();
// if (siteWeights[group2].size() != numParticles)
// return false;
// for (int i = 0; i < numParticles; i++)
// if (siteWeights[group1][i] != siteWeights[group2][i])
// return false;
// return true;
// }
//private:
// vector<const type_info*> siteTypes;
// vector<vector<int> > siteParticles;
// vector<vector<double> > siteWeights;
//};
//
//
//void CudaContext::findMoleculeGroups() {
// // The first time this is called, we need to identify all the molecules in the system.
//
// if (moleculeGroups.size() == 0) {
// // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
//
// addForce(new VirtualSiteInfo(system));
//
// // First make a list of every other atom to which each atom is connect by a constraint or force group.
//
// vector<vector<int> > atomBonds(system.getNumParticles());
// for (int i = 0; i < system.getNumConstraints(); i++) {
// int particle1, particle2;
// double distance;
// system.getConstraintParameters(i, particle1, particle2, distance);
// atomBonds[particle1].push_back(particle2);
// atomBonds[particle2].push_back(particle1);
// }
// for (int i = 0; i < (int) forces.size(); i++) {
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// vector<int> particles;
// forces[i]->getParticlesInGroup(j, particles);
// for (int k = 0; k < (int) particles.size(); k++)
// for (int m = 0; m < (int) particles.size(); m++)
// if (k != m)
// atomBonds[particles[k]].push_back(particles[m]);
// }
// }
//
// // Now tag atoms by which molecule they belong to.
//
// vector<int> atomMolecule(numAtoms, -1);
// int numMolecules = 0;
// for (int i = 0; i < numAtoms; i++)
// if (atomMolecule[i] == -1)
// tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
// vector<vector<int> > atomIndices(numMolecules);
// for (int i = 0; i < numAtoms; i++)
// atomIndices[atomMolecule[i]].push_back(i);
//
// // Construct a description of each molecule.
//
// molecules.resize(numMolecules);
// for (int i = 0; i < numMolecules; i++) {
// molecules[i].atoms = atomIndices[i];
// molecules[i].groups.resize(forces.size());
// }
// for (int i = 0; i < system.getNumConstraints(); i++) {
// int particle1, particle2;
// double distance;
// system.getConstraintParameters(i, particle1, particle2, distance);
// molecules[atomMolecule[particle1]].constraints.push_back(i);
// }
// for (int i = 0; i < (int) forces.size(); i++)
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// vector<int> particles;
// forces[i]->getParticlesInGroup(j, particles);
// molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
// }
// }
//
// // Sort them into groups of identical molecules.
//
// vector<Molecule> uniqueMolecules;
// vector<vector<int> > moleculeInstances;
// vector<vector<int> > moleculeOffsets;
// for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
// Molecule& mol = molecules[molIndex];
//
// // See if it is identical to another molecule.
//
// bool isNew = true;
// for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
// Molecule& mol2 = uniqueMolecules[j];
// bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
//
// // See if the atoms are identical.
//
// int atomOffset = mol2.atoms[0]-mol.atoms[0];
// for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
// if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
// identical = false;
// for (int k = 0; k < (int) forces.size(); k++)
// if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
// identical = false;
// }
//
// // See if the constraints are identical.
//
// for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
// int c1particle1, c1particle2, c2particle1, c2particle2;
// double distance1, distance2;
// system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
// system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
// if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
// identical = false;
// }
//
// // See if the force groups are identical.
//
// for (int i = 0; i < (int) forces.size() && identical; i++) {
// if (mol.groups[i].size() != mol2.groups[i].size())
// identical = false;
// for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
// if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
// identical = false;
// }
// if (identical) {
// moleculeInstances[j].push_back(molIndex);
// moleculeOffsets[j].push_back(mol.atoms[0]);
// isNew = false;
// }
// }
// if (isNew) {
// uniqueMolecules.push_back(mol);
// moleculeInstances.push_back(vector<int>());
// moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
// moleculeOffsets.push_back(vector<int>());
// moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
// }
// }
// moleculeGroups.resize(moleculeInstances.size());
// for (int i = 0; i < (int) moleculeInstances.size(); i++)
// {
// moleculeGroups[i].instances = moleculeInstances[i];
// moleculeGroups[i].offsets = moleculeOffsets[i];
// vector<int>& atoms = uniqueMolecules[i].atoms;
// moleculeGroups[i].atoms.resize(atoms.size());
// for (int j = 0; j < (int) atoms.size(); j++)
// moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
// }
//}
//
//void CudaContext::invalidateMolecules() {
// moleculesInvalid = true;
//}
//
//
void
CudaContext
::
tagAtomsInMolecule
(
int
atom
,
int
molecule
,
vector
<
int
>&
atomMolecule
,
vector
<
vector
<
int
>
>&
atomBonds
)
{
// Recursively tag atoms as belonging to a particular molecule.
atomMolecule
[
atom
]
=
molecule
;
for
(
int
i
=
0
;
i
<
(
int
)
atomBonds
[
atom
].
size
();
i
++
)
if
(
atomMolecule
[
atomBonds
[
atom
][
i
]]
==
-
1
)
tagAtomsInMolecule
(
atomBonds
[
atom
][
i
],
molecule
,
atomMolecule
,
atomBonds
);
}
/**
 * This class ensures that atom reordering doesn't break virtual sites.
 *
 * For every particle the System reports as a virtual site it records the dynamic
 * type of the site, the particles involved (the site itself followed by the
 * particles it reports through getParticle()), and the site's weights.  Each
 * virtual site is exposed as one particle group.
 */
class CudaContext::VirtualSiteInfo : public CudaForceInfo {
public:
    /**
     * Collect the virtual site information from a System.
     *
     * @param system  the System whose virtual sites should be recorded
     */
    VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
        for (int i = 0; i < system.getNumParticles(); i++) {
            if (!system.isVirtualSite(i))
                continue;
            siteTypes.push_back(&typeid(system.getVirtualSite(i)));

            // The group is the virtual site followed by the particles it reports.
            vector<int> particles;
            particles.push_back(i);
            for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
                particles.push_back(system.getVirtualSite(i).getParticle(j));
            siteParticles.push_back(particles);

            // Record the weights of the recognized site types.  Any other type
            // of site gets an empty weight list.
            vector<double> weights;
            if (const TwoParticleAverageSite* twoParticle = dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i))) {
                // A two particle average.

                weights.push_back(twoParticle->getWeight(0));
                weights.push_back(twoParticle->getWeight(1));
            }
            else if (const ThreeParticleAverageSite* threeParticle = dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i))) {
                // A three particle average.

                weights.push_back(threeParticle->getWeight(0));
                weights.push_back(threeParticle->getWeight(1));
                weights.push_back(threeParticle->getWeight(2));
            }
            else if (const OutOfPlaneSite* outOfPlane = dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i))) {
                // An out of plane site.

                weights.push_back(outOfPlane->getWeight12());
                weights.push_back(outOfPlane->getWeight13());
                weights.push_back(outOfPlane->getWeightCross());
            }
            siteWeights.push_back(weights);
        }
    }
    /**
     * Get the number of particle groups, which is the number of recorded virtual sites.
     */
    int getNumParticleGroups() {
        return (int) siteTypes.size();
    }
    /**
     * Get the particles in a group.
     *
     * @param index      the index of the group
     * @param particles  on exit, the particles recorded for that virtual site
     */
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        particles = siteParticles[index];
    }
    /**
     * Get whether two groups are identical: the sites have the same type and
     * exactly the same list of weights.
     *
     * @param group1  the index of the first group
     * @param group2  the index of the second group
     */
    bool areGroupsIdentical(int group1, int group2) {
        // Compare the type_info objects themselves, not their addresses.  The
        // language does not guarantee that every typeid evaluation for one type
        // refers to the same type_info object, so an address comparison could
        // report two sites of the same type as different.
        if (*siteTypes[group1] != *siteTypes[group2])
            return false;

        // Vector equality checks both the number of weights and each value.
        return siteWeights[group1] == siteWeights[group2];
    }
private:
    vector<const type_info*> siteTypes;
    vector<vector<int> > siteParticles;
    vector<vector<double> > siteWeights;
};
void
CudaContext
::
findMoleculeGroups
()
{
// The first time this is called, we need to identify all the molecules in the system.
if
(
moleculeGroups
.
size
()
==
0
)
{
// Add a ForceInfo that makes sure reordering doesn't break virtual sites.
addForce
(
new
VirtualSiteInfo
(
system
));
// First make a list of every other atom to which each atom is connected by a constraint or force group.
vector
<
vector
<
int
>
>
atomBonds
(
system
.
getNumParticles
());
for
(
int
i
=
0
;
i
<
system
.
getNumConstraints
();
i
++
)
{
int
particle1
,
particle2
;
double
distance
;
system
.
getConstraintParameters
(
i
,
particle1
,
particle2
,
distance
);
atomBonds
[
particle1
].
push_back
(
particle2
);
atomBonds
[
particle2
].
push_back
(
particle1
);
}
for
(
int
i
=
0
;
i
<
(
int
)
forces
.
size
();
i
++
)
{
for
(
int
j
=
0
;
j
<
forces
[
i
]
->
getNumParticleGroups
();
j
++
)
{
vector
<
int
>
particles
;
forces
[
i
]
->
getParticlesInGroup
(
j
,
particles
);
for
(
int
k
=
0
;
k
<
(
int
)
particles
.
size
();
k
++
)
for
(
int
m
=
0
;
m
<
(
int
)
particles
.
size
();
m
++
)
if
(
k
!=
m
)
atomBonds
[
particles
[
k
]].
push_back
(
particles
[
m
]);
}
}
// Now tag atoms by which molecule they belong to.
vector
<
int
>
atomMolecule
(
numAtoms
,
-
1
);
int
numMolecules
=
0
;
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
if
(
atomMolecule
[
i
]
==
-
1
)
tagAtomsInMolecule
(
i
,
numMolecules
++
,
atomMolecule
,
atomBonds
);
vector
<
vector
<
int
>
>
atomIndices
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
atomIndices
[
atomMolecule
[
i
]].
push_back
(
i
);
// Construct a description of each molecule.
molecules
.
resize
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
molecules
[
i
].
atoms
=
atomIndices
[
i
];
molecules
[
i
].
groups
.
resize
(
forces
.
size
());
}
for
(
int
i
=
0
;
i
<
system
.
getNumConstraints
();
i
++
)
{
int
particle1
,
particle2
;
double
distance
;
system
.
getConstraintParameters
(
i
,
particle1
,
particle2
,
distance
);
molecules
[
atomMolecule
[
particle1
]].
constraints
.
push_back
(
i
);
}
for
(
int
i
=
0
;
i
<
(
int
)
forces
.
size
();
i
++
)
for
(
int
j
=
0
;
j
<
forces
[
i
]
->
getNumParticleGroups
();
j
++
)
{
vector
<
int
>
particles
;
forces
[
i
]
->
getParticlesInGroup
(
j
,
particles
);
molecules
[
atomMolecule
[
particles
[
0
]]].
groups
[
i
].
push_back
(
j
);
}
}
// Sort them into groups of identical molecules.
vector
<
Molecule
>
uniqueMolecules
;
vector
<
vector
<
int
>
>
moleculeInstances
;
vector
<
vector
<
int
>
>
moleculeOffsets
;
for
(
int
molIndex
=
0
;
molIndex
<
(
int
)
molecules
.
size
();
molIndex
++
)
{
Molecule
&
mol
=
molecules
[
molIndex
];
// See if it is identical to another molecule.
bool
isNew
=
true
;
for
(
int
j
=
0
;
j
<
(
int
)
uniqueMolecules
.
size
()
&&
isNew
;
j
++
)
{
Molecule
&
mol2
=
uniqueMolecules
[
j
];
bool
identical
=
(
mol
.
atoms
.
size
()
==
mol2
.
atoms
.
size
()
&&
mol
.
constraints
.
size
()
==
mol2
.
constraints
.
size
());
// See if the atoms are identical.
int
atomOffset
=
mol2
.
atoms
[
0
]
-
mol
.
atoms
[
0
];
for
(
int
i
=
0
;
i
<
(
int
)
mol
.
atoms
.
size
()
&&
identical
;
i
++
)
{
if
(
mol
.
atoms
[
i
]
!=
mol2
.
atoms
[
i
]
-
atomOffset
||
system
.
getParticleMass
(
mol
.
atoms
[
i
])
!=
system
.
getParticleMass
(
mol2
.
atoms
[
i
]))
identical
=
false
;
for
(
int
k
=
0
;
k
<
(
int
)
forces
.
size
();
k
++
)
if
(
!
forces
[
k
]
->
areParticlesIdentical
(
mol
.
atoms
[
i
],
mol2
.
atoms
[
i
]))
identical
=
false
;
}
// See if the constraints are identical.
for
(
int
i
=
0
;
i
<
(
int
)
mol
.
constraints
.
size
()
&&
identical
;
i
++
)
{
int
c1particle1
,
c1particle2
,
c2particle1
,
c2particle2
;
double
distance1
,
distance2
;
system
.
getConstraintParameters
(
mol
.
constraints
[
i
],
c1particle1
,
c1particle2
,
distance1
);
system
.
getConstraintParameters
(
mol2
.
constraints
[
i
],
c2particle1
,
c2particle2
,
distance2
);
if
(
c1particle1
!=
c2particle1
-
atomOffset
||
c1particle2
!=
c2particle2
-
atomOffset
||
distance1
!=
distance2
)
identical
=
false
;
}
// See if the force groups are identical.
for
(
int
i
=
0
;
i
<
(
int
)
forces
.
size
()
&&
identical
;
i
++
)
{
if
(
mol
.
groups
[
i
].
size
()
!=
mol2
.
groups
[
i
].
size
())
identical
=
false
;
for
(
int
k
=
0
;
k
<
(
int
)
mol
.
groups
[
i
].
size
()
&&
identical
;
k
++
)
if
(
!
forces
[
i
]
->
areGroupsIdentical
(
mol
.
groups
[
i
][
k
],
mol2
.
groups
[
i
][
k
]))
identical
=
false
;
}
if
(
identical
)
{
moleculeInstances
[
j
].
push_back
(
molIndex
);
moleculeOffsets
[
j
].
push_back
(
mol
.
atoms
[
0
]);
isNew
=
false
;
}
}
if
(
isNew
)
{
uniqueMolecules
.
push_back
(
mol
);
moleculeInstances
.
push_back
(
vector
<
int
>
());
moleculeInstances
[
moleculeInstances
.
size
()
-
1
].
push_back
(
molIndex
);
moleculeOffsets
.
push_back
(
vector
<
int
>
());
moleculeOffsets
[
moleculeOffsets
.
size
()
-
1
].
push_back
(
mol
.
atoms
[
0
]);
}
}
moleculeGroups
.
resize
(
moleculeInstances
.
size
());
for
(
int
i
=
0
;
i
<
(
int
)
moleculeInstances
.
size
();
i
++
)
{
moleculeGroups
[
i
].
instances
=
moleculeInstances
[
i
];
moleculeGroups
[
i
].
offsets
=
moleculeOffsets
[
i
];
vector
<
int
>&
atoms
=
uniqueMolecules
[
i
].
atoms
;
moleculeGroups
[
i
].
atoms
.
resize
(
atoms
.
size
());
for
(
int
j
=
0
;
j
<
(
int
)
atoms
.
size
();
j
++
)
moleculeGroups
[
i
].
atoms
[
j
]
=
atoms
[
j
]
-
atoms
[
0
];
}
}
// Flag the current molecule definitions as possibly stale by setting moleculesInvalid.
// This function only sets the flag; no revalidation is performed here.
void
CudaContext
::
invalidateMolecules
()
{
moleculesInvalid
=
true
;
}
//void OpenCLContext::validateMolecules() {
// moleculesInvalid = false;
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
...
...
platforms/cuda2/src/CudaContext.h
View file @
3e16cab9
...
...
@@ -72,11 +72,11 @@ public:
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
std
::
string
&
precision
,
const
std
::
string
&
compiler
,
const
std
::
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
);
~
CudaContext
();
//
/**
//
* This is called to initialize internal data structures after all Forces in the system
//
* have been initialized.
//
*/
//
void initialize();
/**
* This is called to initialize internal data structures after all Forces in the system
* have been initialized.
*/
void
initialize
();
/**
* Add a CudaForce to this context.
*/
...
...
@@ -123,12 +123,12 @@ public:
CudaArray
&
getVelm
()
{
return
*
velm
;
}
//
/**
//
* Get the array which contains the force on each atom.
//
*/
//
CudaArray
<mm_float4>
& getForce() {
//
return *force;
//
}
/**
* Get the array which contains the force on each atom
(represented as a long3 in 64 bit fixed point)
.
*/
CudaArray
&
getForce
()
{
return
*
force
;
}
// /**
// * Get the array which contains the buffers in which forces are computed.
// */
...
...
@@ -184,36 +184,41 @@ public:
* omitted, a default set of options will be used
*/
CUmodule
createModule
(
const
std
::
string
source
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
defines
,
const
char
*
optimizationFlags
=
NULL
);
// /**
// * Execute a kernel.
// *
// * @param kernel the kernel to execute
// * @param workUnits the maximum number of work units that should be used
// * @param blockSize the size of each thread block to use
// */
// void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<float>& array);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<mm_float4>& array);
// /**
// * Set all elements of an array to 0.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void clearBuffer(cl::Memory& memory, int size);
// /**
// * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void addAutoclearBuffer(cl::Memory& memory, int size);
/**
* Get a kernel from a CUDA module.
*
* @param module the module to get the kernel from
* @param name the name of the kernel to get
*/
CUfunction
getKernel
(
CUmodule
&
module
,
const
std
::
string
&
name
);
/**
* Execute a kernel.
*
* @param kernel the kernel to execute
* @param arguments an array of pointers to the kernel arguments
* @param workUnits the maximum number of threads that should be used
* @param blockSize the size of each thread block to use
* @param sharedSize the amount of dynamic shared memory to allocate for the kernel, in bytes
*/
void
executeKernel
(
CUfunction
kernel
,
void
**
arguments
,
int
workUnits
,
int
blockSize
=
-
1
,
unsigned
int
sharedSize
=
0
);
/**
* Set all elements of an array to 0.
*/
void
clearBuffer
(
CudaArray
&
array
);
/**
* Set all elements of an array to 0.
*
* @param memory the memory to clear
* @param size the number of 4-byte elements in the buffer
*/
void
clearBuffer
(
CUdeviceptr
memory
,
int
size
);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the memory to clear
* @param size the number of float/double elements in the buffer
*/
void
addAutoclearBuffer
(
CUdeviceptr
memory
,
int
size
);
// /**
// * Clear all buffers that have been registered with addAutoclearBuffer().
// */
...
...
@@ -230,108 +235,110 @@ public:
// * Sum the buffesr containing forces.
// */
// void reduceForces();
// /**
// * Get the current simulation time.
// */
// double getTime() {
// return time;
// }
// /**
// * Set the current simulation time.
// */
// void setTime(double t) {
// time = t;
// }
// /**
// * Get the number of integration steps that have been taken.
// */
// int getStepCount() {
// return stepCount;
// }
// /**
// * Set the number of integration steps that have been taken.
// */
// void setStepCount(int steps) {
// stepCount = steps;
// }
// /**
// * Get the number of times forces or energy has been computed.
// */
// int getComputeForceCount() {
// return computeForceCount;
// }
// /**
// * Set the number of times forces or energy has been computed.
// */
// void setComputeForceCount(int count) {
// computeForceCount = count;
// }
// /**
// * Get the number of atoms.
// */
// int getNumAtoms() const {
// return numAtoms;
// }
// /**
// * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
// * most arrays with one element per atom.
// */
// int getPaddedNumAtoms() const {
// return paddedNumAtoms;
// }
// /**
// * Get the number of blocks of TileSize atoms.
// */
// int getNumAtomBlocks() const {
// return numAtomBlocks;
// }
// /**
// * Get the standard number of thread blocks to use when executing kernels.
// */
// int getNumThreadBlocks() const {
// return numThreadBlocks;
// }
// /**
// * Get the number of force buffers.
// */
// int getNumForceBuffers() const {
// return numForceBuffers;
// }
// /**
// * Get the SIMD width of the device being used.
// */
// int getSIMDWidth() const {
// return simdWidth;
// }
// /**
// * Get whether the device being used supports 64 bit atomic operations on global memory.
// */
// bool getSupports64BitGlobalAtomics() {
// return supports64BitGlobalAtomics;
// }
// /**
// * Get whether the device being used supports double precision math.
// */
// bool getSupportsDoublePrecision() {
// return supportsDoublePrecision;
// }
/**
* Get the current simulation time.
*/
double
getTime
()
{
return
time
;
}
/**
* Set the current simulation time.
*/
void
setTime
(
double
t
)
{
time
=
t
;
}
/**
* Get the number of integration steps that have been taken.
*/
int
getStepCount
()
{
return
stepCount
;
}
/**
* Set the number of integration steps that have been taken.
*/
void
setStepCount
(
int
steps
)
{
stepCount
=
steps
;
}
/**
* Get the number of times forces or energy has been computed.
*/
int
getComputeForceCount
()
{
return
computeForceCount
;
}
/**
* Set the number of times forces or energy has been computed.
*/
void
setComputeForceCount
(
int
count
)
{
computeForceCount
=
count
;
}
/**
* Get the number of atoms.
*/
int
getNumAtoms
()
const
{
return
numAtoms
;
}
/**
* Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
* most arrays with one element per atom.
*/
int
getPaddedNumAtoms
()
const
{
return
paddedNumAtoms
;
}
/**
* Get the number of blocks of TileSize atoms.
*/
int
getNumAtomBlocks
()
const
{
return
numAtomBlocks
;
}
/**
* Get the standard number of thread blocks to use when executing kernels.
*/
int
getNumThreadBlocks
()
const
{
return
numThreadBlocks
;
}
/**
* Get whether double precision is being used.
*/
bool
getUseDoublePrecision
()
{
return
useDoublePrecision
;
}
/**
* Get whether accumulation is being done in double precision.
*/
bool
getAccumulateInDouble
()
{
return
accumulateInDouble
;
}
/**
* Convert a number to a string in a format suitable for including in a kernel.
* This takes into account whether the context uses single or double precision.
*/
std
::
string
doubleToString
(
double
value
);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
std
::
string
intToString
(
int
value
);
/**
* Convert a CUDA result code to the corresponding string description.
*/
std
::
string
getErrorString
(
CUresult
result
);
// /**
// * Get the size of the periodic box.
// */
//
mm_
float4 getPeriodicBoxSize() const {
// float4 getPeriodicBoxSize() const {
// return periodicBoxSize;
// }
// /**
// * Set the size of the periodic box.
// */
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// periodicBoxSize = m
m
_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = m
m
_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// periodicBoxSize = m
ake
_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = m
ake
_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// }
// /**
// * Get the inverse of the size of the periodic box.
// */
//
mm_
float4 getInvPeriodicBoxSize() const {
// float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize;
// }
// /**
...
...
@@ -352,66 +359,66 @@ public:
// CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded;
// }
//
/**
//
* Get the thread used by this context for executing parallel computations.
//
*/
//
WorkThread& getWorkThread() {
//
return *thread;
//
}
//
/**
//
* Get whether atoms were reordered during the most recent force/energy computation.
//
*/
//
bool getAtomsWereReordered() const {
//
return atomsWereReordered;
//
}
//
/**
//
* Set whether atoms were reordered during the most recent force/energy computation.
//
*/
//
void setAtomsWereReordered(bool wereReordered) {
//
atomsWereReordered = wereReordered;
//
}
//
/**
//
* Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
//
* together in the arrays.
//
*
//
* @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
//
*/
//
void reorderAtoms(bool enforcePeriodic);
//
/**
//
* Add a listener that should be called whenever atoms get reordered. The CudaContext
//
* assumes ownership of the object, and deletes it when the context itself is deleted.
//
*/
//
void addReorderListener(ReorderListener* listener);
//
/**
//
* Get the list of ReorderListeners.
//
*/
//
std::vector<ReorderListener*>& getReorderListeners() {
//
return reorderListeners;
//
}
//
/**
//
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
//
* This should be called whenever force field parameters change. It will cause the definitions
//
* and order to be revalidated the next to reorderAtoms() is called.
//
*/
//
void invalidateMolecules();
//
/**
//
* Get whether the current molecule definitions are valid.
//
*/
//
bool getMoleculesAreInvalid() {
//
return moleculesInvalid;
//
}
/**
* Get the thread used by this context for executing parallel computations.
*/
WorkThread
&
getWorkThread
()
{
return
*
thread
;
}
/**
* Get whether atoms were reordered during the most recent force/energy computation.
*/
bool
getAtomsWereReordered
()
const
{
return
atomsWereReordered
;
}
/**
* Set whether atoms were reordered during the most recent force/energy computation.
*/
void
setAtomsWereReordered
(
bool
wereReordered
)
{
atomsWereReordered
=
wereReordered
;
}
/**
* Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
* together in the arrays.
*
* @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
*/
void
reorderAtoms
(
bool
enforcePeriodic
);
/**
* Add a listener that should be called whenever atoms get reordered. The CudaContext
* assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void
addReorderListener
(
ReorderListener
*
listener
);
/**
* Get the list of ReorderListeners.
*/
std
::
vector
<
ReorderListener
*>&
getReorderListeners
()
{
return
reorderListeners
;
}
/**
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
* This should be called whenever force field parameters change. It will cause the definitions
* and order to be revalidated the next time reorderAtoms() is called.
*/
void
invalidateMolecules
();
/**
* Get whether the current molecule definitions are valid.
*/
bool
getMoleculesAreInvalid
()
{
return
moleculesInvalid
;
}
private:
struct
Molecule
;
struct
MoleculeGroup
;
class
VirtualSiteInfo
;
//
void findMoleculeGroups();
//
static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
//
/**
//
* Ensure that all molecules marked as "identical" really are identical. This should be
//
* called whenever force field parameters change. If necessary, it will rebuild the list
//
* of molecules and resort the atoms.
//
*/
//
void validateMolecules();
void
findMoleculeGroups
();
static
void
tagAtomsInMolecule
(
int
atom
,
int
molecule
,
std
::
vector
<
int
>&
atomMolecule
,
std
::
vector
<
std
::
vector
<
int
>
>&
atomBonds
);
/**
* Ensure that all molecules marked as "identical" really are identical. This should be
* called whenever force field parameters change. If necessary, it will rebuild the list
* of molecules and resort the atoms.
*/
void
validateMolecules
();
static
bool
hasInitializedCuda
;
const
System
&
system
;
double
time
;
...
...
@@ -424,8 +431,6 @@ private:
int
paddedNumAtoms
;
int
numAtomBlocks
;
int
numThreadBlocks
;
// int numForceBuffers;
// int simdWidth;
bool
useBlockingSync
,
useDoublePrecision
,
accumulateInDouble
,
contextIsValid
,
atomsWereReordered
,
moleculesInvalid
;
std
::
string
compiler
,
tempDir
,
gpuArchitecture
;
float4
periodicBoxSize
;
...
...
@@ -446,15 +451,15 @@ private:
std
::
vector
<
Molecule
>
molecules
;
std
::
vector
<
MoleculeGroup
>
moleculeGroups
;
std
::
vector
<
int4
>
posCellOffsets
;
void
*
pinnedBuffer
;
CudaArray
*
posq
;
CudaArray
*
velm
;
// CudaArray<mm_float4>* force;
// CudaArray<mm_float4>* forceBuffers;
// CudaArray<cl_long>* longForceBuffer;
// CudaArray<cl_float>* energyBuffer;
// CudaArray<cl_int>* atomIndex;
// std::vector<cl::Memory*> autoclearBuffers;
// std::vector<int> autoclearBufferSizes;
CudaArray
*
force
;
CudaArray
*
energyBuffer
;
CudaArray
*
atomIndexDevice
;
std
::
vector
<
int
>
atomIndex
;
std
::
vector
<
CUdeviceptr
>
autoclearBuffers
;
std
::
vector
<
int
>
autoclearBufferSizes
;
std
::
vector
<
ReorderListener
*>
reorderListeners
;
// CudaIntegrationUtilities* integration;
// CudaBondedUtilities* bonded;
...
...
platforms/cuda2/src/CudaPlatform.cpp
View file @
3e16cab9
...
...
@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
device
<<
contexts
[
i
]
->
getDeviceIndex
();
}
propertyValues
[
CudaPlatform
::
CudaDeviceIndex
()]
=
device
.
str
();
propertyValues
[
CudaPlatform
::
CudaUseBlockingSync
()]
=
blocking
?
"true"
:
"false"
;
propertyValues
[
CudaPlatform
::
CudaPrecision
()]
=
precisionProperty
;
propertyValues
[
CudaPlatform
::
CudaCompiler
()]
=
compilerProperty
;
propertyValues
[
CudaPlatform
::
CudaTempDirectory
()]
=
tempProperty
;
...
...
@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() {
}
/**
 * Call initialize() on every CudaContext held by this PlatformData, in the
 * order in which they are stored. The system argument is not used here.
 */
void CudaPlatform::PlatformData::initializeContexts(const System& system) {
    for (size_t index = 0; index < contexts.size(); ++index)
        contexts[index]->initialize();
}
/**
 * Flush the work thread of every CudaContext held by this PlatformData, in
 * the order in which the contexts are stored.
 */
void CudaPlatform::PlatformData::syncContexts() {
    for (size_t index = 0; index < contexts.size(); ++index)
        contexts[index]->getWorkThread().flush();
}
platforms/cuda2/src/CudaSort.cpp
0 → 100644
View file @
3e16cab9
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaSort.h"
#include "CudaKernelSources.h"
#include <map>
using
namespace
OpenMM
;
using
namespace
std
;
/**
 * Create a CudaSort object for sorting arrays of a fixed length.
 *
 * The constructor compiles the sorting kernels with the type information supplied
 * by the trait, chooses the thread block size used by each kernel from the device
 * limits, and allocates the workspace arrays.
 *
 * @param context    the context in which to perform calculations
 * @param trait      describes the data type and sorting key; it is deleted by the destructor
 * @param length     the length of the arrays this object will be used to sort
 * @throws OpenMMException if the device limits cannot be queried, or if the device
 *                         cannot hold even a single element per block for sorting
 */
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
    // Create kernels.  The placeholders in the kernel source are replaced with the
    // strings supplied by the trait before the module is compiled.

    map<string, string> replacements;
    replacements["DATA_TYPE"] = trait->getDataType();
    replacements["KEY_TYPE"] = trait->getKeyType();
    replacements["SORT_KEY"] = trait->getSortKey();
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
    computeRangeKernel = context.getKernel(module, "computeRange");
    assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
    computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
    copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
    sortBucketsKernel = context.getKernel(module, "sortBuckets");

    // Work out the thread block sizes for various kernels.  The result of each
    // attribute query is checked: on failure the output variable would otherwise
    // be left unset and every size derived from it would be garbage.

    int maxBlockSize = 0;
    CUresult result = cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
    if (result != CUDA_SUCCESS)
        throw OpenMMException("CudaSort: error querying the maximum block size: "+context.getErrorString(result));

    // Largest power of two that does not exceed the maximum block size.
    for (rangeKernelSize = 1; rangeKernelSize*2 <= (unsigned int) maxBlockSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = rangeKernelSize;
    sortKernelSize = rangeKernelSize/2;
    if (rangeKernelSize > length)
        rangeKernelSize = length;

    int maxSharedMem = 0;
    result = cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
    if (result != CUDA_SUCCESS)
        throw OpenMMException("CudaSort: error querying the shared memory size: "+context.getErrorString(result));

    // Limit the sort block size to half the number of elements that fit in shared memory.
    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;

    // A block size of zero would make sort() divide by zero when it rounds the
    // number of threads up to a multiple of sortKernelSize.
    if (sortKernelSize == 0)
        throw OpenMMException("CudaSort: the device cannot sort elements of this size");

    // Never let the target bucket size reach zero, since it is used as a divisor.
    unsigned int targetBucketSize = (sortKernelSize < 2 ? 1 : sortKernelSize/2);
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.

    dataRange = new CudaArray(2, trait->getKeySize(), "sortDataRange");
    bucketOffset = CudaArray::create<uint1>(numBuckets, "bucketOffset");
    bucketOfElement = CudaArray::create<uint1>(length, "bucketOfElement");
    offsetInBucket = CudaArray::create<uint1>(length, "offsetInBucket");
    buckets = new CudaArray(length, trait->getDataSize(), "buckets");
}
/**
 * Release the trait object and every workspace array owned by this object.
 */
CudaSort::~CudaSort() {
    delete trait;

    // Deleting a null pointer is a no-op, so entries that were never allocated
    // need no special handling.
    CudaArray* const workspaces[] = {dataRange, bucketOfElement, offsetInBucket, bucketOffset, buckets};
    const int numWorkspaces = sizeof(workspaces)/sizeof(workspaces[0]);
    for (int which = 0; which < numWorkspaces; ++which)
        delete workspaces[which];
}
/**
 * Sort an array in place on the device.
 *
 * @param data    the array to sort; its length and element size must match the
 *                length and trait this object was created with
 * @throws OpenMMException if the array does not match
 */
void CudaSort::sort(CudaArray& data) {
    if (!(data.getSize() == bucketOfElement->getSize() && data.getElementSize() == trait->getDataSize()))
        throw OpenMMException("CudaSort called with different data size");
    if (data.getSize() == 0)
        return;

    // Kernel arguments are passed as pointers to their values, so keep the scalar
    // arguments in locals and bind references to the device pointers of each array.

    unsigned int dataSize = data.getSize();
    unsigned int numBuckets = bucketOffset->getSize();
    CUdeviceptr& dataPtr = data.getDevicePointer();
    CUdeviceptr& rangePtr = dataRange->getDevicePointer();
    CUdeviceptr& bucketOffsetPtr = bucketOffset->getDevicePointer();
    CUdeviceptr& bucketOfElementPtr = bucketOfElement->getDevicePointer();
    CUdeviceptr& offsetInBucketPtr = offsetInBucket->getDevicePointer();
    CUdeviceptr& bucketsPtr = buckets->getDevicePointer();

    // Step 1: find the minimum and maximum key (one thread block).

    void* rangeArgs[] = {&dataPtr, &dataSize, &rangePtr};
    context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());

    // Step 2: reset the per-bucket counters, then assign every element to a bucket.

    context.clearBuffer(*bucketOffset);
    void* elementsArgs[] = {&dataPtr, &dataSize, &numBuckets, &rangePtr,
            &bucketOffsetPtr, &bucketOfElementPtr, &offsetInBucketPtr};
    context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());

    // Step 3: turn the bucket sizes into bucket positions (one thread block).

    void* computeArgs[] = {&numBuckets, &bucketOffsetPtr};
    context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));

    // Step 4: scatter the elements into their buckets.

    void* copyArgs[] = {&dataPtr, &bucketsPtr, &dataSize, &bucketOffsetPtr, &bucketOfElementPtr, &offsetInBucketPtr};
    context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());

    // Step 5: sort each bucket.  The thread count is the data size rounded up
    // to a whole number of blocks.

    void* sortArgs[] = {&dataPtr, &bucketsPtr, &numBuckets, &bucketOffsetPtr};
    context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
platforms/cuda2/src/CudaSort.h
0 → 100644
View file @
3e16cab9
#ifndef __OPENMM_CUDASORT_H__
#define __OPENMM_CUDASORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "openmm/internal/windowsExport.h"
#include "CudaContext.h"
namespace
OpenMM
{
/**
 * This class sorts arrays of values on the device.  The values need not be scalars:
 * any type can be sorted, so long as an appropriate key can be defined by which to
 * order them.
 *
 * What is sorted, and by which key, is specified by a "trait" class.  Here is an
 * example of a trait class for sorting floats:
 *
 * class SortTrait : public CudaSort::SortTrait {
 *     int getDataSize() const {return 4;}
 *     int getKeySize() const {return 4;}
 *     const char* getDataType() const {return "float";}
 *     const char* getKeyType() const {return "float";}
 *     const char* getMinKey() const {return "-MAXFLOAT";}
 *     const char* getMaxKey() const {return "MAXFLOAT";}
 *     const char* getMaxValue() const {return "MAXFLOAT";}
 *     const char* getSortKey() const {return "value";}
 * };
 *
 * The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
 * (in local memory when possible, in global memory otherwise).  This is similar to
 * the algorithm described in
 *
 * Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng.  "An Efficient
 * Sorting Algorithm with CUDA"  Journal of the Chinese Institute of Engineers, 32(7),
 * pp. 915-921 (2009)
 *
 * but with many modifications and simplifications.  In particular, this algorithm
 * involves much less communication between host and device, which is critical to get
 * good performance with the array sizes we typically work with (10,000 to 100,000
 * elements).
 */
class OPENMM_EXPORT CudaSort {
public:
    class SortTrait;
    /**
     * Create a CudaSort object for sorting data of a particular type.
     *
     * @param context    the context in which to perform calculations
     * @param trait      a SortTrait defining the type of data to sort.  It should have been allocated
     *                   on the heap with the "new" operator.  This object takes over ownership of it,
     *                   and deletes it when the CudaSort is deleted.
     * @param length     the length of the arrays this object will be used to sort
     */
    CudaSort(CudaContext& context, SortTrait* trait, unsigned int length);
    ~CudaSort();
    /**
     * Sort an array.
     */
    void sort(CudaArray& data);
private:
    // Context and trait supplied to the constructor.
    CudaContext& context;
    SortTrait* trait;
    // Workspace arrays allocated by the constructor and deleted by the destructor.
    CudaArray* dataRange;
    CudaArray* bucketOfElement;
    CudaArray* offsetInBucket;
    CudaArray* bucketOffset;
    CudaArray* buckets;
    // Kernels looked up from the compiled sort module.
    CUfunction computeRangeKernel;
    CUfunction assignElementsKernel;
    CUfunction computeBucketPositionsKernel;
    CUfunction copyToBucketsKernel;
    CUfunction sortBucketsKernel;
    // Thread block sizes used when launching the corresponding kernels.
    unsigned int rangeKernelSize;
    unsigned int positionsKernelSize;
    unsigned int sortKernelSize;
};
/**
 * A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
 */
class CudaSort::SortTrait {
public:
    /**
     * CudaSort deletes its trait through a pointer to this base class, so the
     * destructor must be virtual for subclasses to be destroyed correctly.
     */
    virtual ~SortTrait() {
    }
    /**
     * Get the size of each data value in bytes.
     */
    virtual int getDataSize() const = 0;
    /**
     * Get the size of each key value in bytes.
     */
    virtual int getKeySize() const = 0;
    /**
     * Get the data type of the values to sort.
     */
    virtual const char* getDataType() const = 0;
    /**
     * Get the data type of the sorting key.
     */
    virtual const char* getKeyType() const = 0;
    /**
     * Get the minimum value a key can take.
     */
    virtual const char* getMinKey() const = 0;
    /**
     * Get the maximum value a key can take.
     */
    virtual const char* getMaxKey() const = 0;
    /**
     * Get a value whose key is guaranteed to equal getMaxKey().
     */
    virtual const char* getMaxValue() const = 0;
    /**
     * Get the CUDA code to select the key from the data value.
     */
    virtual const char* getSortKey() const = 0;
};
}
// namespace OpenMM
#endif // __OPENMM_CUDASORT_H__
platforms/cuda2/src/kernels/sort.cu
0 → 100644
View file @
3e16cab9
/**
 * Compute the sorting key of a data element.  The key expression is substituted
 * into this source before compilation and is written in terms of "value", so the
 * parameter name must not change.
 */
__device__ KEY_TYPE getValue(DATA_TYPE value) {
    const KEY_TYPE key = SORT_KEY;
    return key;
}
extern
"C"
{
/**
 * Find the smallest and largest key in the array to be sorted, and store them in
 * range[0] and range[1] respectively.  This kernel is executed as a single thread
 * block, and needs one key of dynamic shared memory per thread.
 */
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range) {
    extern __shared__ KEY_TYPE rangeBuffer[];
    const unsigned int tid = threadIdx.x;
    const unsigned int numThreads = blockDim.x;

    // Every thread scans its own strided subset of the elements.

    KEY_TYPE lowest = MAX_KEY;
    KEY_TYPE highest = MIN_KEY;
    for (unsigned int i = tid; i < length; i += numThreads) {
        const KEY_TYPE key = getValue(data[i]);
        lowest = min(lowest, key);
        highest = max(highest, key);
    }

    // Combine the per-thread minima with a tree reduction in shared memory.

    rangeBuffer[tid] = lowest;
    __syncthreads();
    for (unsigned int stride = 1; stride < numThreads; stride *= 2) {
        const bool combines = (tid+stride < numThreads && tid%(2*stride) == 0);
        if (combines)
            rangeBuffer[tid] = min(rangeBuffer[tid], rangeBuffer[tid+stride]);
        __syncthreads();
    }
    lowest = rangeBuffer[0];

    // Reuse the same buffer to combine the per-thread maxima.

    rangeBuffer[tid] = highest;
    __syncthreads();
    for (unsigned int stride = 1; stride < numThreads; stride *= 2) {
        const bool combines = (tid+stride < numThreads && tid%(2*stride) == 0);
        if (combines)
            rangeBuffer[tid] = max(rangeBuffer[tid], rangeBuffer[tid+stride]);
        __syncthreads();
    }
    highest = rangeBuffer[0];

    // A single thread publishes the result.

    if (tid == 0) {
        range[0] = lowest;
        range[1] = highest;
    }
}
/**
 * Assign each element to a bucket.  The key range [range[0], range[1]] is divided
 * into numBuckets equal intervals.  For each element this records the bucket it
 * falls into and the slot it claimed within that bucket, while bucketOffset
 * accumulates the number of elements in each bucket.
 */
__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
        unsigned int* bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
    const float lowest = (float) (range[0]);
    const float highest = (float) (range[1]);
    const float widthPerBucket = (highest-lowest)/numBuckets;
    const unsigned int lastBucket = numBuckets-1;
    const unsigned int firstElement = blockDim.x*blockIdx.x+threadIdx.x;
    const unsigned int totalThreads = blockDim.x*gridDim.x;
    for (unsigned int i = firstElement; i < length; i += totalThreads) {
        const float key = (float) getValue(data[i]);

        // Clamp to the last bucket so the largest key does not fall off the end.
        const unsigned int rawBucket = (unsigned int) ((key-lowest)/widthPerBucket);
        const unsigned int bucket = min(rawBucket, lastBucket);

        // The value returned by the atomic increment is this element's slot in the bucket.
        offsetInBucket[i] = atomicAdd(&bucketOffset[bucket], 1);
        bucketOfElement[i] = bucket;
    }
}
/**
 * Sum the bucket sizes to compute the position of each bucket.  On entry
 * bucketOffset holds the number of elements in each bucket; on exit entry i holds
 * the total size of buckets 0 through i (an inclusive prefix sum).  This kernel is
 * executed as a single thread block, and needs one unsigned int of dynamic shared
 * memory per thread.
 */
__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
    extern __shared__ unsigned int posBuffer[];
    unsigned int globalOffset = 0;
    for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
        // Load the bucket sizes for this chunk into shared memory, padding with zeros.

        unsigned int globalIndex = startBucket+threadIdx.x;
        posBuffer[threadIdx.x] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
        __syncthreads();

        // Perform a parallel prefix sum.

        for (unsigned int step = 1; step < blockDim.x; step *= 2) {
            unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
            __syncthreads();
            posBuffer[threadIdx.x] += add;
            __syncthreads();
        }

        // Write the results back to global memory.

        if (globalIndex < numBuckets)
            bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;
        globalOffset += posBuffer[blockDim.x-1];

        // Every thread must have read the chunk total before any thread starts the
        // next iteration and overwrites posBuffer.  Without this barrier, a thread
        // that runs ahead could replace posBuffer[blockDim.x-1] while others still
        // need the old value.  All threads run the same number of iterations, so
        // the barrier is reached uniformly.
        __syncthreads();
    }
}
/**
 * Scatter the input elements into the bucket array.
 *
 * An element goes to the start of its bucket plus the slot recorded for it in offsetInBucket.
 * The start of bucket b is read from bucketOffset[b-1], and is 0 for the first bucket.
 */
__global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* __restrict__ buckets, unsigned int length, const unsigned int* __restrict__ bucketOffset,
        const unsigned int* __restrict__ bucketOfElement, const unsigned int* __restrict__ offsetInBucket) {
    const unsigned int stride = blockDim.x*gridDim.x;
    unsigned int element = blockDim.x*blockIdx.x+threadIdx.x;
    while (element < length) {
        const unsigned int bucket = bucketOfElement[element];
        unsigned int bucketStart = 0;
        if (bucket != 0)
            bucketStart = bucketOffset[bucket-1];
        buckets[bucketStart+offsetInBucket[element]] = data[element];
        element += stride;
    }
}
/**
 * Sort the elements of each bucket by key, writing the sorted result into data.
 *
 * Thread block b handles buckets b, b+gridDim.x, b+2*gridDim.x, ...  Bucket i covers the
 * index range [bucketOffset[i-1], bucketOffset[i]), starting at 0 for the first bucket.
 * A bucket with at most blockDim.x elements is sorted in shared memory; a larger one is
 * copied into data and sorted in place in global memory.  Both paths use a bitonic sort.
 *
 * The shared buffer is indexed by threadIdx.x, so the launch must supply room for at least
 * blockDim.x elements of DATA_TYPE.
 * NOTE(review): the shared memory path iterates k up to blockDim.x in powers of two, so it
 * presumably relies on blockDim.x being a power of two - confirm against the launch code.
 */
__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) {
    extern __shared__ DATA_TYPE dataBuffer[];
    const unsigned int tid = threadIdx.x;
    for (unsigned int bucket = blockIdx.x; bucket < numBuckets; bucket += gridDim.x) {
        unsigned int first = 0;
        if (bucket != 0)
            first = bucketOffset[bucket-1];
        const unsigned int count = bucketOffset[bucket]-first;
        if (count > blockDim.x) {
            // The bucket does not fit in the shared buffer.  Move it to the output array.

            for (unsigned int i = tid; i < count; i += blockDim.x)
                data[first+i] = buckets[first+i];
            __threadfence_block();
            __syncthreads();

            // Bitonic sort directly on the output array.

            for (unsigned int k = 2; k < 2*count; k *= 2) {
                for (unsigned int j = k/2; j > 0; j /= 2) {
                    for (unsigned int i = tid; i < count; i += blockDim.x) {
                        int ixj = i^j;

                        // Only the lower index of a pair does the work, and only if its
                        // partner lies inside the bucket.

                        if (ixj <= i || ixj >= count)
                            continue;
                        DATA_TYPE value1 = data[first+i];
                        DATA_TYPE value2 = data[first+ixj];

                        // Start from the usual direction bit, then flip it once for every
                        // higher power of two (below 2*count) whose bit is clear in i.

                        bool ascending = ((i&k) == 0);
                        for (unsigned int mask = k*2; mask < 2*count; mask *= 2) {
                            if ((i&mask) == 0)
                                ascending = !ascending;
                        }
                        KEY_TYPE key1 = getValue(value1);
                        KEY_TYPE key2 = getValue(value2);
                        bool outOfOrder = (ascending ? key1 > key2 : key2 > key1);
                        if (outOfOrder) {
                            data[first+i] = value2;
                            data[first+ixj] = value1;
                        }
                    }

                    // Make this stage's writes visible to the whole block before the next one.

                    __threadfence_block();
                    __syncthreads();
                }
            }
        }
        else {
            // The bucket fits in the shared buffer.  Load it, filling unused slots with
            // MAX_VALUE.
            // NOTE(review): presumably MAX_VALUE has a key no smaller than any real key, so
            // the padding ends up behind the real elements - confirm.

            if (tid < count)
                dataBuffer[tid] = buckets[first+tid];
            else
                dataBuffer[tid] = MAX_VALUE;
            __syncthreads();

            // Bitonic sort in shared memory, one element per thread.

            for (unsigned int k = 2; k <= blockDim.x; k *= 2) {
                for (unsigned int j = k/2; j > 0; j /= 2) {
                    int ixj = tid^j;
                    if (ixj > tid) {
                        DATA_TYPE value1 = dataBuffer[tid];
                        DATA_TYPE value2 = dataBuffer[ixj];
                        bool ascending = (tid&k) == 0;
                        KEY_TYPE key1 = getValue(value1);
                        KEY_TYPE key2 = getValue(value2);
                        bool outOfOrder = (ascending ? key1 > key2 : key2 > key1);
                        if (outOfOrder) {
                            dataBuffer[tid] = value2;
                            dataBuffer[ixj] = value1;
                        }
                    }
                    __syncthreads();
                }
            }

            // Store the real elements (not the padding) in the output array.

            if (tid < count)
                data[first+tid] = dataBuffer[tid];
        }
    }
}
}
\ No newline at end of file
platforms/cuda2/src/kernels/utilities.cu
View file @
3e16cab9
extern
"C"
{
/**
* This is called by the various functions below to clear a buffer.
*/
...
...
@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest
buffer
[
index
]
=
sum
;
}
}
}
\ No newline at end of file
platforms/cuda2/tests/TestCudaSort.cpp
0 → 100644
View file @
3e16cab9
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of sorting.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "../src/CudaArray.h"
#include "../src/CudaContext.h"
#include "../src/CudaSort.h"
#include "sfmt/SFMT.h"
#include "openmm/System.h"
#include <iostream>
#include <cmath>
#include <set>
using
namespace
OpenMM
;
using
namespace
std
;
/**
 * Trait used by this test to sort an array of floats, where each element is its own key.
 * The returned strings are type names and expressions handed to CudaSort.
 */
class SortTrait : public CudaSort::SortTrait {
    // Data elements and keys are both reported as 4 bytes in size.
    int getDataSize() const {
        return 4;
    }
    int getKeySize() const {
        return 4;
    }

    // Names of the data and key types.
    const char* getDataType() const {
        return "float";
    }
    const char* getKeyType() const {
        return "float";
    }

    // Expressions for the extreme key values and for the largest data value.
    const char* getMinKey() const {
        return "-MAXFLOAT";
    }
    const char* getMaxKey() const {
        return "MAXFLOAT";
    }
    const char* getMaxValue() const {
        return "MAXFLOAT";
    }

    // Expression for the sort key of an element.
    const char* getSortKey() const {
        return "value";
    }
};
void
verifySorting
(
vector
<
float
>
array
)
{
// Sort the array.
System
system
;
system
.
addParticle
(
0.0
);
CudaPlatform
platform
;
CudaPlatform
::
PlatformData
platformData
(
system
,
""
,
"true"
,
"single"
,
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaCompiler
()),
platform
.
getPropertyDefaultValue
(
CudaPlatform
::
CudaTempDirectory
()));
CudaContext
&
context
=
*
platformData
.
contexts
[
0
];
context
.
initialize
();
CudaArray
data
(
array
.
size
(),
4
,
"sortData"
);
data
.
upload
(
array
);
CudaSort
sort
(
context
,
new
SortTrait
(),
array
.
size
());
sort
.
sort
(
data
);
vector
<
float
>
sorted
;
data
.
download
(
sorted
);
// Verify that it is in sorted order.
for
(
int
i
=
1
;
i
<
(
int
)
sorted
.
size
();
i
++
)
ASSERT
(
sorted
[
i
-
1
]
<=
sorted
[
i
]);
// Make sure the sorted array contains the same values as the original one.
multiset
<
float
>
elements1
(
array
.
begin
(),
array
.
end
());
multiset
<
float
>
elements2
(
sorted
.
begin
(),
sorted
.
end
());
ASSERT
(
elements1
==
elements2
);
}
/**
 * Sort 10000 values drawn directly from the random number generator (seeded with 0).
 * NOTE(review): genrand_real2 presumably returns uniform values in [0, 1) - confirm.
 */
void testUniformValues() {
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
    vector<float> values(10000);
    for (vector<float>::iterator value = values.begin(); value != values.end(); ++value)
        *value = (float) genrand_real2(sfmt);
    verifySorting(values);
}
/**
 * Sort 10000 values formed by taking the logarithm of each random number (generator seeded
 * with 0), which gives the sort a differently distributed set of keys to work on.
 */
void testLogValues() {
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
    vector<float> values(10000);
    for (vector<float>::iterator value = values.begin(); value != values.end(); ++value)
        *value = (float) log(genrand_real2(sfmt));
    verifySorting(values);
}
/**
 * Run both sorting tests.  Prints "Done" and returns 0 if they complete; if either throws,
 * prints the exception message and returns 1.
 */
int main() {
    bool failed = false;
    try {
        testUniformValues();
        testLogValues();
    }
    catch(const exception& e) {
        cout << "exception: " << e.what() << endl;
        failed = true;
    }
    if (failed)
        return 1;
    cout << "Done" << endl;
    return 0;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment