Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
3e16cab9
"platforms/vscode:/vscode.git/clone" did not exist on "83470b8e295ff3667df3d4cc45996c1015f69904"
Commit
3e16cab9
authored
Jun 05, 2012
by
Peter Eastman
Browse files
Continuing to implement new CUDA platform
parent
abb8cb4b
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
1158 additions
and
509 deletions
+1158
-509
platforms/cuda2/src/CudaArray.h
platforms/cuda2/src/CudaArray.h
+1
-1
platforms/cuda2/src/CudaContext.cpp
platforms/cuda2/src/CudaContext.cpp
+367
-309
platforms/cuda2/src/CudaContext.h
platforms/cuda2/src/CudaContext.h
+200
-195
platforms/cuda2/src/CudaPlatform.cpp
platforms/cuda2/src/CudaPlatform.cpp
+5
-4
platforms/cuda2/src/CudaSort.cpp
platforms/cuda2/src/CudaSort.cpp
+132
-0
platforms/cuda2/src/CudaSort.h
platforms/cuda2/src/CudaSort.h
+141
-0
platforms/cuda2/src/kernels/sort.cu
platforms/cuda2/src/kernels/sort.cu
+186
-0
platforms/cuda2/src/kernels/utilities.cu
platforms/cuda2/src/kernels/utilities.cu
+4
-0
platforms/cuda2/tests/TestCudaSort.cpp
platforms/cuda2/tests/TestCudaSort.cpp
+122
-0
No files found.
platforms/cuda2/src/CudaArray.h
View file @
3e16cab9
...
...
@@ -83,7 +83,7 @@ public:
/**
* Get a pointer to the device memory.
*/
CUdeviceptr
getDevicePointer
()
{
CUdeviceptr
&
getDevicePointer
()
{
return
pointer
;
}
/**
...
...
platforms/cuda2/src/CudaContext.cpp
View file @
3e16cab9
...
...
@@ -31,7 +31,6 @@
#include "CudaContext.h"
#include "CudaArray.h"
//#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaForceInfo.h"
//#include "CudaIntegrationUtilities.h"
#include "CudaKernelSources.h"
...
...
@@ -53,7 +52,7 @@
#define CHECK_RESULT2(result, prefix) \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
m<<prefix<<": "<<result<<" ("<<__FILE__<<":
"<<__LINE__
<<")"
; \
m<<prefix<<": "<<
getErrorString(
result
)
<<" (
"<<result<<")"<<" at
"<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
...
...
@@ -66,7 +65,7 @@ bool CudaContext::hasInitializedCuda = false;
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
const
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
)
:
system
(
system
),
compiler
(
compiler
),
time
(
0.0
),
platformData
(
platformData
),
stepCount
(
0
),
computeForceCount
(
0
),
contextIsValid
(
false
),
atomsWereReordered
(
false
),
posq
(
NULL
),
time
(
0.0
),
platformData
(
platformData
),
stepCount
(
0
),
computeForceCount
(
0
),
contextIsValid
(
false
),
atomsWereReordered
(
false
),
pinnedBuffer
(
NULL
),
posq
(
NULL
),
velm
(
NULL
),
/*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL),*/
thread
(
NULL
)
{
if
(
!
hasInitializedCuda
)
{
...
...
@@ -88,7 +87,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
else
throw
OpenMMException
(
"Illegal value for CudaPrecision: "
+
precision
);
#ifdef WIN32
this
->
tempDir
=
tempDir
+
"
\"
;
this
->
tempDir
=
tempDir
+
"
\
\
"
;
#else
this
->
tempDir
=
tempDir
+
"/"
;
#endif
...
...
@@ -114,6 +113,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
deviceIndex
=
i
;
bestSpeed
=
speed
;
bestCompute
=
major
;
gpuArchitecture
=
intToString
(
major
)
+
intToString
(
minor
);
}
}
}
...
...
@@ -121,37 +121,47 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
throw
OpenMMException
(
"No compatible CUDA device is available"
);
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
deviceIndex
));
this
->
deviceIndex
=
deviceIndex
;
int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
gpuArchitecture = CudaExpressionUtilities::intToString(major)+CudaExpressionUtilities::intToString(minor);
compilationDefines["
WORK_GROUP_SIZE
"] = CudaExpressionUtilities::intToString(ThreadBlockSize);
compilationDefines
[
"WORK_GROUP_SIZE"
]
=
intToString
(
ThreadBlockSize
);
defaultOptimizationOptions
=
"--use_fast_math"
;
int numThreadBlocksPerComputeUnit = 6;
CHECK_RESULT(cuCtxCreate(&context, 0, device));
unsigned
int
flags
=
CU_CTX_MAP_HOST
;
if
(
useBlockingSync
)
flags
+=
CU_CTX_SCHED_BLOCKING_SYNC
;
else
flags
+=
CU_CTX_SCHED_SPIN
;
CHECK_RESULT
(
cuCtxCreate
(
&
context
,
flags
,
device
));
contextIsValid
=
true
;
numAtoms
=
system
.
getNumParticles
();
paddedNumAtoms
=
TileSize
*
((
numAtoms
+
TileSize
-
1
)
/
TileSize
);
numAtomBlocks
=
(
paddedNumAtoms
+
(
TileSize
-
1
))
/
TileSize
;
int
multiprocessors
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
device
));
int
numThreadBlocksPerComputeUnit
=
6
;
numThreadBlocks
=
numThreadBlocksPerComputeUnit
*
multiprocessors
;
// bonded = new CudaBondedUtilities(*this);
// nonbonded = new CudaNonbondedUtilities(*this);
if
(
useDoublePrecision
)
{
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
paddedNumAtoms
*
sizeof
(
double4
),
0
));
posq
=
CudaArray
::
create
<
double4
>
(
paddedNumAtoms
,
"posq"
);
velm
=
CudaArray
::
create
<
double4
>
(
paddedNumAtoms
,
"velm"
);
}
else
{
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
paddedNumAtoms
*
sizeof
(
float4
),
0
));
posq
=
CudaArray
::
create
<
float4
>
(
paddedNumAtoms
,
"posq"
);
velm
=
CudaArray
::
create
<
float4
>
(
paddedNumAtoms
,
"velm"
);
}
posCellOffsets
.
resize
(
paddedNumAtoms
,
make_int4
(
0
,
0
,
0
,
0
));
// Create utility kernels that are used in multiple places.
CUmodule
utilities
=
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
utilities
);
cuModuleGetFunction(&
clearBufferKernel
,
utilities, "
clearBuffer
");
cuModuleGetFunction(&
clearTwoBuffersKernel
,
utilities, "
clearTwoBuffers
");
cuModuleGetFunction(&
clearThreeBuffersKernel
,
utilities, "
clearThreeBuffers
");
cuModuleGetFunction(&
clearFourBuffersKernel
,
utilities, "
clearFourBuffers
");
cuModuleGetFunction(&
clearFiveBuffersKernel
,
utilities, "
clearFiveBuffers
");
cuModuleGetFunction(&
clearSixBuffersKernel
,
utilities, "
clearSixBuffers
");
cuModuleGetFunction(&
reduceFloat4Kernel
,
utilities, "
reduceFloat4Buffer
");
cuModuleGetFunction(&
reduceForcesKernel
,
utilities, "
reduceForces
");
clearBufferKernel
=
getKernel
(
utilities
,
"clearBuffer"
);
clearTwoBuffersKernel
=
getKernel
(
utilities
,
"clearTwoBuffers"
);
clearThreeBuffersKernel
=
getKernel
(
utilities
,
"clearThreeBuffers"
);
clearFourBuffersKernel
=
getKernel
(
utilities
,
"clearFourBuffers"
);
clearFiveBuffersKernel
=
getKernel
(
utilities
,
"clearFiveBuffers"
);
clearSixBuffersKernel
=
getKernel
(
utilities
,
"clearSixBuffers"
);
reduceFloat4Kernel
=
getKernel
(
utilities
,
"reduceFloat4Buffer"
);
reduceForcesKernel
=
getKernel
(
utilities
,
"reduceForces"
);
// Set defines based on the requested precision.
...
...
@@ -175,6 +185,8 @@ CudaContext::~CudaContext() {
delete
forces
[
i
];
for
(
int
i
=
0
;
i
<
(
int
)
reorderListeners
.
size
();
i
++
)
delete
reorderListeners
[
i
];
if
(
pinnedBuffer
!=
NULL
)
cuMemFreeHost
(
pinnedBuffer
);
if
(
posq
!=
NULL
)
delete
posq
;
if
(
velm
!=
NULL
)
...
...
@@ -202,38 +214,29 @@ CudaContext::~CudaContext() {
CHECK_RESULT
(
cuCtxDestroy
(
context
));
}
//void CudaContext::initialize() {
// for (int i = 0; i < numAtoms; i++) {
// double mass = system.getParticleMass(i);
// (*velm)[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
// }
// velm->upload();
void
CudaContext
::
initialize
()
{
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
double
mass
=
system
.
getParticleMass
(
i
);
if
(
useDoublePrecision
)
((
double4
*
)
pinnedBuffer
)[
i
]
=
make_double4
(
0.0
,
0.0
,
0.0
,
mass
==
0.0
?
0.0
:
1.0
/
mass
);
else
((
float4
*
)
pinnedBuffer
)[
i
]
=
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
mass
==
0.0
?
0.0
f
:
(
float
)
(
1.0
/
mass
));
}
velm
->
upload
(
pinnedBuffer
);
// bonded->initialize(system);
// numForceBuffers = platformData.contexts.size();
// numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
// for (int i = 0; i < (int) forces.size(); i++)
// numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
// forceBuffers = new CudaArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "
forceBuffers
", false);
// if (supports64BitGlobalAtomics) {
// longForceBuffer = new CudaArray<cl_long>(*this, 3*paddedNumAtoms, "
longForceBuffer
", false);
// reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
// reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
// reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
// reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
// addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
// }
// addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
// force = new CudaArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "
force
", true);
// energyBuffer = new CudaArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "
energyBuffer
", true);
// addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
// atomIndex = new CudaArray<cl_int>(*this, paddedNumAtoms, "
atomIndex
", true);
// for (int i = 0; i < paddedNumAtoms; ++i)
// (*atomIndex)[i] = i;
// atomIndex->upload();
// findMoleculeGroups();
// moleculesInvalid = false;
force
=
CudaArray
::
create
<
long3
>
(
paddedNumAtoms
,
"force"
);
addAutoclearBuffer
(
force
->
getDevicePointer
(),
force
->
getSize
()
*
6
);
energyBuffer
=
CudaArray
::
create
<
float
>
(
numThreadBlocks
*
ThreadBlockSize
,
"energyBuffer"
);
addAutoclearBuffer
(
energyBuffer
->
getDevicePointer
(),
energyBuffer
->
getSize
());
atomIndexDevice
=
CudaArray
::
create
<
int
>
(
paddedNumAtoms
,
"atomIndex"
);
atomIndex
.
resize
(
paddedNumAtoms
);
for
(
int
i
=
0
;
i
<
paddedNumAtoms
;
++
i
)
atomIndex
[
i
]
=
i
;
atomIndexDevice
->
upload
(
atomIndex
);
findMoleculeGroups
();
moleculesInvalid
=
false
;
// nonbonded->initialize(system);
//
}
}
void
CudaContext
::
addForce
(
CudaForceInfo
*
force
)
{
forces
.
push_back
(
force
);
...
...
@@ -315,7 +318,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUresult
result
=
cuModuleLoad
(
&
module
,
outputFile
.
c_str
());
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
m
;
m
<<
"Error loading CUDA module: "
<<
result
;
m
<<
"Error loading CUDA module: "
<<
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
throw
OpenMMException
(
m
.
str
());
}
remove
(
inputFile
.
c_str
());
...
...
@@ -329,52 +332,109 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
remove
(
logFile
.
c_str
());
throw
;
}
//
// // Get length before using c_str() to avoid length() call invalidating the c_str() value.
// string src_string = src.str();
// ::size_t src_length = src_string.length();
// cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
// cl::Program program(context, sources);
// try {
// program.build(vector<cl::Device>(1, device), options.c_str());
// } catch (cl::Error err) {
// throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
// }
}
//
//void CudaContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
// if (blockSize == -1)
// blockSize = ThreadBlockSize;
// int size = std::min((workUnits+blockSize-1)/blockSize, numThreadBlocks)*blockSize;
// try {
// queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(size), cl::NDRange(blockSize));
// }
// catch (cl::Error err) {
// stringstream str;
// str<<"Error invoking kernel "<<kernel.getInfo<CL_KERNEL_FUNCTION_NAME>()<<": "<<err.what()<<" ("<<err.err()<<")";
// throw OpenMMException(str.str());
// }
//}
//
//void CudaContext::clearBuffer(CudaArray<float>& array) {
// clearBuffer(array.getDeviceBuffer(), array.getSize());
//}
//
//void CudaContext::clearBuffer(CudaArray<mm_float4>& array) {
// clearBuffer(array.getDeviceBuffer(), array.getSize()*4);
//}
//
//void CudaContext::clearBuffer(cl::Memory& memory, int size) {
// clearBufferKernel.setArg<cl::Memory>(0, memory);
// clearBufferKernel.setArg<cl_int>(1, size);
// executeKernel(clearBufferKernel, size, 128);
//}
//
//void CudaContext::addAutoclearBuffer(cl::Memory& memory, int size) {
// autoclearBuffers.push_back(&memory);
// autoclearBufferSizes.push_back(size);
//}
//
/**
 * Look up a kernel by name inside an already loaded module.
 *
 * @param module  the module to search
 * @param name    the name of the kernel function to retrieve
 * @return the handle produced by cuModuleGetFunction()
 * @throws OpenMMException if cuModuleGetFunction() reports anything other than
 *         CUDA_SUCCESS; the message contains the kernel name, the symbolic error
 *         name and the numeric error code
 */
CUfunction CudaContext::getKernel(CUmodule& module, const string& name) {
    CUfunction kernel;
    const CUresult status = cuModuleGetFunction(&kernel, module, name.c_str());
    if (status == CUDA_SUCCESS)
        return kernel;

    // The lookup failed, so report which kernel was requested and why it failed.
    std::stringstream message;
    message<<"Error creating kernel "<<name<<": "<<getErrorString(status)<<" ("<<status<<")";
    throw OpenMMException(message.str());
}
/**
 * Format a floating point value as text in scientific notation.
 *
 * When useDoublePrecision is set the value is written with a stream precision
 * of 16.  Otherwise a precision of 8 is used and an "f" suffix is appended.
 *
 * @param value  the number to format
 * @return the formatted text
 */
string CudaContext::doubleToString(double value) {
    stringstream text;
    if (useDoublePrecision) {
        text.precision(16);
        text<<scientific<<value;
    }
    else {
        text.precision(8);
        text<<scientific<<value<<"f";
    }
    return text.str();
}
/**
 * Format an integer as decimal text using default stream formatting.
 *
 * @param value  the number to format
 * @return the formatted text
 */
string CudaContext::intToString(int value) {
    ostringstream text;
    text<<value;
    return text.str();
}
/**
 * Translate a CUDA driver result code into the name of its enumeration constant.
 *
 * @param result  the result code to describe
 * @return the constant's name (for example "CUDA_ERROR_OUT_OF_MEMORY"), or
 *         "Invalid error code" when the value is not one of the cases listed here
 */
std::string CudaContext::getErrorString(CUresult result) {
    // Start with the fallback text; a matching case below overwrites it.
    const char* name = "Invalid error code";
    switch (result) {
        case CUDA_SUCCESS: name = "CUDA_SUCCESS"; break;
        case CUDA_ERROR_INVALID_VALUE: name = "CUDA_ERROR_INVALID_VALUE"; break;
        case CUDA_ERROR_OUT_OF_MEMORY: name = "CUDA_ERROR_OUT_OF_MEMORY"; break;
        case CUDA_ERROR_NOT_INITIALIZED: name = "CUDA_ERROR_NOT_INITIALIZED"; break;
        case CUDA_ERROR_DEINITIALIZED: name = "CUDA_ERROR_DEINITIALIZED"; break;
        case CUDA_ERROR_PROFILER_DISABLED: name = "CUDA_ERROR_PROFILER_DISABLED"; break;
        case CUDA_ERROR_PROFILER_NOT_INITIALIZED: name = "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; break;
        case CUDA_ERROR_PROFILER_ALREADY_STARTED: name = "CUDA_ERROR_PROFILER_ALREADY_STARTED"; break;
        case CUDA_ERROR_PROFILER_ALREADY_STOPPED: name = "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; break;
        case CUDA_ERROR_NO_DEVICE: name = "CUDA_ERROR_NO_DEVICE"; break;
        case CUDA_ERROR_INVALID_DEVICE: name = "CUDA_ERROR_INVALID_DEVICE"; break;
        case CUDA_ERROR_INVALID_IMAGE: name = "CUDA_ERROR_INVALID_IMAGE"; break;
        case CUDA_ERROR_INVALID_CONTEXT: name = "CUDA_ERROR_INVALID_CONTEXT"; break;
        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: name = "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; break;
        case CUDA_ERROR_MAP_FAILED: name = "CUDA_ERROR_MAP_FAILED"; break;
        case CUDA_ERROR_UNMAP_FAILED: name = "CUDA_ERROR_UNMAP_FAILED"; break;
        case CUDA_ERROR_ARRAY_IS_MAPPED: name = "CUDA_ERROR_ARRAY_IS_MAPPED"; break;
        case CUDA_ERROR_ALREADY_MAPPED: name = "CUDA_ERROR_ALREADY_MAPPED"; break;
        case CUDA_ERROR_NO_BINARY_FOR_GPU: name = "CUDA_ERROR_NO_BINARY_FOR_GPU"; break;
        case CUDA_ERROR_ALREADY_ACQUIRED: name = "CUDA_ERROR_ALREADY_ACQUIRED"; break;
        case CUDA_ERROR_NOT_MAPPED: name = "CUDA_ERROR_NOT_MAPPED"; break;
        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: name = "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; break;
        case CUDA_ERROR_NOT_MAPPED_AS_POINTER: name = "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; break;
        case CUDA_ERROR_ECC_UNCORRECTABLE: name = "CUDA_ERROR_ECC_UNCORRECTABLE"; break;
        case CUDA_ERROR_UNSUPPORTED_LIMIT: name = "CUDA_ERROR_UNSUPPORTED_LIMIT"; break;
        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: name = "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; break;
        case CUDA_ERROR_INVALID_SOURCE: name = "CUDA_ERROR_INVALID_SOURCE"; break;
        case CUDA_ERROR_FILE_NOT_FOUND: name = "CUDA_ERROR_FILE_NOT_FOUND"; break;
        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: name = "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; break;
        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: name = "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; break;
        case CUDA_ERROR_OPERATING_SYSTEM: name = "CUDA_ERROR_OPERATING_SYSTEM"; break;
        case CUDA_ERROR_INVALID_HANDLE: name = "CUDA_ERROR_INVALID_HANDLE"; break;
        case CUDA_ERROR_NOT_FOUND: name = "CUDA_ERROR_NOT_FOUND"; break;
        case CUDA_ERROR_NOT_READY: name = "CUDA_ERROR_NOT_READY"; break;
        case CUDA_ERROR_LAUNCH_FAILED: name = "CUDA_ERROR_LAUNCH_FAILED"; break;
        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: name = "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; break;
        case CUDA_ERROR_LAUNCH_TIMEOUT: name = "CUDA_ERROR_LAUNCH_TIMEOUT"; break;
        case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: name = "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; break;
        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: name = "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; break;
        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: name = "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; break;
        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: name = "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; break;
        case CUDA_ERROR_CONTEXT_IS_DESTROYED: name = "CUDA_ERROR_CONTEXT_IS_DESTROYED"; break;
        case CUDA_ERROR_UNKNOWN: name = "CUDA_ERROR_UNKNOWN"; break;
    }
    return name;
}
/**
 * Launch a kernel on a one dimensional grid.
 *
 * The grid holds enough blocks to cover the requested number of threads, but
 * never more than numThreadBlocks.  The launch is issued with 0 as the stream
 * argument and NULL for the extra launch options.
 *
 * @param kernel      the kernel to launch
 * @param arguments   array of pointers to the kernel's arguments, handed to
 *                    cuLaunchKernel() unchanged
 * @param threads     the number of threads the caller would like to run
 * @param blockSize   the number of threads per block, or -1 to use ThreadBlockSize
 * @param sharedSize  the value passed as cuLaunchKernel()'s shared memory size argument
 * @throws OpenMMException if cuLaunchKernel() does not return CUDA_SUCCESS
 */
void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
    int threadsPerBlock = blockSize;
    if (threadsPerBlock == -1)
        threadsPerBlock = ThreadBlockSize;

    // Round up so a partially filled final block is still launched, then apply the cap.
    const int blocksRequested = (threads+threadsPerBlock-1)/threadsPerBlock;
    const int gridSize = std::min(blocksRequested, numThreadBlocks);

    const CUresult status = cuLaunchKernel(kernel, gridSize, 1, 1, threadsPerBlock, 1, 1, sharedSize, 0, arguments, NULL);
    if (status == CUDA_SUCCESS)
        return;
    stringstream message;
    message<<"Error invoking kernel: "<<getErrorString(status)<<" ("<<status<<")";
    throw OpenMMException(message.str());
}
/**
 * Clear the device memory belonging to an array.
 *
 * The array's size in bytes (element count times element size) is divided by 4
 * before being forwarded to clearBuffer(CUdeviceptr, int).
 *
 * @param array  the array whose device memory should be cleared
 */
void CudaContext::clearBuffer(CudaArray& array) {
    const int numWords = array.getSize()*array.getElementSize()/4;
    clearBuffer(array.getDevicePointer(), numWords);
}
/**
 * Run clearBufferKernel on a region of device memory.
 *
 * The kernel receives the pointer and the size as its two arguments and is
 * launched for "size" threads with a block size of 128.
 *
 * @param memory  the device memory to clear
 * @param size    the size passed to the kernel (NOTE(review): presumably a count
 *                of 4 byte elements, judging by the CudaArray overload - confirm
 *                against the kernel source)
 */
void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
    void* kernelArgs[2];
    kernelArgs[0] = &memory;
    kernelArgs[1] = &size;
    executeKernel(clearBufferKernel, kernelArgs, size, 128);
}
/**
 * Record a region of device memory in the autoclear lists.
 *
 * The pointer and its size are appended to autoclearBuffers and
 * autoclearBufferSizes respectively, so matching entries share an index.
 *
 * @param memory  the device memory to register
 * @param size    the size to store alongside it
 */
void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
    autoclearBuffers.push_back(memory);
    autoclearBufferSizes.push_back(size);
}
//void CudaContext::clearAutoclearBuffers() {
// int base = 0;
// int total = autoclearBufferSizes.size();
...
...
@@ -454,219 +514,217 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
// executeKernel(reduceFloat4Kernel, bufferSize, 128);
//}
//
//void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
// // Recursively tag atoms as belonging to a particular molecule.
//
// atomMolecule[atom] = molecule;
// for (int i = 0; i < (int) atomBonds[atom].size(); i++)
// if (atomMolecule[atomBonds[atom][i]] == -1)
// tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
//}
//
///**
// * This class ensures that atom reordering doesn't break virtual sites.
// */
//class CudaContext::VirtualSiteInfo : public CudaForceInfo {
//public:
// VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
// for (int i = 0; i < system.getNumParticles(); i++) {
// if (system.isVirtualSite(i)) {
// siteTypes.push_back(&typeid(system.getVirtualSite(i)));
// vector<int> particles;
// particles.push_back(i);
// for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
// particles.push_back(system.getVirtualSite(i).getParticle(j));
// siteParticles.push_back(particles);
// vector<double> weights;
// if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A two particle average.
//
// const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1));
// }
// else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
// // A three particle average.
//
// const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight(0));
// weights.push_back(site.getWeight(1));
// weights.push_back(site.getWeight(2));
// }
// else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
// // An out of plane site.
//
// const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
// weights.push_back(site.getWeight12());
// weights.push_back(site.getWeight13());
// weights.push_back(site.getWeightCross());
// }
// siteWeights.push_back(weights);
// }
// }
// }
// int getNumParticleGroups() {
// return siteTypes.size();
// }
// void getParticlesInGroup(int index, std::vector<int>& particles) {
// particles = siteParticles[index];
// }
// bool areGroupsIdentical(int group1, int group2) {
// if (siteTypes[group1] != siteTypes[group2])
// return false;
// int numParticles = siteWeights[group1].size();
// if (siteWeights[group2].size() != numParticles)
// return false;
// for (int i = 0; i < numParticles; i++)
// if (siteWeights[group1][i] != siteWeights[group2][i])
// return false;
// return true;
// }
//private:
// vector<const type_info*> siteTypes;
// vector<vector<int> > siteParticles;
// vector<vector<double> > siteWeights;
//};
//
//
//void CudaContext::findMoleculeGroups() {
// // The first time this is called, we need to identify all the molecules in the system.
//
// if (moleculeGroups.size() == 0) {
// // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
//
// addForce(new VirtualSiteInfo(system));
//
// // First make a list of every other atom to which each atom is connect by a constraint or force group.
//
// vector<vector<int> > atomBonds(system.getNumParticles());
// for (int i = 0; i < system.getNumConstraints(); i++) {
// int particle1, particle2;
// double distance;
// system.getConstraintParameters(i, particle1, particle2, distance);
// atomBonds[particle1].push_back(particle2);
// atomBonds[particle2].push_back(particle1);
// }
// for (int i = 0; i < (int) forces.size(); i++) {
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// vector<int> particles;
// forces[i]->getParticlesInGroup(j, particles);
// for (int k = 0; k < (int) particles.size(); k++)
// for (int m = 0; m < (int) particles.size(); m++)
// if (k != m)
// atomBonds[particles[k]].push_back(particles[m]);
// }
// }
//
// // Now tag atoms by which molecule they belong to.
//
// vector<int> atomMolecule(numAtoms, -1);
// int numMolecules = 0;
// for (int i = 0; i < numAtoms; i++)
// if (atomMolecule[i] == -1)
// tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
// vector<vector<int> > atomIndices(numMolecules);
// for (int i = 0; i < numAtoms; i++)
// atomIndices[atomMolecule[i]].push_back(i);
//
// // Construct a description of each molecule.
//
// molecules.resize(numMolecules);
// for (int i = 0; i < numMolecules; i++) {
// molecules[i].atoms = atomIndices[i];
// molecules[i].groups.resize(forces.size());
// }
// for (int i = 0; i < system.getNumConstraints(); i++) {
// int particle1, particle2;
// double distance;
// system.getConstraintParameters(i, particle1, particle2, distance);
// molecules[atomMolecule[particle1]].constraints.push_back(i);
// }
// for (int i = 0; i < (int) forces.size(); i++)
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
// vector<int> particles;
// forces[i]->getParticlesInGroup(j, particles);
// molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
// }
// }
//
// // Sort them into groups of identical molecules.
//
// vector<Molecule> uniqueMolecules;
// vector<vector<int> > moleculeInstances;
// vector<vector<int> > moleculeOffsets;
// for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
// Molecule& mol = molecules[molIndex];
//
// // See if it is identical to another molecule.
//
// bool isNew = true;
// for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
// Molecule& mol2 = uniqueMolecules[j];
// bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
//
// // See if the atoms are identical.
//
// int atomOffset = mol2.atoms[0]-mol.atoms[0];
// for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
// if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
// identical = false;
// for (int k = 0; k < (int) forces.size(); k++)
// if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
// identical = false;
// }
//
// // See if the constraints are identical.
//
// for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
// int c1particle1, c1particle2, c2particle1, c2particle2;
// double distance1, distance2;
// system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
// system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
// if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
// identical = false;
// }
//
// // See if the force groups are identical.
//
// for (int i = 0; i < (int) forces.size() && identical; i++) {
// if (mol.groups[i].size() != mol2.groups[i].size())
// identical = false;
// for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
// if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
// identical = false;
// }
// if (identical) {
// moleculeInstances[j].push_back(molIndex);
// moleculeOffsets[j].push_back(mol.atoms[0]);
// isNew = false;
// }
// }
// if (isNew) {
// uniqueMolecules.push_back(mol);
// moleculeInstances.push_back(vector<int>());
// moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
// moleculeOffsets.push_back(vector<int>());
// moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
// }
// }
// moleculeGroups.resize(moleculeInstances.size());
// for (int i = 0; i < (int) moleculeInstances.size(); i++)
// {
// moleculeGroups[i].instances = moleculeInstances[i];
// moleculeGroups[i].offsets = moleculeOffsets[i];
// vector<int>& atoms = uniqueMolecules[i].atoms;
// moleculeGroups[i].atoms.resize(atoms.size());
// for (int j = 0; j < (int) atoms.size(); j++)
// moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
// }
//}
//
//void CudaContext::invalidateMolecules() {
// moleculesInvalid = true;
//}
//
//
void
CudaContext
::
tagAtomsInMolecule
(
int
atom
,
int
molecule
,
vector
<
int
>&
atomMolecule
,
vector
<
vector
<
int
>
>&
atomBonds
)
{
// Recursively tag atoms as belonging to a particular molecule.
atomMolecule
[
atom
]
=
molecule
;
for
(
int
i
=
0
;
i
<
(
int
)
atomBonds
[
atom
].
size
();
i
++
)
if
(
atomMolecule
[
atomBonds
[
atom
][
i
]]
==
-
1
)
tagAtomsInMolecule
(
atomBonds
[
atom
][
i
],
molecule
,
atomMolecule
,
atomBonds
);
}
/**
 * This class ensures that atom reordering doesn't break virtual sites.
 *
 * Every virtual site in the System becomes one particle group consisting of the
 * site itself followed by the particles it is defined from.  Two groups are
 * considered identical only when the sites have the same type and the same weights.
 */
class CudaContext::VirtualSiteInfo : public CudaForceInfo {
public:
    /**
     * Record the type, particles and weights of every virtual site in a System.
     *
     * @param system  the System to scan for virtual sites
     */
    VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
        for (int i = 0; i < system.getNumParticles(); i++) {
            if (!system.isVirtualSite(i))
                continue;
            siteTypes.push_back(&typeid(system.getVirtualSite(i)));

            // The group lists the site first, then the particles it depends on.

            vector<int> particles;
            particles.push_back(i);
            for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
                particles.push_back(system.getVirtualSite(i).getParticle(j));
            siteParticles.push_back(particles);

            // Collect the weights for the site types handled here.  A site of any
            // other type is recorded with an empty weight list.

            vector<double> weights;
            const TwoParticleAverageSite* twoParticle = dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i));
            if (twoParticle != NULL) {
                // A two particle average.

                weights.push_back(twoParticle->getWeight(0));
                weights.push_back(twoParticle->getWeight(1));
            }
            else {
                const ThreeParticleAverageSite* threeParticle = dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i));
                if (threeParticle != NULL) {
                    // A three particle average.

                    weights.push_back(threeParticle->getWeight(0));
                    weights.push_back(threeParticle->getWeight(1));
                    weights.push_back(threeParticle->getWeight(2));
                }
                else {
                    const OutOfPlaneSite* outOfPlane = dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i));
                    if (outOfPlane != NULL) {
                        // An out of plane site.

                        weights.push_back(outOfPlane->getWeight12());
                        weights.push_back(outOfPlane->getWeight13());
                        weights.push_back(outOfPlane->getWeightCross());
                    }
                }
            }
            siteWeights.push_back(weights);
        }
    }
    /**
     * Get the number of particle groups, which is one per virtual site.
     */
    int getNumParticleGroups() {
        return (int) siteTypes.size();
    }
    /**
     * Get the particles in a group: the virtual site followed by the particles it depends on.
     *
     * @param index      the index of the group
     * @param particles  on exit, the indices of the particles in the group
     */
    void getParticlesInGroup(int index, std::vector<int>& particles) {
        particles = siteParticles[index];
    }
    /**
     * Decide whether two groups describe the same kind of virtual site with the same weights.
     *
     * @param group1  the index of the first group
     * @param group2  the index of the second group
     * @return true if both sites have the same type and equal weight lists
     */
    bool areGroupsIdentical(int group1, int group2) {
        // Compare the type_info objects rather than their addresses: separate
        // typeid evaluations of one type are not guaranteed to yield the same object.
        if (!(*siteTypes[group1] == *siteTypes[group2]))
            return false;
        // Vector equality checks the lengths first and then every element.
        return (siteWeights[group1] == siteWeights[group2]);
    }
private:
    vector<const type_info*> siteTypes;
    vector<vector<int> > siteParticles;
    vector<vector<double> > siteWeights;
};
void
CudaContext
::
findMoleculeGroups
()
{
// The first time this is called, we need to identify all the molecules in the system.
if
(
moleculeGroups
.
size
()
==
0
)
{
// Add a ForceInfo that makes sure reordering doesn't break virtual sites.
addForce
(
new
VirtualSiteInfo
(
system
));
// First make a list of every other atom to which each atom is connected by a constraint or force group.
vector
<
vector
<
int
>
>
atomBonds
(
system
.
getNumParticles
());
for
(
int
i
=
0
;
i
<
system
.
getNumConstraints
();
i
++
)
{
int
particle1
,
particle2
;
double
distance
;
system
.
getConstraintParameters
(
i
,
particle1
,
particle2
,
distance
);
atomBonds
[
particle1
].
push_back
(
particle2
);
atomBonds
[
particle2
].
push_back
(
particle1
);
}
for
(
int
i
=
0
;
i
<
(
int
)
forces
.
size
();
i
++
)
{
for
(
int
j
=
0
;
j
<
forces
[
i
]
->
getNumParticleGroups
();
j
++
)
{
vector
<
int
>
particles
;
forces
[
i
]
->
getParticlesInGroup
(
j
,
particles
);
for
(
int
k
=
0
;
k
<
(
int
)
particles
.
size
();
k
++
)
for
(
int
m
=
0
;
m
<
(
int
)
particles
.
size
();
m
++
)
if
(
k
!=
m
)
atomBonds
[
particles
[
k
]].
push_back
(
particles
[
m
]);
}
}
// Now tag atoms by which molecule they belong to.
vector
<
int
>
atomMolecule
(
numAtoms
,
-
1
);
int
numMolecules
=
0
;
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
if
(
atomMolecule
[
i
]
==
-
1
)
tagAtomsInMolecule
(
i
,
numMolecules
++
,
atomMolecule
,
atomBonds
);
vector
<
vector
<
int
>
>
atomIndices
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
atomIndices
[
atomMolecule
[
i
]].
push_back
(
i
);
// Construct a description of each molecule.
molecules
.
resize
(
numMolecules
);
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
molecules
[
i
].
atoms
=
atomIndices
[
i
];
molecules
[
i
].
groups
.
resize
(
forces
.
size
());
}
for
(
int
i
=
0
;
i
<
system
.
getNumConstraints
();
i
++
)
{
int
particle1
,
particle2
;
double
distance
;
system
.
getConstraintParameters
(
i
,
particle1
,
particle2
,
distance
);
molecules
[
atomMolecule
[
particle1
]].
constraints
.
push_back
(
i
);
}
for
(
int
i
=
0
;
i
<
(
int
)
forces
.
size
();
i
++
)
for
(
int
j
=
0
;
j
<
forces
[
i
]
->
getNumParticleGroups
();
j
++
)
{
vector
<
int
>
particles
;
forces
[
i
]
->
getParticlesInGroup
(
j
,
particles
);
molecules
[
atomMolecule
[
particles
[
0
]]].
groups
[
i
].
push_back
(
j
);
}
}
// Sort them into groups of identical molecules.
vector
<
Molecule
>
uniqueMolecules
;
vector
<
vector
<
int
>
>
moleculeInstances
;
vector
<
vector
<
int
>
>
moleculeOffsets
;
for
(
int
molIndex
=
0
;
molIndex
<
(
int
)
molecules
.
size
();
molIndex
++
)
{
Molecule
&
mol
=
molecules
[
molIndex
];
// See if it is identical to another molecule.
bool
isNew
=
true
;
for
(
int
j
=
0
;
j
<
(
int
)
uniqueMolecules
.
size
()
&&
isNew
;
j
++
)
{
Molecule
&
mol2
=
uniqueMolecules
[
j
];
bool
identical
=
(
mol
.
atoms
.
size
()
==
mol2
.
atoms
.
size
()
&&
mol
.
constraints
.
size
()
==
mol2
.
constraints
.
size
());
// See if the atoms are identical.
int
atomOffset
=
mol2
.
atoms
[
0
]
-
mol
.
atoms
[
0
];
for
(
int
i
=
0
;
i
<
(
int
)
mol
.
atoms
.
size
()
&&
identical
;
i
++
)
{
if
(
mol
.
atoms
[
i
]
!=
mol2
.
atoms
[
i
]
-
atomOffset
||
system
.
getParticleMass
(
mol
.
atoms
[
i
])
!=
system
.
getParticleMass
(
mol2
.
atoms
[
i
]))
identical
=
false
;
for
(
int
k
=
0
;
k
<
(
int
)
forces
.
size
();
k
++
)
if
(
!
forces
[
k
]
->
areParticlesIdentical
(
mol
.
atoms
[
i
],
mol2
.
atoms
[
i
]))
identical
=
false
;
}
// See if the constraints are identical.
for
(
int
i
=
0
;
i
<
(
int
)
mol
.
constraints
.
size
()
&&
identical
;
i
++
)
{
int
c1particle1
,
c1particle2
,
c2particle1
,
c2particle2
;
double
distance1
,
distance2
;
system
.
getConstraintParameters
(
mol
.
constraints
[
i
],
c1particle1
,
c1particle2
,
distance1
);
system
.
getConstraintParameters
(
mol2
.
constraints
[
i
],
c2particle1
,
c2particle2
,
distance2
);
if
(
c1particle1
!=
c2particle1
-
atomOffset
||
c1particle2
!=
c2particle2
-
atomOffset
||
distance1
!=
distance2
)
identical
=
false
;
}
// See if the force groups are identical.
for
(
int
i
=
0
;
i
<
(
int
)
forces
.
size
()
&&
identical
;
i
++
)
{
if
(
mol
.
groups
[
i
].
size
()
!=
mol2
.
groups
[
i
].
size
())
identical
=
false
;
for
(
int
k
=
0
;
k
<
(
int
)
mol
.
groups
[
i
].
size
()
&&
identical
;
k
++
)
if
(
!
forces
[
i
]
->
areGroupsIdentical
(
mol
.
groups
[
i
][
k
],
mol2
.
groups
[
i
][
k
]))
identical
=
false
;
}
if
(
identical
)
{
moleculeInstances
[
j
].
push_back
(
molIndex
);
moleculeOffsets
[
j
].
push_back
(
mol
.
atoms
[
0
]);
isNew
=
false
;
}
}
if
(
isNew
)
{
uniqueMolecules
.
push_back
(
mol
);
moleculeInstances
.
push_back
(
vector
<
int
>
());
moleculeInstances
[
moleculeInstances
.
size
()
-
1
].
push_back
(
molIndex
);
moleculeOffsets
.
push_back
(
vector
<
int
>
());
moleculeOffsets
[
moleculeOffsets
.
size
()
-
1
].
push_back
(
mol
.
atoms
[
0
]);
}
}
moleculeGroups
.
resize
(
moleculeInstances
.
size
());
for
(
int
i
=
0
;
i
<
(
int
)
moleculeInstances
.
size
();
i
++
)
{
moleculeGroups
[
i
].
instances
=
moleculeInstances
[
i
];
moleculeGroups
[
i
].
offsets
=
moleculeOffsets
[
i
];
vector
<
int
>&
atoms
=
uniqueMolecules
[
i
].
atoms
;
moleculeGroups
[
i
].
atoms
.
resize
(
atoms
.
size
());
for
(
int
j
=
0
;
j
<
(
int
)
atoms
.
size
();
j
++
)
moleculeGroups
[
i
].
atoms
[
j
]
=
atoms
[
j
]
-
atoms
[
0
];
}
}
void
CudaContext
::
invalidateMolecules
()
{
moleculesInvalid
=
true
;
}
//void OpenCLContext::validateMolecules() {
// moleculesInvalid = false;
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
...
...
platforms/cuda2/src/CudaContext.h
View file @
3e16cab9
...
...
@@ -72,11 +72,11 @@ public:
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
std
::
string
&
precision
,
const
std
::
string
&
compiler
,
const
std
::
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
);
~
CudaContext
();
//
/**
//
* This is called to initialize internal data structures after all Forces in the system
//
* have been initialized.
//
*/
//
void initialize();
/**
* This is called to initialize internal data structures after all Forces in the system
* have been initialized.
*/
void
initialize
();
/**
* Add a CudaForce to this context.
*/
...
...
@@ -123,12 +123,12 @@ public:
// Return a reference to the array that the velm member points to.
CudaArray& getVelm() {
    CudaArray& velmArray = *velm;
    return velmArray;
}
//
/**
//
* Get the array which contains the force on each atom.
//
*/
//
CudaArray
<mm_float4>
& getForce() {
//
return *force;
//
}
/**
 * Get the array which contains the force on each atom (represented as a long3
 * in 64 bit fixed point).
 */
CudaArray& getForce() {
    CudaArray& forceArray = *force;
    return forceArray;
}
// /**
// * Get the array which contains the buffers in which forces are computed.
// */
...
...
@@ -184,36 +184,41 @@ public:
* omitted, a default set of options will be used
*/
CUmodule
createModule
(
const
std
::
string
source
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
defines
,
const
char
*
optimizationFlags
=
NULL
);
// /**
// * Execute a kernel.
// *
// * @param kernel the kernel to execute
// * @param workUnits the maximum number of work units that should be used
// * @param blockSize the size of each thread block to use
// */
// void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<float>& array);
// /**
// * Set all elements of an array to 0.
// */
// void clearBuffer(CudaArray<mm_float4>& array);
// /**
// * Set all elements of an array to 0.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void clearBuffer(cl::Memory& memory, int size);
// /**
// * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
// *
// * @param memory the Memory to clear
// * @param size the number of float elements in the buffer
// */
// void addAutoclearBuffer(cl::Memory& memory, int size);
/**
* Get a kernel from a CUDA module.
*
* @param module the module to get the kernel from
* @param name the name of the kernel to get
*/
CUfunction
getKernel
(
CUmodule
&
module
,
const
std
::
string
&
name
);
/**
* Execute a kernel.
*
* @param kernel the kernel to execute
* @param arguments an array of pointers to the kernel arguments
* @param workUnits the maximum number of threads that should be used
* @param blockSize the size of each thread block to use
* @param sharedSize the amount of dynamic shared memory to allocate for the kernel, in bytes
*/
void
executeKernel
(
CUfunction
kernel
,
void
**
arguments
,
int
workUnits
,
int
blockSize
=
-
1
,
unsigned
int
sharedSize
=
0
);
/**
* Set all elements of an array to 0.
*/
void
clearBuffer
(
CudaArray
&
array
);
/**
* Set all elements of an array to 0.
*
* @param memory the memory to clear
* @param size the number of 4-byte elements in the buffer
*/
void
clearBuffer
(
CUdeviceptr
memory
,
int
size
);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the memory to clear
* @param size the number of float/double elements in the buffer
*/
void
addAutoclearBuffer
(
CUdeviceptr
memory
,
int
size
);
// /**
// * Clear all buffers that have been registered with addAutoclearBuffer().
// */
...
...
@@ -230,108 +235,110 @@ public:
// * Sum the buffers containing forces.
// */
// void reduceForces();
// /**
// * Get the current simulation time.
// */
// double getTime() {
// return time;
// }
// /**
// * Set the current simulation time.
// */
// void setTime(double t) {
// time = t;
// }
// /**
// * Get the number of integration steps that have been taken.
// */
// int getStepCount() {
// return stepCount;
// }
// /**
// * Set the number of integration steps that have been taken.
// */
// void setStepCount(int steps) {
// stepCount = steps;
// }
// /**
// * Get the number of times forces or energy has been computed.
// */
// int getComputeForceCount() {
// return computeForceCount;
// }
// /**
// * Set the number of times forces or energy has been computed.
// */
// void setComputeForceCount(int count) {
// computeForceCount = count;
// }
// /**
// * Get the number of atoms.
// */
// int getNumAtoms() const {
// return numAtoms;
// }
// /**
// * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
// * most arrays with one element per atom.
// */
// int getPaddedNumAtoms() const {
// return paddedNumAtoms;
// }
// /**
// * Get the number of blocks of TileSize atoms.
// */
// int getNumAtomBlocks() const {
// return numAtomBlocks;
// }
// /**
// * Get the standard number of thread blocks to use when executing kernels.
// */
// int getNumThreadBlocks() const {
// return numThreadBlocks;
// }
// /**
// * Get the number of force buffers.
// */
// int getNumForceBuffers() const {
// return numForceBuffers;
// }
// /**
// * Get the SIMD width of the device being used.
// */
// int getSIMDWidth() const {
// return simdWidth;
// }
// /**
// * Get whether the device being used supports 64 bit atomic operations on global memory.
// */
// bool getSupports64BitGlobalAtomics() {
// return supports64BitGlobalAtomics;
// }
// /**
// * Get whether the device being used supports double precision math.
// */
// bool getSupportsDoublePrecision() {
// return supportsDoublePrecision;
// }
/**
* Get the current simulation time.
*/
double
getTime
()
{
return
time
;
}
/**
* Set the current simulation time.
*/
void
setTime
(
double
t
)
{
time
=
t
;
}
/**
* Get the number of integration steps that have been taken.
*/
int
getStepCount
()
{
return
stepCount
;
}
/**
* Set the number of integration steps that have been taken.
*/
void
setStepCount
(
int
steps
)
{
stepCount
=
steps
;
}
/**
* Get the number of times forces or energy has been computed.
*/
int
getComputeForceCount
()
{
return
computeForceCount
;
}
/**
* Set the number of times forces or energy has been computed.
*/
void
setComputeForceCount
(
int
count
)
{
computeForceCount
=
count
;
}
/**
* Get the number of atoms.
*/
int
getNumAtoms
()
const
{
return
numAtoms
;
}
/**
* Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
* most arrays with one element per atom.
*/
int
getPaddedNumAtoms
()
const
{
return
paddedNumAtoms
;
}
/**
* Get the number of blocks of TileSize atoms.
*/
int
getNumAtomBlocks
()
const
{
return
numAtomBlocks
;
}
/**
* Get the standard number of thread blocks to use when executing kernels.
*/
int
getNumThreadBlocks
()
const
{
return
numThreadBlocks
;
}
/**
* Get whether double precision is being used.
*/
bool
getUseDoublePrecision
()
{
return
useDoublePrecision
;
}
/**
* Get whether accumulation is being done in double precision.
*/
bool
getAccumulateInDouble
()
{
return
accumulateInDouble
;
}
/**
* Convert a number to a string in a format suitable for including in a kernel.
* This takes into account whether the context uses single or double precision.
*/
std
::
string
doubleToString
(
double
value
);
/**
* Convert a number to a string in a format suitable for including in a kernel.
*/
std
::
string
intToString
(
int
value
);
/**
* Convert a CUDA result code to the corresponding string description.
*/
std
::
string
getErrorString
(
CUresult
result
);
// /**
// * Get the size of the periodic box.
// */
//
mm_
float4 getPeriodicBoxSize() const {
// float4 getPeriodicBoxSize() const {
// return periodicBoxSize;
// }
// /**
// * Set the size of the periodic box.
// */
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// periodicBoxSize = m
m
_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = m
m
_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// periodicBoxSize = m
ake
_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = m
ake
_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// }
// /**
// * Get the inverse of the size of the periodic box.
// */
//
mm_
float4 getInvPeriodicBoxSize() const {
// float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize;
// }
// /**
...
...
@@ -352,66 +359,66 @@ public:
// CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded;
// }
//
/**
//
* Get the thread used by this context for executing parallel computations.
//
*/
//
WorkThread& getWorkThread() {
//
return *thread;
//
}
//
/**
//
* Get whether atoms were reordered during the most recent force/energy computation.
//
*/
//
bool getAtomsWereReordered() const {
//
return atomsWereReordered;
//
}
//
/**
//
* Set whether atoms were reordered during the most recent force/energy computation.
//
*/
//
void setAtomsWereReordered(bool wereReordered) {
//
atomsWereReordered = wereReordered;
//
}
//
/**
//
* Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
//
* together in the arrays.
//
*
//
* @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
//
*/
//
void reorderAtoms(bool enforcePeriodic);
//
/**
//
* Add a listener that should be called whenever atoms get reordered. The CudaContext
//
* assumes ownership of the object, and deletes it when the context itself is deleted.
//
*/
//
void addReorderListener(ReorderListener* listener);
//
/**
//
* Get the list of ReorderListeners.
//
*/
//
std::vector<ReorderListener*>& getReorderListeners() {
//
return reorderListeners;
//
}
//
/**
//
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
//
* This should be called whenever force field parameters change. It will cause the definitions
//
* and order to be revalidated the next to reorderAtoms() is called.
//
*/
//
void invalidateMolecules();
//
/**
//
* Get whether the current molecule definitions are valid.
//
*/
//
bool getMoleculesAreInvalid() {
//
return moleculesInvalid;
//
}
/**
 * Get the thread used by this context for executing parallel computations.
 *
 * @return a reference to the WorkThread that the thread member points to
 */
WorkThread& getWorkThread() {
    WorkThread& worker = *thread;
    return worker;
}
/**
* Get whether atoms were reordered during the most recent force/energy computation.
*/
bool
getAtomsWereReordered
()
const
{
return
atomsWereReordered
;
}
/**
* Set whether atoms were reordered during the most recent force/energy computation.
*/
void
setAtomsWereReordered
(
bool
wereReordered
)
{
atomsWereReordered
=
wereReordered
;
}
/**
* Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
* together in the arrays.
*
* @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
*/
void
reorderAtoms
(
bool
enforcePeriodic
);
/**
* Add a listener that should be called whenever atoms get reordered. The CudaContext
* assumes ownership of the object, and deletes it when the context itself is deleted.
*/
void
addReorderListener
(
ReorderListener
*
listener
);
/**
* Get the list of ReorderListeners.
*/
std
::
vector
<
ReorderListener
*>&
getReorderListeners
()
{
return
reorderListeners
;
}
/**
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
* This should be called whenever force field parameters change. It will cause the definitions
* and order to be revalidated the next time reorderAtoms() is called.
*/
void
invalidateMolecules
();
/**
* Get whether the current molecule definitions are valid.
*/
bool
getMoleculesAreInvalid
()
{
return
moleculesInvalid
;
}
private:
struct
Molecule
;
struct
MoleculeGroup
;
class
VirtualSiteInfo
;
//
void findMoleculeGroups();
//
static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
//
/**
//
* Ensure that all molecules marked as "identical" really are identical. This should be
//
* called whenever force field parameters change. If necessary, it will rebuild the list
//
* of molecules and resort the atoms.
//
*/
//
void validateMolecules();
void
findMoleculeGroups
();
static
void
tagAtomsInMolecule
(
int
atom
,
int
molecule
,
std
::
vector
<
int
>&
atomMolecule
,
std
::
vector
<
std
::
vector
<
int
>
>&
atomBonds
);
/**
* Ensure that all molecules marked as "identical" really are identical. This should be
* called whenever force field parameters change. If necessary, it will rebuild the list
* of molecules and resort the atoms.
*/
void
validateMolecules
();
static
bool
hasInitializedCuda
;
const
System
&
system
;
double
time
;
...
...
@@ -424,8 +431,6 @@ private:
int
paddedNumAtoms
;
int
numAtomBlocks
;
int
numThreadBlocks
;
// int numForceBuffers;
// int simdWidth;
bool
useBlockingSync
,
useDoublePrecision
,
accumulateInDouble
,
contextIsValid
,
atomsWereReordered
,
moleculesInvalid
;
std
::
string
compiler
,
tempDir
,
gpuArchitecture
;
float4
periodicBoxSize
;
...
...
@@ -446,15 +451,15 @@ private:
std
::
vector
<
Molecule
>
molecules
;
std
::
vector
<
MoleculeGroup
>
moleculeGroups
;
std
::
vector
<
int4
>
posCellOffsets
;
void
*
pinnedBuffer
;
CudaArray
*
posq
;
CudaArray
*
velm
;
// CudaArray<mm_float4>* force;
// CudaArray<mm_float4>* forceBuffers;
// CudaArray<cl_long>* longForceBuffer;
// CudaArray<cl_float>* energyBuffer;
// CudaArray<cl_int>* atomIndex;
// std::vector<cl::Memory*> autoclearBuffers;
// std::vector<int> autoclearBufferSizes;
CudaArray
*
force
;
CudaArray
*
energyBuffer
;
CudaArray
*
atomIndexDevice
;
std
::
vector
<
int
>
atomIndex
;
std
::
vector
<
CUdeviceptr
>
autoclearBuffers
;
std
::
vector
<
int
>
autoclearBufferSizes
;
std
::
vector
<
ReorderListener
*>
reorderListeners
;
// CudaIntegrationUtilities* integration;
// CudaBondedUtilities* bonded;
...
...
platforms/cuda2/src/CudaPlatform.cpp
View file @
3e16cab9
...
...
@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
device
<<
contexts
[
i
]
->
getDeviceIndex
();
}
propertyValues
[
CudaPlatform
::
CudaDeviceIndex
()]
=
device
.
str
();
propertyValues
[
CudaPlatform
::
CudaUseBlockingSync
()]
=
blocking
?
"true"
:
"false"
;
propertyValues
[
CudaPlatform
::
CudaPrecision
()]
=
precisionProperty
;
propertyValues
[
CudaPlatform
::
CudaCompiler
()]
=
compilerProperty
;
propertyValues
[
CudaPlatform
::
CudaTempDirectory
()]
=
tempProperty
;
...
...
@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() {
}
/**
 * Call initialize() on every context held by this PlatformData, in index order.
 *
 * @param system    not used by this function
 */
void CudaPlatform::PlatformData::initializeContexts(const System& system) {
    for (size_t index = 0; index != contexts.size(); ++index) {
        contexts[index]->initialize();
    }
}
/**
 * Call flush() on the work thread of every context held by this PlatformData,
 * in index order.
 */
void CudaPlatform::PlatformData::syncContexts() {
    for (size_t index = 0; index != contexts.size(); ++index) {
        contexts[index]->getWorkThread().flush();
    }
}
platforms/cuda2/src/CudaSort.cpp
0 → 100644
View file @
3e16cab9
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaSort.h"
#include "CudaKernelSources.h"
#include <map>
using
namespace
OpenMM
;
using
namespace
std
;
/**
 * Create a CudaSort object: build the sorting kernels for the data type described
 * by the trait, choose the thread block sizes used to launch them, and allocate the
 * workspace arrays.
 *
 * @param context    the context in which to perform calculations
 * @param trait      the SortTrait describing the data to sort.  Ownership passes to this
 *                   object: it is deleted by the destructor, or here if construction fails.
 * @param length     the length of the arrays this object will be used to sort
 * @throws OpenMMException if the trait reports a non-positive data size, if a device
 *                   attribute cannot be queried, or if the data type is too large for a
 *                   usable sort block size.  Exceptions thrown by kernel or array creation
 *                   are propagated after the resources acquired so far are released.
 */
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
    try {
        // The data size is used as a divisor below, so reject nonsensical values up front.

        const int dataSize = trait->getDataSize();
        if (dataSize < 1)
            throw OpenMMException("CudaSort: the size of the data type must be positive");

        // Create kernels.

        map<string, string> replacements;
        replacements["DATA_TYPE"] = trait->getDataType();
        replacements["KEY_TYPE"] = trait->getKeyType();
        replacements["SORT_KEY"] = trait->getSortKey();
        replacements["MIN_KEY"] = trait->getMinKey();
        replacements["MAX_KEY"] = trait->getMaxKey();
        replacements["MAX_VALUE"] = trait->getMaxValue();
        CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
        computeRangeKernel = context.getKernel(module, "computeRange");
        assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
        computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
        copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
        sortBucketsKernel = context.getKernel(module, "sortBuckets");

        // Work out the work group sizes for various kernels.  The query results are
        // checked so that the sizes are never derived from uninitialized values.

        int maxBlockSize;
        CUresult result = cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
        if (result != CUDA_SUCCESS)
            throw OpenMMException("CudaSort: error querying the maximum block size: "+context.getErrorString(result));

        // Find the largest power of two that does not exceed the maximum block size.

        for (rangeKernelSize = 1; rangeKernelSize*2 <= (unsigned int) maxBlockSize; rangeKernelSize *= 2)
            ;
        positionsKernelSize = rangeKernelSize;
        sortKernelSize = rangeKernelSize/2;
        if (rangeKernelSize > length)
            rangeKernelSize = length;
        int maxSharedMem;
        result = cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
        if (result != CUDA_SUCCESS)
            throw OpenMMException("CudaSort: error querying the shared memory size: "+context.getErrorString(result));
        unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/dataSize)/2);
        if (sortKernelSize > maxLocalBuffer)
            sortKernelSize = maxLocalBuffer;

        // A sort block size below 2 would make the target bucket size zero, which is
        // used as a divisor on the next lines (and the block size itself in sort()).

        if (sortKernelSize < 2)
            throw OpenMMException("CudaSort: the data type is too large to sort with the available shared memory");
        unsigned int targetBucketSize = sortKernelSize/2;
        unsigned int numBuckets = length/targetBucketSize;
        if (numBuckets < 1)
            numBuckets = 1;
        if (positionsKernelSize > numBuckets)
            positionsKernelSize = numBuckets;

        // Create workspace arrays.

        dataRange = new CudaArray(2, trait->getKeySize(), "sortDataRange");
        bucketOffset = CudaArray::create<uint1>(numBuckets, "bucketOffset");
        bucketOfElement = CudaArray::create<uint1>(length, "bucketOfElement");
        offsetInBucket = CudaArray::create<uint1>(length, "offsetInBucket");
        buckets = new CudaArray(length, trait->getDataSize(), "buckets");
    }
    catch (...) {
        // The destructor does not run for a partially constructed object, so release
        // whatever was acquired (including the trait this object took ownership of).

        delete dataRange;
        delete bucketOfElement;
        delete offsetInBucket;
        delete bucketOffset;
        delete buckets;
        delete trait;
        throw;
    }
}
/**
 * Release the trait and all workspace arrays owned by this object.
 */
CudaSort::~CudaSort() {
    // Deleting a null pointer is a no-op, so no explicit NULL checks are required.
    delete trait;
    delete dataRange;
    delete bucketOfElement;
    delete offsetInBucket;
    delete bucketOffset;
    delete buckets;
}
/**
 * Sort an array in place on the device by launching the five sorting kernels in sequence.
 *
 * @param data    the array to sort.  Its size must equal the length given to the
 *                constructor and its element size must equal the trait's data size.
 * @throws OpenMMException if the size or element size of data does not match
 */
void CudaSort::sort(CudaArray& data) {
    const bool compatible = (data.getSize() == bucketOfElement->getSize() && data.getElementSize() == trait->getDataSize());
    if (!compatible)
        throw OpenMMException("CudaSort called with different data size");
    if (data.getSize() == 0)
        return;

    // Kernel arguments are passed as pointers to the values, so bind references to the
    // device pointers once and take their addresses below.

    CUdeviceptr& dataPtr = data.getDevicePointer();
    CUdeviceptr& rangePtr = dataRange->getDevicePointer();
    CUdeviceptr& bucketOffsetPtr = bucketOffset->getDevicePointer();
    CUdeviceptr& bucketOfElementPtr = bucketOfElement->getDevicePointer();
    CUdeviceptr& offsetInBucketPtr = offsetInBucket->getDevicePointer();
    CUdeviceptr& bucketsPtr = buckets->getDevicePointer();
    unsigned int dataSize = data.getSize();

    // Step 1: compute the range of data values (single block).

    void* rangeArgs[] = {&dataPtr, &dataSize, &rangePtr};
    context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());

    // Step 2: assign array elements to buckets.  The per-bucket counters must start at zero.

    unsigned int numBuckets = bucketOffset->getSize();
    context.clearBuffer(*bucketOffset);
    void* elementsArgs[] = {&dataPtr, &dataSize, &numBuckets, &rangePtr, &bucketOffsetPtr, &bucketOfElementPtr, &offsetInBucketPtr};
    context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());

    // Step 3: compute the position of each bucket (single block).

    void* computeArgs[] = {&numBuckets, &bucketOffsetPtr};
    context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));

    // Step 4: copy the data into the buckets.

    void* copyArgs[] = {&dataPtr, &bucketsPtr, &dataSize, &bucketOffsetPtr, &bucketOfElementPtr, &offsetInBucketPtr};
    context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());

    // Step 5: sort each bucket, using a thread count rounded up to a whole number of blocks.

    void* sortArgs[] = {&dataPtr, &bucketsPtr, &numBuckets, &bucketOffsetPtr};
    context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
platforms/cuda2/src/CudaSort.h
0 → 100644
View file @
3e16cab9
#ifndef __OPENMM_CUDASORT_H__
#define __OPENMM_CUDASORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "openmm/internal/windowsExport.h"
#include "CudaContext.h"
namespace
OpenMM
{
/**
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The sorting behavior is specified by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* class SortTrait : public CudaSort::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
* const char* getKeyType() const {return "float";}
* const char* getMinKey() const {return "-MAXFLOAT";}
* const char* getMaxKey() const {return "MAXFLOAT";}
* const char* getMaxValue() const {return "MAXFLOAT";}
* const char* getSortKey() const {return "value";}
* };
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in local memory when possible, in global memory otherwise). This is similar to
* the algorithm described in
*
* Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng. "An Efficient
* Sorting Algorithm with CUDA" Journal of the Chinese Institute of Engineers, 32(7),
* pp. 915-921 (2009)
*
* but with many modifications and simplifications. In particular, this algorithm
* involves much less communication between host and device, which is critical to get
* good performance with the array sizes we typically work with (10,000 to 100,000
* elements).
*/
class
OPENMM_EXPORT
CudaSort
{
public:
class
SortTrait
;
/**
* Create a CudaSort object for sorting data of a particular type.
*
* @param context the context in which to perform calculations
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the CudaSort is deleted.
* @param length the length of the arrays this object will be used to sort
*/
CudaSort
(
CudaContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
);
~
CudaSort
();
/**
* Sort an array.
*/
void
sort
(
CudaArray
&
data
);
private:
CudaContext
&
context
;
SortTrait
*
trait
;
CudaArray
*
dataRange
;
CudaArray
*
bucketOfElement
;
CudaArray
*
offsetInBucket
;
CudaArray
*
bucketOffset
;
CudaArray
*
buckets
;
CUfunction
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
};
/**
* A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
*/
class CudaSort::SortTrait {
public:
    /**
     * Virtual destructor.  CudaSort deletes its trait through a SortTrait pointer, which
     * is only well defined for a subclass instance if the base destructor is virtual.
     */
    virtual ~SortTrait() {
    }
    /**
     * Get the size of each data value in bytes.
     */
    virtual int getDataSize() const = 0;
    /**
     * Get the size of each key value in bytes.
     */
    virtual int getKeySize() const = 0;
    /**
     * Get the data type of the values to sort.
     */
    virtual const char* getDataType() const = 0;
    /**
     * Get the data type of the sorting key.
     */
    virtual const char* getKeyType() const = 0;
    /**
     * Get the minimum value a key can take.
     */
    virtual const char* getMinKey() const = 0;
    /**
     * Get the maximum value a key can take.
     */
    virtual const char* getMaxKey() const = 0;
    /**
     * Get a value whose key is guaranteed to equal getMaxKey().
     */
    virtual const char* getMaxValue() const = 0;
    /**
     * Get the CUDA code to select the key from the data value.
     */
    virtual const char* getSortKey() const = 0;
};
}
// namespace OpenMM
#endif // __OPENMM_CUDASORT_H__
platforms/cuda2/src/kernels/sort.cu
0 → 100644
View file @
3e16cab9
/**
 * Extract the sorting key from a data value.  SORT_KEY is substituted text supplied
 * by the host before compilation; the parameter must keep the name "value" because
 * that substituted expression may refer to it.
 */
__device__ KEY_TYPE getValue(DATA_TYPE value) {
    return SORT_KEY;
}
extern
"C"
{
/**
 * Calculate the minimum and maximum key in the array to be sorted.  This kernel
 * is executed as a single thread block, and indexes its dynamic shared memory
 * buffer with one key per thread.
 *
 * @param data      the values whose key range is computed
 * @param length    the number of elements in data
 * @param range     on exit, range[0] holds the minimum key and range[1] the maximum key
 */
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range) {
    extern __shared__ KEY_TYPE rangeBuffer[];
    const unsigned int thread = threadIdx.x;
    const unsigned int numThreads = blockDim.x;
    KEY_TYPE minimum = MAX_KEY;
    KEY_TYPE maximum = MIN_KEY;

    // Each thread calculates the range of a subset of values.

    for (unsigned int index = thread; index < length; index += numThreads) {
        KEY_TYPE value = getValue(data[index]);
        minimum = min(minimum, value);
        maximum = max(maximum, value);
    }

    // Reduce the per-thread minima into rangeBuffer[0].

    rangeBuffer[thread] = minimum;
    __syncthreads();
    for (unsigned int step = 1; step < numThreads; step *= 2) {
        if (thread+step < numThreads && thread%(2*step) == 0)
            rangeBuffer[thread] = min(rangeBuffer[thread], rangeBuffer[thread+step]);
        __syncthreads();
    }

    // Only thread 0 writes the final result, so only thread 0 reads the reduced value.
    // Thread 0 is also the only thread that overwrites rangeBuffer[0] below, so no other
    // thread can race with this read.

    if (thread == 0)
        minimum = rangeBuffer[0];

    // Reduce the per-thread maxima into rangeBuffer[0].

    rangeBuffer[thread] = maximum;
    __syncthreads();
    for (unsigned int step = 1; step < numThreads; step *= 2) {
        if (thread+step < numThreads && thread%(2*step) == 0)
            rangeBuffer[thread] = max(rangeBuffer[thread], rangeBuffer[thread+step]);
        __syncthreads();
    }
    if (thread == 0) {
        range[0] = minimum;
        range[1] = rangeBuffer[0];
    }
}
/**
 * Assign elements to buckets.  The key range [range[0], range[1]] is divided into
 * numBuckets intervals of equal width, and each element goes to the interval holding
 * its key (clamped to the last bucket).
 *
 * @param data               the values being sorted
 * @param length             the number of elements in data
 * @param numBuckets         the number of buckets
 * @param range              range[0] is the minimum key and range[1] the maximum key
 * @param bucketOffset       per-bucket counters, incremented atomically once per element
 * @param bucketOfElement    on exit, the bucket index chosen for each element
 * @param offsetInBucket     on exit, the counter value returned by the atomic increment
 *                           for each element
 */
__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
        unsigned int* bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
    const float minValue = (float) (range[0]);
    const float maxValue = (float) (range[1]);
    const float bucketWidth = (maxValue-minValue)/numBuckets;
    const unsigned int lastBucket = numBuckets-1;
    const unsigned int stride = blockDim.x*gridDim.x;
    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += stride) {
        const float key = (float) getValue(data[index]);
        const unsigned int rawBucket = (unsigned int) ((key-minValue)/bucketWidth);
        const unsigned int bucketIndex = min(rawBucket, lastBucket);
        offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
        bucketOfElement[index] = bucketIndex;
    }
}
/**
 * Sum the bucket sizes to compute the start position of each bucket.  This kernel
 * is executed as a single thread block, and indexes its dynamic shared memory
 * buffer with one unsigned int per thread.
 *
 * @param numBuckets      the number of buckets
 * @param bucketOffset    on entry, the number of elements in each bucket; on exit,
 *                        the inclusive running total, so entry i holds the end
 *                        position of bucket i
 */
__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
    extern __shared__ unsigned int posBuffer[];
    unsigned int globalOffset = 0;

    // The buckets are processed in chunks of blockDim.x.  The loop condition is the same
    // for every thread, so all threads reach every barrier below.

    for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
        // Load the bucket sizes into shared memory.

        unsigned int globalIndex = startBucket+threadIdx.x;
        posBuffer[threadIdx.x] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
        __syncthreads();

        // Perform a parallel prefix sum.

        for (unsigned int step = 1; step < blockDim.x; step *= 2) {
            unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
            __syncthreads();
            posBuffer[threadIdx.x] += add;
            __syncthreads();
        }

        // Write the results back to global memory.

        if (globalIndex < numBuckets)
            bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;
        globalOffset += posBuffer[blockDim.x-1];

        // Every thread must finish reading this chunk's total before any thread
        // overwrites posBuffer with the sizes of the next chunk.

        __syncthreads();
    }
}
/**
 * Copy the input data into the buckets for sorting.
 *
 * Each element is written to the start position of its bucket plus its offset
 * within that bucket.  The start of bucket b is read from bucketOffset[b-1], and
 * is 0 for the first bucket.
 */
__global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* __restrict__ buckets, unsigned int length, const unsigned int* __restrict__ bucketOffset,
        const unsigned int* __restrict__ bucketOfElement, const unsigned int* __restrict__ offsetInBucket) {
    const unsigned int stride = blockDim.x*gridDim.x;
    for (unsigned int i = blockDim.x*blockIdx.x+threadIdx.x; i < length; i += stride) {
        const DATA_TYPE value = data[i];
        const unsigned int bucket = bucketOfElement[i];
        unsigned int bucketStart = 0;
        if (bucket != 0)
            bucketStart = bucketOffset[bucket-1];
        buckets[bucketStart+offsetInBucket[i]] = value;
    }
}
/**
 * Sort the data in each bucket.
 *
 * Each thread block handles the buckets blockIdx.x, blockIdx.x+gridDim.x, and so
 * on.  Bucket i covers the positions from bucketOffset[i-1] (0 for the first
 * bucket) up to, but not including, bucketOffset[i].  The unsorted contents are
 * read from "buckets" and the sorted contents are written to the same positions
 * of "data".
 *
 * A bucket with at most blockDim.x elements is sorted in the shared array
 * dataBuffer, which is indexed by threadIdx.x and so needs at least blockDim.x
 * entries.  Larger buckets are copied to "data" and sorted there in place.
 *
 * NOTE(review): the shared memory path doubles k from 2 up to blockDim.x, which
 * presumably requires blockDim.x to be a power of two -- confirm against the
 * code that launches this kernel.
 */
__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) {
    extern __shared__ DATA_TYPE dataBuffer[];
    for (unsigned int index = blockIdx.x; index < numBuckets; index += gridDim.x) {
        unsigned int startIndex = (index == 0 ? 0 : bucketOffset[index-1]);
        unsigned int endIndex = bucketOffset[index];
        unsigned int length = endIndex-startIndex;
        if (length <= blockDim.x) {
            // Load the data into shared memory.  Slots beyond the end of the bucket
            // are filled with MAX_VALUE (presumably an element that sorts after all
            // real data, so the padding ends up at the end).

            if (threadIdx.x < length)
                dataBuffer[threadIdx.x] = buckets[startIndex+threadIdx.x];
            else
                dataBuffer[threadIdx.x] = MAX_VALUE;
            __syncthreads();

            // Perform a bitonic sort in shared memory.  k is the size of the
            // sequences being merged and j is the distance between the two elements
            // compared by a thread.

            for (unsigned int k = 2; k <= blockDim.x; k *= 2) {
                for (unsigned int j = k/2; j > 0; j /= 2) {
                    int ixj = threadIdx.x^j;

                    // Only the lower indexed thread of each pair does the compare
                    // and swap, so every pair is handled exactly once.

                    if (ixj > threadIdx.x) {
                        DATA_TYPE value1 = dataBuffer[threadIdx.x];
                        DATA_TYPE value2 = dataBuffer[ixj];
                        bool ascending = (threadIdx.x&k) == 0;
                        KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
                        KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
                        if (lowKey > highKey) {
                            dataBuffer[threadIdx.x] = value2;
                            dataBuffer[ixj] = value1;
                        }
                    }

                    // All swaps of this step must finish before the next step reads
                    // the buffer.

                    __syncthreads();
                }
            }

            // Write the data to the sorted array.

            if (threadIdx.x < length)
                data[startIndex+threadIdx.x] = dataBuffer[threadIdx.x];
        }
        else {
            // Copy the bucket data over to the output array.

            for (unsigned int i = threadIdx.x; i < length; i += blockDim.x)
                data[startIndex+i] = buckets[startIndex+i];
            __threadfence_block();
            __syncthreads();

            // Perform a bitonic sort directly in the output array.  Each thread
            // handles the elements threadIdx.x, threadIdx.x+blockDim.x, and so on.

            for (unsigned int k = 2; k < 2*length; k *= 2) {
                for (unsigned int j = k/2; j > 0; j /= 2) {
                    for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
                        int ixj = i^j;

                        // Skip pairs whose partner lies beyond the end of the bucket.

                        if (ixj > i && ixj < length) {
                            DATA_TYPE value1 = data[startIndex+i];
                            DATA_TYPE value2 = data[startIndex+ixj];
                            bool ascending = ((i&k) == 0);

                            // Flip the direction once for every higher power of two
                            // below 2*length whose bit is clear in i.
                            // NOTE(review): presumably this is what lets the sort
                            // handle lengths that are not powers of two -- confirm.

                            for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
                                ascending = ((i&mask) == 0 ? !ascending : ascending);
                            KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
                            KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
                            if (lowKey > highKey) {
                                data[startIndex+i] = value2;
                                data[startIndex+ixj] = value1;
                            }
                        }
                    }

                    // Make this step's writes visible to the rest of the block before
                    // the next step begins.

                    __threadfence_block();
                    __syncthreads();
                }
            }
        }
    }
}
}
\ No newline at end of file
platforms/cuda2/src/kernels/utilities.cu
View file @
3e16cab9
extern
"C"
{
/**
* This is called by the various functions below to clear a buffer.
*/
...
...
@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest
buffer
[
index
]
=
sum
;
}
}
}
\ No newline at end of file
platforms/cuda2/tests/TestCudaSort.cpp
0 → 100644
View file @
3e16cab9
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of sorting.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "../src/CudaArray.h"
#include "../src/CudaContext.h"
#include "../src/CudaSort.h"
#include "sfmt/SFMT.h"
#include "openmm/System.h"
#include <iostream>
#include <cmath>
#include <set>
using
namespace
OpenMM
;
using
namespace
std
;
/**
 * The trait handed to CudaSort by these tests.  It reports 4 byte "float"
 * elements with 4 byte "float" keys, bounded by -MAXFLOAT and MAXFLOAT, and
 * gives "value" as the sort key.
 * NOTE(review): how CudaSort consumes these strings is not visible here;
 * presumably they are substituted into the sort kernel source -- confirm
 * against CudaSort.
 */
class SortTrait : public CudaSort::SortTrait {
    // Sizes, in bytes, of one element and of one key.
    int getDataSize() const {
        return 4;
    }
    int getKeySize() const {
        return 4;
    }

    // Type names of the elements and of the keys.
    const char* getDataType() const {
        return "float";
    }
    const char* getKeyType() const {
        return "float";
    }

    // Bounds on the keys, and the largest element value.
    const char* getMinKey() const {
        return "-MAXFLOAT";
    }
    const char* getMaxKey() const {
        return "MAXFLOAT";
    }
    const char* getMaxValue() const {
        return "MAXFLOAT";
    }

    // The key of an element.
    const char* getSortKey() const {
        return "value";
    }
};
/**
 * Upload the given values to the device, sort them with CudaSort, download the
 * result, and assert that it is in nondecreasing order and holds exactly the
 * same values as the input.
 */
void verifySorting(vector<float> array) {
    // Create a context to run the sort in.  The System only needs to exist, so it
    // gets a single particle.

    System system;
    system.addParticle(0.0);
    CudaPlatform platform;
    CudaPlatform::PlatformData platformData(system, "", "true", "single",
            platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler()),
            platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory()));
    CudaContext& context = *platformData.contexts[0];
    context.initialize();

    // Sort the values on the device and fetch the result.

    CudaArray deviceData(array.size(), 4, "sortData");
    deviceData.upload(array);
    CudaSort sorter(context, new SortTrait(), array.size());
    sorter.sort(deviceData);
    vector<float> sorted;
    deviceData.download(sorted);

    // Every value must be at least as large as the one before it.

    const int numSorted = (int) sorted.size();
    int i = 1;
    while (i < numSorted) {
        ASSERT(sorted[i-1] <= sorted[i]);
        i++;
    }

    // The result must be a permutation of the input, duplicates included.

    multiset<float> elements1;
    elements1.insert(array.begin(), array.end());
    multiset<float> elements2;
    elements2.insert(sorted.begin(), sorted.end());
    ASSERT(elements1 == elements2);
}
void
testUniformValues
()
{
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
vector
<
float
>
array
(
10000
);
for
(
int
i
=
0
;
i
<
(
int
)
array
.
size
();
i
++
)
array
[
i
]
=
(
float
)
genrand_real2
(
sfmt
);
verifySorting
(
array
);
}
void
testLogValues
()
{
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
vector
<
float
>
array
(
10000
);
for
(
int
i
=
0
;
i
<
(
int
)
array
.
size
();
i
++
)
array
[
i
]
=
(
float
)
log
(
genrand_real2
(
sfmt
));
verifySorting
(
array
);
}
/**
 * Run every sorting test.  Prints "Done" and returns 0 when they all pass; if
 * one throws, prints the exception message and returns 1.
 */
int main() {
    int status = 0;
    try {
        testUniformValues();
        testLogValues();
    }
    catch (const exception& e) {
        cout << "exception: " << e.what() << endl;
        status = 1;
    }
    if (status == 0)
        cout << "Done" << endl;
    return status;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment