Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
3e16cab9
Commit
3e16cab9
authored
Jun 05, 2012
by
Peter Eastman
Browse files
Continuing to implement new CUDA platform
parent
abb8cb4b
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
1158 additions
and
509 deletions
+1158
-509
platforms/cuda2/src/CudaArray.h
platforms/cuda2/src/CudaArray.h
+1
-1
platforms/cuda2/src/CudaContext.cpp
platforms/cuda2/src/CudaContext.cpp
+367
-309
platforms/cuda2/src/CudaContext.h
platforms/cuda2/src/CudaContext.h
+200
-195
platforms/cuda2/src/CudaPlatform.cpp
platforms/cuda2/src/CudaPlatform.cpp
+5
-4
platforms/cuda2/src/CudaSort.cpp
platforms/cuda2/src/CudaSort.cpp
+132
-0
platforms/cuda2/src/CudaSort.h
platforms/cuda2/src/CudaSort.h
+141
-0
platforms/cuda2/src/kernels/sort.cu
platforms/cuda2/src/kernels/sort.cu
+186
-0
platforms/cuda2/src/kernels/utilities.cu
platforms/cuda2/src/kernels/utilities.cu
+4
-0
platforms/cuda2/tests/TestCudaSort.cpp
platforms/cuda2/tests/TestCudaSort.cpp
+122
-0
No files found.
platforms/cuda2/src/CudaArray.h
View file @
3e16cab9
...
@@ -83,7 +83,7 @@ public:
...
@@ -83,7 +83,7 @@ public:
/**
/**
* Get a pointer to the device memory.
* Get a pointer to the device memory.
*/
*/
CUdeviceptr
getDevicePointer
()
{
CUdeviceptr
&
getDevicePointer
()
{
return
pointer
;
return
pointer
;
}
}
/**
/**
...
...
platforms/cuda2/src/CudaContext.cpp
View file @
3e16cab9
...
@@ -31,7 +31,6 @@
...
@@ -31,7 +31,6 @@
#include "CudaContext.h"
#include "CudaContext.h"
#include "CudaArray.h"
#include "CudaArray.h"
//#include "CudaBondedUtilities.h"
//#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaForceInfo.h"
#include "CudaForceInfo.h"
//#include "CudaIntegrationUtilities.h"
//#include "CudaIntegrationUtilities.h"
#include "CudaKernelSources.h"
#include "CudaKernelSources.h"
...
@@ -53,7 +52,7 @@
...
@@ -53,7 +52,7 @@
#define CHECK_RESULT2(result, prefix) \
#define CHECK_RESULT2(result, prefix) \
if (result != CUDA_SUCCESS) { \
if (result != CUDA_SUCCESS) { \
std::stringstream m; \
std::stringstream m; \
m<<prefix<<": "<<result<<" ("<<__FILE__<<":
"<<__LINE__
<<")"
; \
m<<prefix<<": "<<
getErrorString(
result
)
<<" (
"<<result<<")"<<" at
"<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
throw OpenMMException(m.str());\
}
}
...
@@ -66,7 +65,7 @@ bool CudaContext::hasInitializedCuda = false;
...
@@ -66,7 +65,7 @@ bool CudaContext::hasInitializedCuda = false;
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
CudaContext
::
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
string
&
precision
,
const
string
&
compiler
,
const
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
)
:
system
(
system
),
compiler
(
compiler
),
const
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
)
:
system
(
system
),
compiler
(
compiler
),
time
(
0.0
),
platformData
(
platformData
),
stepCount
(
0
),
computeForceCount
(
0
),
contextIsValid
(
false
),
atomsWereReordered
(
false
),
posq
(
NULL
),
time
(
0.0
),
platformData
(
platformData
),
stepCount
(
0
),
computeForceCount
(
0
),
contextIsValid
(
false
),
atomsWereReordered
(
false
),
pinnedBuffer
(
NULL
),
posq
(
NULL
),
velm
(
NULL
),
/*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
velm
(
NULL
),
/*forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
bonded(NULL), nonbonded(NULL),*/
thread
(
NULL
)
{
bonded(NULL), nonbonded(NULL),*/
thread
(
NULL
)
{
if
(
!
hasInitializedCuda
)
{
if
(
!
hasInitializedCuda
)
{
...
@@ -88,7 +87,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
...
@@ -88,7 +87,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
else
else
throw
OpenMMException
(
"Illegal value for CudaPrecision: "
+
precision
);
throw
OpenMMException
(
"Illegal value for CudaPrecision: "
+
precision
);
#ifdef WIN32
#ifdef WIN32
this
->
tempDir
=
tempDir
+
"
\"
;
this
->
tempDir
=
tempDir
+
"
\
\
"
;
#else
#else
this
->
tempDir
=
tempDir
+
"/"
;
this
->
tempDir
=
tempDir
+
"/"
;
#endif
#endif
...
@@ -114,6 +113,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
...
@@ -114,6 +113,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
deviceIndex
=
i
;
deviceIndex
=
i
;
bestSpeed
=
speed
;
bestSpeed
=
speed
;
bestCompute
=
major
;
bestCompute
=
major
;
gpuArchitecture
=
intToString
(
major
)
+
intToString
(
minor
);
}
}
}
}
}
}
...
@@ -121,37 +121,47 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
...
@@ -121,37 +121,47 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
throw
OpenMMException
(
"No compatible CUDA device is available"
);
throw
OpenMMException
(
"No compatible CUDA device is available"
);
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
deviceIndex
));
CHECK_RESULT
(
cuDeviceGet
(
&
device
,
deviceIndex
));
this
->
deviceIndex
=
deviceIndex
;
this
->
deviceIndex
=
deviceIndex
;
int major, minor;
compilationDefines
[
"WORK_GROUP_SIZE"
]
=
intToString
(
ThreadBlockSize
);
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
gpuArchitecture = CudaExpressionUtilities::intToString(major)+CudaExpressionUtilities::intToString(minor);
compilationDefines["
WORK_GROUP_SIZE
"] = CudaExpressionUtilities::intToString(ThreadBlockSize);
defaultOptimizationOptions
=
"--use_fast_math"
;
defaultOptimizationOptions
=
"--use_fast_math"
;
int numThreadBlocksPerComputeUnit = 6;
unsigned
int
flags
=
CU_CTX_MAP_HOST
;
CHECK_RESULT(cuCtxCreate(&context, 0, device));
if
(
useBlockingSync
)
flags
+=
CU_CTX_SCHED_BLOCKING_SYNC
;
else
flags
+=
CU_CTX_SCHED_SPIN
;
CHECK_RESULT
(
cuCtxCreate
(
&
context
,
flags
,
device
));
contextIsValid
=
true
;
contextIsValid
=
true
;
numAtoms
=
system
.
getNumParticles
();
numAtoms
=
system
.
getNumParticles
();
paddedNumAtoms
=
TileSize
*
((
numAtoms
+
TileSize
-
1
)
/
TileSize
);
paddedNumAtoms
=
TileSize
*
((
numAtoms
+
TileSize
-
1
)
/
TileSize
);
numAtomBlocks
=
(
paddedNumAtoms
+
(
TileSize
-
1
))
/
TileSize
;
numAtomBlocks
=
(
paddedNumAtoms
+
(
TileSize
-
1
))
/
TileSize
;
int
multiprocessors
;
int
multiprocessors
;
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
device
));
CHECK_RESULT
(
cuDeviceGetAttribute
(
&
multiprocessors
,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
,
device
));
int
numThreadBlocksPerComputeUnit
=
6
;
numThreadBlocks
=
numThreadBlocksPerComputeUnit
*
multiprocessors
;
numThreadBlocks
=
numThreadBlocksPerComputeUnit
*
multiprocessors
;
// bonded = new CudaBondedUtilities(*this);
// bonded = new CudaBondedUtilities(*this);
// nonbonded = new CudaNonbondedUtilities(*this);
// nonbonded = new CudaNonbondedUtilities(*this);
if
(
useDoublePrecision
)
{
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
paddedNumAtoms
*
sizeof
(
double4
),
0
));
posq
=
CudaArray
::
create
<
double4
>
(
paddedNumAtoms
,
"posq"
);
velm
=
CudaArray
::
create
<
double4
>
(
paddedNumAtoms
,
"velm"
);
}
else
{
CHECK_RESULT
(
cuMemHostAlloc
(
&
pinnedBuffer
,
paddedNumAtoms
*
sizeof
(
float4
),
0
));
posq
=
CudaArray
::
create
<
float4
>
(
paddedNumAtoms
,
"posq"
);
posq
=
CudaArray
::
create
<
float4
>
(
paddedNumAtoms
,
"posq"
);
velm
=
CudaArray
::
create
<
float4
>
(
paddedNumAtoms
,
"velm"
);
velm
=
CudaArray
::
create
<
float4
>
(
paddedNumAtoms
,
"velm"
);
}
posCellOffsets
.
resize
(
paddedNumAtoms
,
make_int4
(
0
,
0
,
0
,
0
));
posCellOffsets
.
resize
(
paddedNumAtoms
,
make_int4
(
0
,
0
,
0
,
0
));
// Create utility kernels that are used in multiple places.
// Create utility kernels that are used in multiple places.
CUmodule
utilities
=
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
utilities
);
CUmodule
utilities
=
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
utilities
);
cuModuleGetFunction(&
clearBufferKernel
,
utilities, "
clearBuffer
");
clearBufferKernel
=
getKernel
(
utilities
,
"clearBuffer"
);
cuModuleGetFunction(&
clearTwoBuffersKernel
,
utilities, "
clearTwoBuffers
");
clearTwoBuffersKernel
=
getKernel
(
utilities
,
"clearTwoBuffers"
);
cuModuleGetFunction(&
clearThreeBuffersKernel
,
utilities, "
clearThreeBuffers
");
clearThreeBuffersKernel
=
getKernel
(
utilities
,
"clearThreeBuffers"
);
cuModuleGetFunction(&
clearFourBuffersKernel
,
utilities, "
clearFourBuffers
");
clearFourBuffersKernel
=
getKernel
(
utilities
,
"clearFourBuffers"
);
cuModuleGetFunction(&
clearFiveBuffersKernel
,
utilities, "
clearFiveBuffers
");
clearFiveBuffersKernel
=
getKernel
(
utilities
,
"clearFiveBuffers"
);
cuModuleGetFunction(&
clearSixBuffersKernel
,
utilities, "
clearSixBuffers
");
clearSixBuffersKernel
=
getKernel
(
utilities
,
"clearSixBuffers"
);
cuModuleGetFunction(&
reduceFloat4Kernel
,
utilities, "
reduceFloat4Buffer
");
reduceFloat4Kernel
=
getKernel
(
utilities
,
"reduceFloat4Buffer"
);
cuModuleGetFunction(&
reduceForcesKernel
,
utilities, "
reduceForces
");
reduceForcesKernel
=
getKernel
(
utilities
,
"reduceForces"
);
// Set defines based on the requested precision.
// Set defines based on the requested precision.
...
@@ -175,6 +185,8 @@ CudaContext::~CudaContext() {
...
@@ -175,6 +185,8 @@ CudaContext::~CudaContext() {
delete
forces
[
i
];
delete
forces
[
i
];
for
(
int
i
=
0
;
i
<
(
int
)
reorderListeners
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
(
int
)
reorderListeners
.
size
();
i
++
)
delete
reorderListeners
[
i
];
delete
reorderListeners
[
i
];
if
(
pinnedBuffer
!=
NULL
)
cuMemFreeHost
(
pinnedBuffer
);
if
(
posq
!=
NULL
)
if
(
posq
!=
NULL
)
delete
posq
;
delete
posq
;
if
(
velm
!=
NULL
)
if
(
velm
!=
NULL
)
...
@@ -202,38 +214,29 @@ CudaContext::~CudaContext() {
...
@@ -202,38 +214,29 @@ CudaContext::~CudaContext() {
CHECK_RESULT
(
cuCtxDestroy
(
context
));
CHECK_RESULT
(
cuCtxDestroy
(
context
));
}
}
//void CudaContext::initialize() {
void
CudaContext
::
initialize
()
{
// for (int i = 0; i < numAtoms; i++) {
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
// double mass = system.getParticleMass(i);
double
mass
=
system
.
getParticleMass
(
i
);
// (*velm)[i].w = (float) (mass == 0.0 ? 0.0 : 1.0/mass);
if
(
useDoublePrecision
)
// }
((
double4
*
)
pinnedBuffer
)[
i
]
=
make_double4
(
0.0
,
0.0
,
0.0
,
mass
==
0.0
?
0.0
:
1.0
/
mass
);
// velm->upload();
else
((
float4
*
)
pinnedBuffer
)[
i
]
=
make_float4
(
0.0
f
,
0.0
f
,
0.0
f
,
mass
==
0.0
?
0.0
f
:
(
float
)
(
1.0
/
mass
));
}
velm
->
upload
(
pinnedBuffer
);
// bonded->initialize(system);
// bonded->initialize(system);
// numForceBuffers = platformData.contexts.size();
force
=
CudaArray
::
create
<
long3
>
(
paddedNumAtoms
,
"force"
);
// numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
addAutoclearBuffer
(
force
->
getDevicePointer
(),
force
->
getSize
()
*
6
);
// for (int i = 0; i < (int) forces.size(); i++)
energyBuffer
=
CudaArray
::
create
<
float
>
(
numThreadBlocks
*
ThreadBlockSize
,
"energyBuffer"
);
// numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
addAutoclearBuffer
(
energyBuffer
->
getDevicePointer
(),
energyBuffer
->
getSize
());
// forceBuffers = new CudaArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "
forceBuffers
", false);
atomIndexDevice
=
CudaArray
::
create
<
int
>
(
paddedNumAtoms
,
"atomIndex"
);
// if (supports64BitGlobalAtomics) {
atomIndex
.
resize
(
paddedNumAtoms
);
// longForceBuffer = new CudaArray<cl_long>(*this, 3*paddedNumAtoms, "
longForceBuffer
", false);
for
(
int
i
=
0
;
i
<
paddedNumAtoms
;
++
i
)
// reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
atomIndex
[
i
]
=
i
;
// reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
atomIndexDevice
->
upload
(
atomIndex
);
// reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
findMoleculeGroups
();
// reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
moleculesInvalid
=
false
;
// addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
// }
// addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
// force = new CudaArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "
force
", true);
// energyBuffer = new CudaArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "
energyBuffer
", true);
// addAutoclearBuffer(energyBuffer->getDeviceBuffer(), energyBuffer->getSize());
// atomIndex = new CudaArray<cl_int>(*this, paddedNumAtoms, "
atomIndex
", true);
// for (int i = 0; i < paddedNumAtoms; ++i)
// (*atomIndex)[i] = i;
// atomIndex->upload();
// findMoleculeGroups();
// moleculesInvalid = false;
// nonbonded->initialize(system);
// nonbonded->initialize(system);
//
}
}
void
CudaContext
::
addForce
(
CudaForceInfo
*
force
)
{
void
CudaContext
::
addForce
(
CudaForceInfo
*
force
)
{
forces
.
push_back
(
force
);
forces
.
push_back
(
force
);
...
@@ -315,7 +318,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
...
@@ -315,7 +318,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUresult
result
=
cuModuleLoad
(
&
module
,
outputFile
.
c_str
());
CUresult
result
=
cuModuleLoad
(
&
module
,
outputFile
.
c_str
());
if
(
result
!=
CUDA_SUCCESS
)
{
if
(
result
!=
CUDA_SUCCESS
)
{
std
::
stringstream
m
;
std
::
stringstream
m
;
m
<<
"Error loading CUDA module: "
<<
result
;
m
<<
"Error loading CUDA module: "
<<
getErrorString
(
result
)
<<
" ("
<<
result
<<
")"
;
throw
OpenMMException
(
m
.
str
());
throw
OpenMMException
(
m
.
str
());
}
}
remove
(
inputFile
.
c_str
());
remove
(
inputFile
.
c_str
());
...
@@ -329,52 +332,109 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
...
@@ -329,52 +332,109 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
remove
(
logFile
.
c_str
());
remove
(
logFile
.
c_str
());
throw
;
throw
;
}
}
//
// // Get length before using c_str() to avoid length() call invalidating the c_str() value.
// string src_string = src.str();
// ::size_t src_length = src_string.length();
// cl::Program::Sources sources(1, make_pair(src_string.c_str(), src_length));
// cl::Program program(context, sources);
// try {
// program.build(vector<cl::Device>(1, device), options.c_str());
// } catch (cl::Error err) {
// throw OpenMMException("Error compiling kernel: "+program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device));
// }
}
}
//
//void CudaContext::executeKernel(cl::Kernel& kernel, int workUnits, int blockSize) {
/**
 * Look up a kernel by name in a compiled module.
 *
 * @param module    the module to search
 * @param name      the name of the kernel function
 * @return the handle reported by cuModuleGetFunction()
 * @throws OpenMMException if cuModuleGetFunction() does not return CUDA_SUCCESS
 */
CUfunction CudaContext::getKernel(CUmodule& module, const string& name) {
    CUfunction kernel;
    CUresult status = cuModuleGetFunction(&kernel, module, name.c_str());
    if (status == CUDA_SUCCESS)
        return kernel;
    std::stringstream message;
    message<<"Error creating kernel "<<name<<": "<<getErrorString(status)<<" ("<<status<<")";
    throw OpenMMException(message.str());
}
// throw OpenMMException(str.str());
// }
/**
 * Format a floating point value as text in scientific notation.
 *
 * The stream precision is 16 when useDoublePrecision is set and 8 otherwise; in the
 * latter case an "f" suffix is appended to the number.
 *
 * @param value     the value to convert
 * @return the formatted text
 */
string CudaContext::doubleToString(double value) {
    stringstream text;
    if (useDoublePrecision)
        text.precision(16);
    else
        text.precision(8);
    text<<scientific<<value;
    if (!useDoublePrecision)
        text<<"f";
    return text.str();
}
// clearBuffer(array.getDeviceBuffer(), array.getSize()*4);
//}
/**
 * Convert an integer to its text representation by streaming it into a stringstream.
 *
 * @param value     the value to convert
 * @return the resulting text
 */
string CudaContext::intToString(int value) {
    stringstream text;
    text<<value;
    return text.str();
}
// executeKernel(clearBufferKernel, size, 128);
//}
/**
 * Translate a CUDA driver API result code into the name of its enumeration constant.
 *
 * @param result    the result code to describe
 * @return the constant's name, or "Invalid error code" if the code is not one of those listed
 */
std::string CudaContext::getErrorString(CUresult result) {
    // Each entry expands to "case X: return "X";" so the name is spelled only once.
#define OPENMM_CUDA_ERROR_NAME(code) case code: return #code;
    switch (result) {
        OPENMM_CUDA_ERROR_NAME(CUDA_SUCCESS)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_INVALID_VALUE)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_OUT_OF_MEMORY)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_NOT_INITIALIZED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_DEINITIALIZED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_PROFILER_DISABLED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_PROFILER_NOT_INITIALIZED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_PROFILER_ALREADY_STARTED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_PROFILER_ALREADY_STOPPED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_NO_DEVICE)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_INVALID_DEVICE)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_INVALID_IMAGE)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_INVALID_CONTEXT)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_CONTEXT_ALREADY_CURRENT)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_MAP_FAILED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_UNMAP_FAILED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_ARRAY_IS_MAPPED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_ALREADY_MAPPED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_NO_BINARY_FOR_GPU)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_ALREADY_ACQUIRED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_NOT_MAPPED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_NOT_MAPPED_AS_ARRAY)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_NOT_MAPPED_AS_POINTER)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_ECC_UNCORRECTABLE)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_UNSUPPORTED_LIMIT)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_CONTEXT_ALREADY_IN_USE)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_INVALID_SOURCE)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_FILE_NOT_FOUND)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_OPERATING_SYSTEM)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_INVALID_HANDLE)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_NOT_FOUND)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_NOT_READY)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_LAUNCH_FAILED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_LAUNCH_TIMEOUT)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_PEER_ACCESS_NOT_ENABLED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_CONTEXT_IS_DESTROYED)
        OPENMM_CUDA_ERROR_NAME(CUDA_ERROR_UNKNOWN)
    }
#undef OPENMM_CUDA_ERROR_NAME
    return "Invalid error code";
}
/**
 * Launch a kernel on a one dimensional grid.
 *
 * @param kernel        the kernel to launch
 * @param arguments     the array of argument pointers passed through to cuLaunchKernel()
 * @param threads       the number of threads requested; the grid holds enough blocks to
 *                      cover it, but never more than numThreadBlocks
 * @param blockSize     the number of threads per block, or -1 to use ThreadBlockSize
 * @param sharedSize    the shared memory size passed through to cuLaunchKernel()
 * @throws OpenMMException if cuLaunchKernel() does not return CUDA_SUCCESS
 */
void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
    if (blockSize == -1)
        blockSize = ThreadBlockSize;
    int blocksRequested = (threads+blockSize-1)/blockSize;
    int gridSize = std::min(blocksRequested, numThreadBlocks);
    CUresult status = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, 0, arguments, NULL);
    if (status == CUDA_SUCCESS)
        return;
    stringstream message;
    message<<"Error invoking kernel: "<<getErrorString(status)<<" ("<<status<<")";
    throw OpenMMException(message.str());
}
/**
 * Clear the device memory belonging to an array.
 *
 * The array's size in bytes (element count times element size) is divided by 4 to
 * obtain the size handed to the pointer based overload.
 *
 * @param array     the array to clear
 */
void CudaContext::clearBuffer(CudaArray& array) {
    int numWords = array.getSize()*array.getElementSize()/4;
    clearBuffer(array.getDevicePointer(), numWords);
}
/**
 * Clear a region of device memory by running the clearBuffer utility kernel.
 *
 * @param memory    the device memory to clear
 * @param size      the size passed to the kernel, which is also used as the requested
 *                  thread count
 */
void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
    // The kernel receives pointers to its two arguments: the buffer and its size.
    void* kernelArgs[] = {&memory, &size};
    const int clearBlockSize = 128;
    executeKernel(clearBufferKernel, kernelArgs, size, clearBlockSize);
}
/**
 * Register a buffer in the autoclear lists.  The pointer and its size are stored at
 * matching positions of autoclearBuffers and autoclearBufferSizes.
 *
 * @param memory    the device memory to register
 * @param size      the size to record for the buffer
 */
void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
    autoclearBuffers.push_back(memory);
    autoclearBufferSizes.push_back(size);
}
//void CudaContext::clearAutoclearBuffers() {
//void CudaContext::clearAutoclearBuffers() {
// int base = 0;
// int base = 0;
// int total = autoclearBufferSizes.size();
// int total = autoclearBufferSizes.size();
...
@@ -454,219 +514,217 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
...
@@ -454,219 +514,217 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
// executeKernel(reduceFloat4Kernel, bufferSize, 128);
// executeKernel(reduceFloat4Kernel, bufferSize, 128);
//}
//}
//
//
//void CudaContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
void
CudaContext
::
tagAtomsInMolecule
(
int
atom
,
int
molecule
,
vector
<
int
>&
atomMolecule
,
vector
<
vector
<
int
>
>&
atomBonds
)
{
// // Recursively tag atoms as belonging to a particular molecule.
// Recursively tag atoms as belonging to a particular molecule.
//
// atomMolecule[atom] = molecule;
atomMolecule
[
atom
]
=
molecule
;
// for (int i = 0; i < (int) atomBonds[atom].size(); i++)
for
(
int
i
=
0
;
i
<
(
int
)
atomBonds
[
atom
].
size
();
i
++
)
// if (atomMolecule[atomBonds[atom][i]] == -1)
if
(
atomMolecule
[
atomBonds
[
atom
][
i
]]
==
-
1
)
// tagAtomsInMolecule(atomBonds[atom][i], molecule, atomMolecule, atomBonds);
tagAtomsInMolecule
(
atomBonds
[
atom
][
i
],
molecule
,
atomMolecule
,
atomBonds
);
//}
}
//
///**
/**
// * This class ensures that atom reordering doesn't break virtual sites.
* This class ensures that atom reordering doesn't break virtual sites.
// */
*/
//class CudaContext::VirtualSiteInfo : public CudaForceInfo {
class
CudaContext
::
VirtualSiteInfo
:
public
CudaForceInfo
{
//public:
public:
// VirtualSiteInfo(const System& system) : CudaForceInfo(0) {
VirtualSiteInfo
(
const
System
&
system
)
:
CudaForceInfo
(
0
)
{
// for (int i = 0; i < system.getNumParticles(); i++) {
for
(
int
i
=
0
;
i
<
system
.
getNumParticles
();
i
++
)
{
// if (system.isVirtualSite(i)) {
if
(
system
.
isVirtualSite
(
i
))
{
// siteTypes.push_back(&typeid(system.getVirtualSite(i)));
siteTypes
.
push_back
(
&
typeid
(
system
.
getVirtualSite
(
i
)));
// vector<int> particles;
vector
<
int
>
particles
;
// particles.push_back(i);
particles
.
push_back
(
i
);
// for (int j = 0; j < system.getVirtualSite(i).getNumParticles(); j++)
for
(
int
j
=
0
;
j
<
system
.
getVirtualSite
(
i
).
getNumParticles
();
j
++
)
// particles.push_back(system.getVirtualSite(i).getParticle(j));
particles
.
push_back
(
system
.
getVirtualSite
(
i
).
getParticle
(
j
));
// siteParticles.push_back(particles);
siteParticles
.
push_back
(
particles
);
// vector<double> weights;
vector
<
double
>
weights
;
// if (dynamic_cast<const TwoParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
if
(
dynamic_cast
<
const
TwoParticleAverageSite
*>
(
&
system
.
getVirtualSite
(
i
))
!=
NULL
)
{
// // A two particle average.
// A two particle average.
//
// const TwoParticleAverageSite& site = dynamic_cast<const TwoParticleAverageSite&>(system.getVirtualSite(i));
const
TwoParticleAverageSite
&
site
=
dynamic_cast
<
const
TwoParticleAverageSite
&>
(
system
.
getVirtualSite
(
i
));
// weights.push_back(site.getWeight(0));
weights
.
push_back
(
site
.
getWeight
(
0
));
// weights.push_back(site.getWeight(1));
weights
.
push_back
(
site
.
getWeight
(
1
));
// }
}
// else if (dynamic_cast<const ThreeParticleAverageSite*>(&system.getVirtualSite(i)) != NULL) {
else
if
(
dynamic_cast
<
const
ThreeParticleAverageSite
*>
(
&
system
.
getVirtualSite
(
i
))
!=
NULL
)
{
// // A three particle average.
// A three particle average.
//
// const ThreeParticleAverageSite& site = dynamic_cast<const ThreeParticleAverageSite&>(system.getVirtualSite(i));
const
ThreeParticleAverageSite
&
site
=
dynamic_cast
<
const
ThreeParticleAverageSite
&>
(
system
.
getVirtualSite
(
i
));
// weights.push_back(site.getWeight(0));
weights
.
push_back
(
site
.
getWeight
(
0
));
// weights.push_back(site.getWeight(1));
weights
.
push_back
(
site
.
getWeight
(
1
));
// weights.push_back(site.getWeight(2));
weights
.
push_back
(
site
.
getWeight
(
2
));
// }
}
// else if (dynamic_cast<const OutOfPlaneSite*>(&system.getVirtualSite(i)) != NULL) {
else
if
(
dynamic_cast
<
const
OutOfPlaneSite
*>
(
&
system
.
getVirtualSite
(
i
))
!=
NULL
)
{
// // An out of plane site.
// An out of plane site.
//
// const OutOfPlaneSite& site = dynamic_cast<const OutOfPlaneSite&>(system.getVirtualSite(i));
const
OutOfPlaneSite
&
site
=
dynamic_cast
<
const
OutOfPlaneSite
&>
(
system
.
getVirtualSite
(
i
));
// weights.push_back(site.getWeight12());
weights
.
push_back
(
site
.
getWeight12
());
// weights.push_back(site.getWeight13());
weights
.
push_back
(
site
.
getWeight13
());
// weights.push_back(site.getWeightCross());
weights
.
push_back
(
site
.
getWeightCross
());
// }
}
// siteWeights.push_back(weights);
siteWeights
.
push_back
(
weights
);
// }
}
// }
}
// }
}
// int getNumParticleGroups() {
int
getNumParticleGroups
()
{
// return siteTypes.size();
return
siteTypes
.
size
();
// }
}
// void getParticlesInGroup(int index, std::vector<int>& particles) {
void
getParticlesInGroup
(
int
index
,
std
::
vector
<
int
>&
particles
)
{
// particles = siteParticles[index];
particles
=
siteParticles
[
index
];
// }
}
// bool areGroupsIdentical(int group1, int group2) {
bool
areGroupsIdentical
(
int
group1
,
int
group2
)
{
// if (siteTypes[group1] != siteTypes[group2])
if
(
siteTypes
[
group1
]
!=
siteTypes
[
group2
])
// return false;
return
false
;
// int numParticles = siteWeights[group1].size();
int
numParticles
=
siteWeights
[
group1
].
size
();
// if (siteWeights[group2].size() != numParticles)
if
(
siteWeights
[
group2
].
size
()
!=
numParticles
)
// return false;
return
false
;
// for (int i = 0; i < numParticles; i++)
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
// if (siteWeights[group1][i] != siteWeights[group2][i])
if
(
siteWeights
[
group1
][
i
]
!=
siteWeights
[
group2
][
i
])
// return false;
return
false
;
// return true;
return
true
;
// }
}
//private:
private:
// vector<const type_info*> siteTypes;
vector
<
const
type_info
*>
siteTypes
;
// vector<vector<int> > siteParticles;
vector
<
vector
<
int
>
>
siteParticles
;
// vector<vector<double> > siteWeights;
vector
<
vector
<
double
>
>
siteWeights
;
//};
};
//
//
void
CudaContext
::
findMoleculeGroups
()
{
//void CudaContext::findMoleculeGroups() {
// The first time this is called, we need to identify all the molecules in the system.
// // The first time this is called, we need to identify all the molecules in the system.
//
if
(
moleculeGroups
.
size
()
==
0
)
{
// if (moleculeGroups.size() == 0) {
// Add a ForceInfo that makes sure reordering doesn't break virtual sites.
// // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
//
addForce
(
new
VirtualSiteInfo
(
system
));
// addForce(new VirtualSiteInfo(system));
//
// First make a list of every other atom to which each atom is connect by a constraint or force group.
// // First make a list of every other atom to which each atom is connect by a constraint or force group.
//
vector
<
vector
<
int
>
>
atomBonds
(
system
.
getNumParticles
());
// vector<vector<int> > atomBonds(system.getNumParticles());
for
(
int
i
=
0
;
i
<
system
.
getNumConstraints
();
i
++
)
{
// for (int i = 0; i < system.getNumConstraints(); i++) {
int
particle1
,
particle2
;
// int particle1, particle2;
double
distance
;
// double distance;
system
.
getConstraintParameters
(
i
,
particle1
,
particle2
,
distance
);
// system.getConstraintParameters(i, particle1, particle2, distance);
atomBonds
[
particle1
].
push_back
(
particle2
);
// atomBonds[particle1].push_back(particle2);
atomBonds
[
particle2
].
push_back
(
particle1
);
// atomBonds[particle2].push_back(particle1);
}
// }
for
(
int
i
=
0
;
i
<
(
int
)
forces
.
size
();
i
++
)
{
// for (int i = 0; i < (int) forces.size(); i++) {
for
(
int
j
=
0
;
j
<
forces
[
i
]
->
getNumParticleGroups
();
j
++
)
{
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
vector
<
int
>
particles
;
// vector<int> particles;
forces
[
i
]
->
getParticlesInGroup
(
j
,
particles
);
// forces[i]->getParticlesInGroup(j, particles);
for
(
int
k
=
0
;
k
<
(
int
)
particles
.
size
();
k
++
)
// for (int k = 0; k < (int) particles.size(); k++)
for
(
int
m
=
0
;
m
<
(
int
)
particles
.
size
();
m
++
)
// for (int m = 0; m < (int) particles.size(); m++)
if
(
k
!=
m
)
// if (k != m)
atomBonds
[
particles
[
k
]].
push_back
(
particles
[
m
]);
// atomBonds[particles[k]].push_back(particles[m]);
}
// }
}
// }
//
// Now tag atoms by which molecule they belong to.
// // Now tag atoms by which molecule they belong to.
//
vector
<
int
>
atomMolecule
(
numAtoms
,
-
1
);
// vector<int> atomMolecule(numAtoms, -1);
int
numMolecules
=
0
;
// int numMolecules = 0;
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
// for (int i = 0; i < numAtoms; i++)
if
(
atomMolecule
[
i
]
==
-
1
)
// if (atomMolecule[i] == -1)
tagAtomsInMolecule
(
i
,
numMolecules
++
,
atomMolecule
,
atomBonds
);
// tagAtomsInMolecule(i, numMolecules++, atomMolecule, atomBonds);
vector
<
vector
<
int
>
>
atomIndices
(
numMolecules
);
// vector<vector<int> > atomIndices(numMolecules);
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
// for (int i = 0; i < numAtoms; i++)
atomIndices
[
atomMolecule
[
i
]].
push_back
(
i
);
// atomIndices[atomMolecule[i]].push_back(i);
//
// Construct a description of each molecule.
// // Construct a description of each molecule.
//
molecules
.
resize
(
numMolecules
);
// molecules.resize(numMolecules);
for
(
int
i
=
0
;
i
<
numMolecules
;
i
++
)
{
// for (int i = 0; i < numMolecules; i++) {
molecules
[
i
].
atoms
=
atomIndices
[
i
];
// molecules[i].atoms = atomIndices[i];
molecules
[
i
].
groups
.
resize
(
forces
.
size
());
// molecules[i].groups.resize(forces.size());
}
// }
for
(
int
i
=
0
;
i
<
system
.
getNumConstraints
();
i
++
)
{
// for (int i = 0; i < system.getNumConstraints(); i++) {
int
particle1
,
particle2
;
// int particle1, particle2;
double
distance
;
// double distance;
system
.
getConstraintParameters
(
i
,
particle1
,
particle2
,
distance
);
// system.getConstraintParameters(i, particle1, particle2, distance);
molecules
[
atomMolecule
[
particle1
]].
constraints
.
push_back
(
i
);
// molecules[atomMolecule[particle1]].constraints.push_back(i);
}
// }
for
(
int
i
=
0
;
i
<
(
int
)
forces
.
size
();
i
++
)
// for (int i = 0; i < (int) forces.size(); i++)
for
(
int
j
=
0
;
j
<
forces
[
i
]
->
getNumParticleGroups
();
j
++
)
{
// for (int j = 0; j < forces[i]->getNumParticleGroups(); j++) {
vector
<
int
>
particles
;
// vector<int> particles;
forces
[
i
]
->
getParticlesInGroup
(
j
,
particles
);
// forces[i]->getParticlesInGroup(j, particles);
molecules
[
atomMolecule
[
particles
[
0
]]].
groups
[
i
].
push_back
(
j
);
// molecules[atomMolecule[particles[0]]].groups[i].push_back(j);
}
// }
}
// }
//
// Sort them into groups of identical molecules.
// // Sort them into groups of identical molecules.
//
vector
<
Molecule
>
uniqueMolecules
;
// vector<Molecule> uniqueMolecules;
vector
<
vector
<
int
>
>
moleculeInstances
;
// vector<vector<int> > moleculeInstances;
vector
<
vector
<
int
>
>
moleculeOffsets
;
// vector<vector<int> > moleculeOffsets;
for
(
int
molIndex
=
0
;
molIndex
<
(
int
)
molecules
.
size
();
molIndex
++
)
{
// for (int molIndex = 0; molIndex < (int) molecules.size(); molIndex++) {
Molecule
&
mol
=
molecules
[
molIndex
];
// Molecule& mol = molecules[molIndex];
//
// See if it is identical to another molecule.
// // See if it is identical to another molecule.
//
bool
isNew
=
true
;
// bool isNew = true;
for
(
int
j
=
0
;
j
<
(
int
)
uniqueMolecules
.
size
()
&&
isNew
;
j
++
)
{
// for (int j = 0; j < (int) uniqueMolecules.size() && isNew; j++) {
Molecule
&
mol2
=
uniqueMolecules
[
j
];
// Molecule& mol2 = uniqueMolecules[j];
bool
identical
=
(
mol
.
atoms
.
size
()
==
mol2
.
atoms
.
size
()
&&
mol
.
constraints
.
size
()
==
mol2
.
constraints
.
size
());
// bool identical = (mol.atoms.size() == mol2.atoms.size() && mol.constraints.size() == mol2.constraints.size());
//
// See if the atoms are identical.
// // See if the atoms are identical.
//
int
atomOffset
=
mol2
.
atoms
[
0
]
-
mol
.
atoms
[
0
];
// int atomOffset = mol2.atoms[0]-mol.atoms[0];
for
(
int
i
=
0
;
i
<
(
int
)
mol
.
atoms
.
size
()
&&
identical
;
i
++
)
{
// for (int i = 0; i < (int) mol.atoms.size() && identical; i++) {
if
(
mol
.
atoms
[
i
]
!=
mol2
.
atoms
[
i
]
-
atomOffset
||
system
.
getParticleMass
(
mol
.
atoms
[
i
])
!=
system
.
getParticleMass
(
mol2
.
atoms
[
i
]))
// if (mol.atoms[i] != mol2.atoms[i]-atomOffset || system.getParticleMass(mol.atoms[i]) != system.getParticleMass(mol2.atoms[i]))
identical
=
false
;
// identical = false;
for
(
int
k
=
0
;
k
<
(
int
)
forces
.
size
();
k
++
)
// for (int k = 0; k < (int) forces.size(); k++)
if
(
!
forces
[
k
]
->
areParticlesIdentical
(
mol
.
atoms
[
i
],
mol2
.
atoms
[
i
]))
// if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
identical
=
false
;
// identical = false;
}
// }
//
// See if the constraints are identical.
// // See if the constraints are identical.
//
for
(
int
i
=
0
;
i
<
(
int
)
mol
.
constraints
.
size
()
&&
identical
;
i
++
)
{
// for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
int
c1particle1
,
c1particle2
,
c2particle1
,
c2particle2
;
// int c1particle1, c1particle2, c2particle1, c2particle2;
double
distance1
,
distance2
;
// double distance1, distance2;
system
.
getConstraintParameters
(
mol
.
constraints
[
i
],
c1particle1
,
c1particle2
,
distance1
);
// system.getConstraintParameters(mol.constraints[i], c1particle1, c1particle2, distance1);
system
.
getConstraintParameters
(
mol2
.
constraints
[
i
],
c2particle1
,
c2particle2
,
distance2
);
// system.getConstraintParameters(mol2.constraints[i], c2particle1, c2particle2, distance2);
if
(
c1particle1
!=
c2particle1
-
atomOffset
||
c1particle2
!=
c2particle2
-
atomOffset
||
distance1
!=
distance2
)
// if (c1particle1 != c2particle1-atomOffset || c1particle2 != c2particle2-atomOffset || distance1 != distance2)
identical
=
false
;
// identical = false;
}
// }
//
// See if the force groups are identical.
// // See if the force groups are identical.
//
for
(
int
i
=
0
;
i
<
(
int
)
forces
.
size
()
&&
identical
;
i
++
)
{
// for (int i = 0; i < (int) forces.size() && identical; i++) {
if
(
mol
.
groups
[
i
].
size
()
!=
mol2
.
groups
[
i
].
size
())
// if (mol.groups[i].size() != mol2.groups[i].size())
identical
=
false
;
// identical = false;
for
(
int
k
=
0
;
k
<
(
int
)
mol
.
groups
[
i
].
size
()
&&
identical
;
k
++
)
// for (int k = 0; k < (int) mol.groups[i].size() && identical; k++)
if
(
!
forces
[
i
]
->
areGroupsIdentical
(
mol
.
groups
[
i
][
k
],
mol2
.
groups
[
i
][
k
]))
// if (!forces[i]->areGroupsIdentical(mol.groups[i][k], mol2.groups[i][k]))
identical
=
false
;
// identical = false;
}
// }
if
(
identical
)
{
// if (identical) {
moleculeInstances
[
j
].
push_back
(
molIndex
);
// moleculeInstances[j].push_back(molIndex);
moleculeOffsets
[
j
].
push_back
(
mol
.
atoms
[
0
]);
// moleculeOffsets[j].push_back(mol.atoms[0]);
isNew
=
false
;
// isNew = false;
}
// }
}
// }
if
(
isNew
)
{
// if (isNew) {
uniqueMolecules
.
push_back
(
mol
);
// uniqueMolecules.push_back(mol);
moleculeInstances
.
push_back
(
vector
<
int
>
());
// moleculeInstances.push_back(vector<int>());
moleculeInstances
[
moleculeInstances
.
size
()
-
1
].
push_back
(
molIndex
);
// moleculeInstances[moleculeInstances.size()-1].push_back(molIndex);
moleculeOffsets
.
push_back
(
vector
<
int
>
());
// moleculeOffsets.push_back(vector<int>());
moleculeOffsets
[
moleculeOffsets
.
size
()
-
1
].
push_back
(
mol
.
atoms
[
0
]);
// moleculeOffsets[moleculeOffsets.size()-1].push_back(mol.atoms[0]);
}
// }
}
// }
moleculeGroups
.
resize
(
moleculeInstances
.
size
());
// moleculeGroups.resize(moleculeInstances.size());
for
(
int
i
=
0
;
i
<
(
int
)
moleculeInstances
.
size
();
i
++
)
// for (int i = 0; i < (int) moleculeInstances.size(); i++)
{
// {
moleculeGroups
[
i
].
instances
=
moleculeInstances
[
i
];
// moleculeGroups[i].instances = moleculeInstances[i];
moleculeGroups
[
i
].
offsets
=
moleculeOffsets
[
i
];
// moleculeGroups[i].offsets = moleculeOffsets[i];
vector
<
int
>&
atoms
=
uniqueMolecules
[
i
].
atoms
;
// vector<int>& atoms = uniqueMolecules[i].atoms;
moleculeGroups
[
i
].
atoms
.
resize
(
atoms
.
size
());
// moleculeGroups[i].atoms.resize(atoms.size());
for
(
int
j
=
0
;
j
<
(
int
)
atoms
.
size
();
j
++
)
// for (int j = 0; j < (int) atoms.size(); j++)
moleculeGroups
[
i
].
atoms
[
j
]
=
atoms
[
j
]
-
atoms
[
0
];
// moleculeGroups[i].atoms[j] = atoms[j]-atoms[0];
}
// }
}
//}
//
void
CudaContext
::
invalidateMolecules
()
{
//void CudaContext::invalidateMolecules() {
moleculesInvalid
=
true
;
// moleculesInvalid = true;
}
//}
//
//
//void OpenCLContext::validateMolecules() {
//void OpenCLContext::validateMolecules() {
// moleculesInvalid = false;
// moleculesInvalid = false;
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
// if (numAtoms == 0 || nonbonded == NULL || !nonbonded->getUseCutoff())
...
...
platforms/cuda2/src/CudaContext.h
View file @
3e16cab9
...
@@ -72,11 +72,11 @@ public:
...
@@ -72,11 +72,11 @@ public:
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
std
::
string
&
precision
,
CudaContext
(
const
System
&
system
,
int
deviceIndex
,
bool
useBlockingSync
,
const
std
::
string
&
precision
,
const
std
::
string
&
compiler
,
const
std
::
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
);
const
std
::
string
&
compiler
,
const
std
::
string
&
tempDir
,
CudaPlatform
::
PlatformData
&
platformData
);
~
CudaContext
();
~
CudaContext
();
//
/**
/**
//
* This is called to initialize internal data structures after all Forces in the system
* This is called to initialize internal data structures after all Forces in the system
//
* have been initialized.
* have been initialized.
//
*/
*/
//
void initialize();
void
initialize
();
/**
/**
* Add a CudaForce to this context.
* Add a CudaForce to this context.
*/
*/
...
@@ -123,12 +123,12 @@ public:
...
@@ -123,12 +123,12 @@ public:
CudaArray
&
getVelm
()
{
CudaArray
&
getVelm
()
{
return
*
velm
;
return
*
velm
;
}
}
//
/**
/**
//
* Get the array which contains the force on each atom.
* Get the array which contains the force on each atom
(represented as a long3 in 64 bit fixed point)
.
//
*/
*/
//
CudaArray
<mm_float4>
& getForce() {
CudaArray
&
getForce
()
{
//
return *force;
return
*
force
;
//
}
}
// /**
// /**
// * Get the array which contains the buffers in which forces are computed.
// * Get the array which contains the buffers in which forces are computed.
// */
// */
...
@@ -184,36 +184,41 @@ public:
...
@@ -184,36 +184,41 @@ public:
* omitted, a default set of options will be used
* omitted, a default set of options will be used
*/
*/
CUmodule
createModule
(
const
std
::
string
source
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
defines
,
const
char
*
optimizationFlags
=
NULL
);
CUmodule
createModule
(
const
std
::
string
source
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
defines
,
const
char
*
optimizationFlags
=
NULL
);
// /**
/**
// * Execute a kernel.
* Get a kernel from a CUDA module.
// *
*
// * @param kernel the kernel to execute
* @param module the module to get the kernel from
// * @param workUnits the maximum number of work units that should be used
* @param name the name of the kernel to get
// * @param blockSize the size of each thread block to use
*/
// */
CUfunction
getKernel
(
CUmodule
&
module
,
const
std
::
string
&
name
);
// void executeKernel(cl::Kernel& kernel, int workUnits, int blockSize = -1);
/**
// /**
* Execute a kernel.
// * Set all elements of an array to 0.
*
// */
* @param kernel the kernel to execute
// void clearBuffer(CudaArray<float>& array);
* @param arguments an array of pointers to the kernel arguments
// /**
* @param workUnits the maximum number of threads that should be used
// * Set all elements of an array to 0.
* @param blockSize the size of each thread block to use
// */
* @param sharedSize the amount of dynamic shared memory to allocate for the kernel, in bytes
// void clearBuffer(CudaArray<mm_float4>& array);
*/
// /**
void
executeKernel
(
CUfunction
kernel
,
void
**
arguments
,
int
workUnits
,
int
blockSize
=
-
1
,
unsigned
int
sharedSize
=
0
);
// * Set all elements of an array to 0.
/**
// *
* Set all elements of an array to 0.
// * @param memory the Memory to clear
*/
// * @param size the number of float elements in the buffer
void
clearBuffer
(
CudaArray
&
array
);
// */
/**
// void clearBuffer(cl::Memory& memory, int size);
* Set all elements of an array to 0.
// /**
*
// * Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
* @param memory the memory to clear
// *
* @param size the number of 4-byte elements in the buffer
// * @param memory the Memory to clear
*/
// * @param size the number of float elements in the buffer
void
clearBuffer
(
CUdeviceptr
memory
,
int
size
);
// */
/**
// void addAutoclearBuffer(cl::Memory& memory, int size);
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
* @param memory the memory to clear
* @param size the number of float/double elements in the buffer
*/
void
addAutoclearBuffer
(
CUdeviceptr
memory
,
int
size
);
// /**
// /**
// * Clear all buffers that have been registered with addAutoclearBuffer().
// * Clear all buffers that have been registered with addAutoclearBuffer().
// */
// */
...
@@ -230,108 +235,110 @@ public:
...
@@ -230,108 +235,110 @@ public:
// * Sum the buffers containing forces.
// * Sum the buffers containing forces.
// */
// */
// void reduceForces();
// void reduceForces();
// /**
/**
// * Get the current simulation time.
* Get the current simulation time.
// */
*/
// double getTime() {
double
getTime
()
{
// return time;
return
time
;
// }
}
// /**
/**
// * Set the current simulation time.
* Set the current simulation time.
// */
*/
// void setTime(double t) {
void
setTime
(
double
t
)
{
// time = t;
time
=
t
;
// }
}
// /**
/**
// * Get the number of integration steps that have been taken.
* Get the number of integration steps that have been taken.
// */
*/
// int getStepCount() {
int
getStepCount
()
{
// return stepCount;
return
stepCount
;
// }
}
// /**
/**
// * Set the number of integration steps that have been taken.
* Set the number of integration steps that have been taken.
// */
*/
// void setStepCount(int steps) {
void
setStepCount
(
int
steps
)
{
// stepCount = steps;
stepCount
=
steps
;
// }
}
// /**
/**
// * Get the number of times forces or energy has been computed.
* Get the number of times forces or energy has been computed.
// */
*/
// int getComputeForceCount() {
int
getComputeForceCount
()
{
// return computeForceCount;
return
computeForceCount
;
// }
}
// /**
/**
// * Set the number of times forces or energy has been computed.
* Set the number of times forces or energy has been computed.
// */
*/
// void setComputeForceCount(int count) {
void
setComputeForceCount
(
int
count
)
{
// computeForceCount = count;
computeForceCount
=
count
;
// }
}
// /**
/**
// * Get the number of atoms.
* Get the number of atoms.
// */
*/
// int getNumAtoms() const {
int
getNumAtoms
()
const
{
// return numAtoms;
return
numAtoms
;
// }
}
// /**
/**
// * Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
* Get the number of atoms, rounded up to a multiple of TileSize. This is the actual size of
// * most arrays with one element per atom.
* most arrays with one element per atom.
// */
*/
// int getPaddedNumAtoms() const {
int
getPaddedNumAtoms
()
const
{
// return paddedNumAtoms;
return
paddedNumAtoms
;
// }
}
// /**
/**
// * Get the number of blocks of TileSize atoms.
* Get the number of blocks of TileSize atoms.
// */
*/
// int getNumAtomBlocks() const {
int
getNumAtomBlocks
()
const
{
// return numAtomBlocks;
return
numAtomBlocks
;
// }
}
// /**
/**
// * Get the standard number of thread blocks to use when executing kernels.
* Get the standard number of thread blocks to use when executing kernels.
// */
*/
// int getNumThreadBlocks() const {
int
getNumThreadBlocks
()
const
{
// return numThreadBlocks;
return
numThreadBlocks
;
// }
}
// /**
/**
// * Get the number of force buffers.
* Get whether double precision is being used.
// */
*/
// int getNumForceBuffers() const {
bool
getUseDoublePrecision
()
{
// return numForceBuffers;
return
useDoublePrecision
;
// }
}
// /**
/**
// * Get the SIMD width of the device being used.
* Get whether accumulation is being done in double precision.
// */
*/
// int getSIMDWidth() const {
bool
getAccumulateInDouble
()
{
// return simdWidth;
return
accumulateInDouble
;
// }
}
// /**
/**
// * Get whether the device being used supports 64 bit atomic operations on global memory.
* Convert a number to a string in a format suitable for including in a kernel.
// */
* This takes into account whether the context uses single or double precision.
// bool getSupports64BitGlobalAtomics() {
*/
// return supports64BitGlobalAtomics;
std
::
string
doubleToString
(
double
value
);
// }
/**
// /**
* Convert a number to a string in a format suitable for including in a kernel.
// * Get whether the device being used supports double precision math.
*/
// */
std
::
string
intToString
(
int
value
);
// bool getSupportsDoublePrecision() {
/**
// return supportsDoublePrecision;
* Convert a CUDA result code to the corresponding string description.
// }
*/
std
::
string
getErrorString
(
CUresult
result
);
// /**
// /**
// * Get the size of the periodic box.
// * Get the size of the periodic box.
// */
// */
//
mm_
float4 getPeriodicBoxSize() const {
// float4 getPeriodicBoxSize() const {
// return periodicBoxSize;
// return periodicBoxSize;
// }
// }
// /**
// /**
// * Set the size of the periodic box.
// * Set the size of the periodic box.
// */
// */
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// void setPeriodicBoxSize(double xsize, double ysize, double zsize) {
// periodicBoxSize = m
m
_float4((float) xsize, (float) ysize, (float) zsize, 0);
// periodicBoxSize = m
ake
_float4((float) xsize, (float) ysize, (float) zsize, 0);
// invPeriodicBoxSize = m
m
_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// invPeriodicBoxSize = m
ake
_float4((float) (1.0/xsize), (float) (1.0/ysize), (float) (1.0/zsize), 0);
// }
// }
// /**
// /**
// * Get the inverse of the size of the periodic box.
// * Get the inverse of the size of the periodic box.
// */
// */
//
mm_
float4 getInvPeriodicBoxSize() const {
// float4 getInvPeriodicBoxSize() const {
// return invPeriodicBoxSize;
// return invPeriodicBoxSize;
// }
// }
// /**
// /**
...
@@ -352,66 +359,66 @@ public:
...
@@ -352,66 +359,66 @@ public:
// CudaNonbondedUtilities& getNonbondedUtilities() {
// CudaNonbondedUtilities& getNonbondedUtilities() {
// return *nonbonded;
// return *nonbonded;
// }
// }
//
/**
/**
//
* Get the thread used by this context for executing parallel computations.
* Get the thread used by this context for executing parallel computations.
//
*/
*/
//
WorkThread& getWorkThread() {
WorkThread
&
getWorkThread
()
{
//
return *thread;
return
*
thread
;
//
}
}
//
/**
/**
//
* Get whether atoms were reordered during the most recent force/energy computation.
* Get whether atoms were reordered during the most recent force/energy computation.
//
*/
*/
//
bool getAtomsWereReordered() const {
bool
getAtomsWereReordered
()
const
{
//
return atomsWereReordered;
return
atomsWereReordered
;
//
}
}
//
/**
/**
//
* Set whether atoms were reordered during the most recent force/energy computation.
* Set whether atoms were reordered during the most recent force/energy computation.
//
*/
*/
//
void setAtomsWereReordered(bool wereReordered) {
void
setAtomsWereReordered
(
bool
wereReordered
)
{
//
atomsWereReordered = wereReordered;
atomsWereReordered
=
wereReordered
;
//
}
}
//
/**
/**
//
* Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
* Reorder the internal arrays of atoms to try to keep spatially contiguous atoms close
//
* together in the arrays.
* together in the arrays.
//
*
*
//
* @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
* @param enforcePeriodic if true, the atom positions may be altered to enforce periodic boundary conditions
//
*/
*/
//
void reorderAtoms(bool enforcePeriodic);
void
reorderAtoms
(
bool
enforcePeriodic
);
//
/**
/**
//
* Add a listener that should be called whenever atoms get reordered. The CudaContext
* Add a listener that should be called whenever atoms get reordered. The CudaContext
//
* assumes ownership of the object, and deletes it when the context itself is deleted.
* assumes ownership of the object, and deletes it when the context itself is deleted.
//
*/
*/
//
void addReorderListener(ReorderListener* listener);
void
addReorderListener
(
ReorderListener
*
listener
);
//
/**
/**
//
* Get the list of ReorderListeners.
* Get the list of ReorderListeners.
//
*/
*/
//
std::vector<ReorderListener*>& getReorderListeners() {
std
::
vector
<
ReorderListener
*>&
getReorderListeners
()
{
//
return reorderListeners;
return
reorderListeners
;
//
}
}
//
/**
/**
//
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
* Mark that the current molecule definitions (and hence the atom order) may be invalid.
//
* This should be called whenever force field parameters change. It will cause the definitions
* This should be called whenever force field parameters change. It will cause the definitions
//
* and order to be revalidated the next time reorderAtoms() is called.
* and order to be revalidated the next time reorderAtoms() is called.
//
*/
*/
//
void invalidateMolecules();
void
invalidateMolecules
();
//
/**
/**
//
* Get whether the current molecule definitions are valid.
* Get whether the current molecule definitions are valid.
//
*/
*/
//
bool getMoleculesAreInvalid() {
bool
getMoleculesAreInvalid
()
{
//
return moleculesInvalid;
return
moleculesInvalid
;
//
}
}
private:
private:
struct
Molecule
;
struct
Molecule
;
struct
MoleculeGroup
;
struct
MoleculeGroup
;
class
VirtualSiteInfo
;
class
VirtualSiteInfo
;
//
void findMoleculeGroups();
void
findMoleculeGroups
();
//
static void tagAtomsInMolecule(int atom, int molecule, std::vector<int>& atomMolecule, std::vector<std::vector<int> >& atomBonds);
static
void
tagAtomsInMolecule
(
int
atom
,
int
molecule
,
std
::
vector
<
int
>&
atomMolecule
,
std
::
vector
<
std
::
vector
<
int
>
>&
atomBonds
);
//
/**
/**
//
* Ensure that all molecules marked as "identical" really are identical. This should be
* Ensure that all molecules marked as "identical" really are identical. This should be
//
* called whenever force field parameters change. If necessary, it will rebuild the list
* called whenever force field parameters change. If necessary, it will rebuild the list
//
* of molecules and resort the atoms.
* of molecules and resort the atoms.
//
*/
*/
//
void validateMolecules();
void
validateMolecules
();
static
bool
hasInitializedCuda
;
static
bool
hasInitializedCuda
;
const
System
&
system
;
const
System
&
system
;
double
time
;
double
time
;
...
@@ -424,8 +431,6 @@ private:
...
@@ -424,8 +431,6 @@ private:
int
paddedNumAtoms
;
int
paddedNumAtoms
;
int
numAtomBlocks
;
int
numAtomBlocks
;
int
numThreadBlocks
;
int
numThreadBlocks
;
// int numForceBuffers;
// int simdWidth;
bool
useBlockingSync
,
useDoublePrecision
,
accumulateInDouble
,
contextIsValid
,
atomsWereReordered
,
moleculesInvalid
;
bool
useBlockingSync
,
useDoublePrecision
,
accumulateInDouble
,
contextIsValid
,
atomsWereReordered
,
moleculesInvalid
;
std
::
string
compiler
,
tempDir
,
gpuArchitecture
;
std
::
string
compiler
,
tempDir
,
gpuArchitecture
;
float4
periodicBoxSize
;
float4
periodicBoxSize
;
...
@@ -446,15 +451,15 @@ private:
...
@@ -446,15 +451,15 @@ private:
std
::
vector
<
Molecule
>
molecules
;
std
::
vector
<
Molecule
>
molecules
;
std
::
vector
<
MoleculeGroup
>
moleculeGroups
;
std
::
vector
<
MoleculeGroup
>
moleculeGroups
;
std
::
vector
<
int4
>
posCellOffsets
;
std
::
vector
<
int4
>
posCellOffsets
;
void
*
pinnedBuffer
;
CudaArray
*
posq
;
CudaArray
*
posq
;
CudaArray
*
velm
;
CudaArray
*
velm
;
// CudaArray<mm_float4>* force;
CudaArray
*
force
;
// CudaArray<mm_float4>* forceBuffers;
CudaArray
*
energyBuffer
;
// CudaArray<cl_long>* longForceBuffer;
CudaArray
*
atomIndexDevice
;
// CudaArray<cl_float>* energyBuffer;
std
::
vector
<
int
>
atomIndex
;
// CudaArray<cl_int>* atomIndex;
std
::
vector
<
CUdeviceptr
>
autoclearBuffers
;
// std::vector<cl::Memory*> autoclearBuffers;
std
::
vector
<
int
>
autoclearBufferSizes
;
// std::vector<int> autoclearBufferSizes;
std
::
vector
<
ReorderListener
*>
reorderListeners
;
std
::
vector
<
ReorderListener
*>
reorderListeners
;
// CudaIntegrationUtilities* integration;
// CudaIntegrationUtilities* integration;
// CudaBondedUtilities* bonded;
// CudaBondedUtilities* bonded;
...
...
platforms/cuda2/src/CudaPlatform.cpp
View file @
3e16cab9
...
@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
...
@@ -154,6 +154,7 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
device
<<
contexts
[
i
]
->
getDeviceIndex
();
device
<<
contexts
[
i
]
->
getDeviceIndex
();
}
}
propertyValues
[
CudaPlatform
::
CudaDeviceIndex
()]
=
device
.
str
();
propertyValues
[
CudaPlatform
::
CudaDeviceIndex
()]
=
device
.
str
();
propertyValues
[
CudaPlatform
::
CudaUseBlockingSync
()]
=
blocking
?
"true"
:
"false"
;
propertyValues
[
CudaPlatform
::
CudaPrecision
()]
=
precisionProperty
;
propertyValues
[
CudaPlatform
::
CudaPrecision
()]
=
precisionProperty
;
propertyValues
[
CudaPlatform
::
CudaCompiler
()]
=
compilerProperty
;
propertyValues
[
CudaPlatform
::
CudaCompiler
()]
=
compilerProperty
;
propertyValues
[
CudaPlatform
::
CudaTempDirectory
()]
=
tempProperty
;
propertyValues
[
CudaPlatform
::
CudaTempDirectory
()]
=
tempProperty
;
...
@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() {
...
@@ -166,11 +167,11 @@ CudaPlatform::PlatformData::~PlatformData() {
}
}
void
CudaPlatform
::
PlatformData
::
initializeContexts
(
const
System
&
system
)
{
void
CudaPlatform
::
PlatformData
::
initializeContexts
(
const
System
&
system
)
{
//
for (int i = 0; i < (int) contexts.size(); i++)
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
//
contexts[i]->initialize();
contexts
[
i
]
->
initialize
();
}
}
void
CudaPlatform
::
PlatformData
::
syncContexts
()
{
void
CudaPlatform
::
PlatformData
::
syncContexts
()
{
//
for (int i = 0; i < (int) contexts.size(); i++)
for
(
int
i
=
0
;
i
<
(
int
)
contexts
.
size
();
i
++
)
//
contexts[i]->getWorkThread().flush();
contexts
[
i
]
->
getWorkThread
().
flush
();
}
}
platforms/cuda2/src/CudaSort.cpp
0 → 100644
View file @
3e16cab9
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaSort.h"
#include "CudaKernelSources.h"
#include <map>
using
namespace
OpenMM
;
using
namespace
std
;
/**
 * Create a CudaSort object for sorting arrays of one fixed length.
 *
 * The constructor builds the sort kernels from CudaKernelSources::sort after substituting
 * the type names and key expressions reported by the trait, picks a thread block size for
 * each kernel from the device limits, and allocates the workspace arrays.
 *
 * @param context  the context in which to perform calculations
 * @param trait    a SortTrait describing the data to sort; it is deleted by the destructor.
 *                 If this constructor throws, the trait is NOT deleted (the destructor never
 *                 runs for a partially constructed object), exactly as before this change.
 * @param length   the length of the arrays this object will be used to sort
 * @throws OpenMMException if a device attribute cannot be queried, if the trait reports a
 *                 non-positive data size, or if the device limits leave no room for a bucket
 */
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
    try {
        // Create kernels.

        map<string, string> replacements;
        replacements["DATA_TYPE"] = trait->getDataType();
        replacements["KEY_TYPE"] = trait->getKeyType();
        replacements["SORT_KEY"] = trait->getSortKey();
        replacements["MIN_KEY"] = trait->getMinKey();
        replacements["MAX_KEY"] = trait->getMaxKey();
        replacements["MAX_VALUE"] = trait->getMaxValue();
        CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
        computeRangeKernel = context.getKernel(module, "computeRange");
        assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
        computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
        copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
        sortBucketsKernel = context.getKernel(module, "sortBuckets");

        // Work out the thread block sizes for the various kernels.

        int maxBlockSize = 0;
        CUresult result = cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
        if (result != CUDA_SUCCESS)
            throw OpenMMException("Error querying the maximum block size for CudaSort: "+context.getErrorString(result));

        // Largest power of two that does not exceed the maximum block size.
        for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
            ;
        positionsKernelSize = rangeKernelSize;
        sortKernelSize = rangeKernelSize/2;
        if (rangeKernelSize > length)
            rangeKernelSize = length;

        // The data size is used as a divisor below, so reject a trait that reports 0 or less.
        const int dataSize = trait->getDataSize();
        if (dataSize < 1)
            throw OpenMMException("CudaSort: the sort trait must report a positive data size");
        int maxSharedMem = 0;
        result = cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
        if (result != CUDA_SUCCESS)
            throw OpenMMException("Error querying the shared memory size for CudaSort: "+context.getErrorString(result));
        unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/dataSize)/2);
        if (sortKernelSize > maxLocalBuffer)
            sortKernelSize = maxLocalBuffer;

        // A bucket size of 0 would divide by zero here, and a sort block size of 0 would do
        // the same later in sort(), so fail with a clear message instead.
        unsigned int targetBucketSize = sortKernelSize/2;
        if (targetBucketSize < 1)
            throw OpenMMException("CudaSort: cannot choose a bucket size (block size or shared memory per block is too small for this data type)");
        unsigned int numBuckets = length/targetBucketSize;
        if (numBuckets < 1)
            numBuckets = 1;
        if (positionsKernelSize > numBuckets)
            positionsKernelSize = numBuckets;

        // Create workspace arrays.

        dataRange = new CudaArray(2, trait->getKeySize(), "sortDataRange");
        bucketOffset = CudaArray::create<uint1>(numBuckets, "bucketOffset");
        bucketOfElement = CudaArray::create<uint1>(length, "bucketOfElement");
        offsetInBucket = CudaArray::create<uint1>(length, "offsetInBucket");
        buckets = new CudaArray(length, dataSize, "buckets");
    }
    catch (...) {
        // The destructor is not invoked when a constructor throws, so release whatever
        // workspace arrays were already allocated before propagating the error.
        delete buckets;
        delete offsetInBucket;
        delete bucketOfElement;
        delete bucketOffset;
        delete dataRange;
        throw;
    }
}
/**
 * Release the sort trait and every workspace array held by this object.
 */
CudaSort::~CudaSort() {
    // Deleting a null pointer is a no-op, so arrays that were never allocated need no guard.
    delete trait;
    delete dataRange;
    delete bucketOfElement;
    delete offsetInBucket;
    delete bucketOffset;
    delete buckets;
}
/**
 * Sort an array in place on the device.
 *
 * The work is done by five kernel launches in sequence: compute the key range, assign each
 * element to a bucket, compute bucket positions, copy the elements into their buckets, and
 * finally sort each bucket.
 *
 * @param data  the array to sort; its length and element size must match the values this
 *              object was created with
 * @throws OpenMMException if the length or element size of data does not match
 */
void CudaSort::sort(CudaArray& data) {
    if (!(data.getSize() == bucketOfElement->getSize() && data.getElementSize() == trait->getDataSize()))
        throw OpenMMException("CudaSort called with different data size");
    if (data.getSize() == 0)
        return;

    // Kernel arguments are passed by address, so bind each device pointer and scalar to a
    // named local whose address stays valid for the launches below.
    CUdeviceptr& dataPtr = data.getDevicePointer();
    CUdeviceptr& rangePtr = dataRange->getDevicePointer();
    CUdeviceptr& bucketOffsetPtr = bucketOffset->getDevicePointer();
    CUdeviceptr& bucketOfElementPtr = bucketOfElement->getDevicePointer();
    CUdeviceptr& offsetInBucketPtr = offsetInBucket->getDevicePointer();
    CUdeviceptr& bucketsPtr = buckets->getDevicePointer();
    unsigned int numElements = data.getSize();
    unsigned int bucketCount = bucketOffset->getSize();

    // Step 1: find the range of the data values (a single block of rangeKernelSize threads).
    void* rangeArgs[] = {&dataPtr, &numElements, &rangePtr};
    context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());

    // Step 2: zero the per-bucket counters, then assign every element to a bucket.
    context.clearBuffer(*bucketOffset);
    void* assignArgs[] = {&dataPtr, &numElements, &bucketCount, &rangePtr,
            &bucketOffsetPtr, &bucketOfElementPtr, &offsetInBucketPtr};
    context.executeKernel(assignElementsKernel, assignArgs, data.getSize());

    // Step 3: compute the position of each bucket.
    void* positionArgs[] = {&bucketCount, &bucketOffsetPtr};
    context.executeKernel(computeBucketPositionsKernel, positionArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));

    // Step 4: copy the data into the buckets.
    void* copyArgs[] = {&dataPtr, &bucketsPtr, &numElements, &bucketOffsetPtr, &bucketOfElementPtr, &offsetInBucketPtr};
    context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());

    // Step 5: sort each bucket, rounding the thread count up to a whole number of blocks.
    void* sortArgs[] = {&dataPtr, &bucketsPtr, &bucketCount, &bucketOffsetPtr};
    int sortThreads = ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize;
    context.executeKernel(sortBucketsKernel, sortArgs, sortThreads, sortKernelSize, sortKernelSize*trait->getDataSize());
}
platforms/cuda2/src/CudaSort.h
0 → 100644
View file @
3e16cab9
#ifndef __OPENMM_CUDASORT_H__
#define __OPENMM_CUDASORT_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaArray.h"
#include "openmm/internal/windowsExport.h"
#include "CudaContext.h"
namespace
OpenMM
{
/**
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The sorting behavior is specified by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* class SortTrait : public CudaSort::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
* const char* getKeyType() const {return "float";}
* const char* getMinKey() const {return "-MAXFLOAT";}
* const char* getMaxKey() const {return "MAXFLOAT";}
* const char* getMaxValue() const {return "MAXFLOAT";}
* const char* getSortKey() const {return "value";}
* };
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* (in shared memory when the bucket fits in one thread block, in global memory otherwise). This is similar to
* the algorithm described in
*
* Shifu Chen, Jing Qin, Yongming Xie, Junping Zhao, and Pheng-Ann Heng. "An Efficient
* Sorting Algorithm with CUDA" Journal of the Chinese Institute of Engineers, 32(7),
* pp. 915-921 (2009)
*
* but with many modifications and simplifications. In particular, this algorithm
* involves much less communication between host and device, which is critical to get
* good performance with the array sizes we typically work with (10,000 to 100,000
* elements).
*/
class OPENMM_EXPORT CudaSort {
public:
    class SortTrait;

    /**
     * Build a sorter for one particular kind of data.
     *
     * @param context    the context in which the sorting kernels are run
     * @param trait      describes the data type and sort key.  It must have been created
     *                   with "new"; this object takes ownership of it and deletes it when
     *                   the CudaSort itself is deleted.
     * @param length     the length of the arrays this object will be asked to sort
     */
    CudaSort(CudaContext& context, SortTrait* trait, unsigned int length);

    ~CudaSort();

    /**
     * Sort an array.  The sorted values are written back into the array passed in.
     *
     * @param data       the array to sort
     */
    void sort(CudaArray& data);

private:
    // The context used to launch kernels, and the trait describing the data (owned).
    CudaContext& context;
    SortTrait* trait;

    // Device work arrays.  NOTE(review): presumably these correspond to the kernel
    // parameters of the same names in sort.cu (dataRange to "range") -- confirm in CudaSort.cpp.
    CudaArray* dataRange;
    CudaArray* bucketOfElement;
    CudaArray* offsetInBucket;
    CudaArray* bucketOffset;
    CudaArray* buckets;

    // Handles for the kernels that make up the sort.
    CUfunction computeRangeKernel;
    CUfunction assignElementsKernel;
    CUfunction computeBucketPositionsKernel;
    CUfunction copyToBucketsKernel;
    CUfunction sortBucketsKernel;

    // Launch sizes used for the corresponding kernels.
    unsigned int rangeKernelSize;
    unsigned int positionsKernelSize;
    unsigned int sortKernelSize;
};
/**
 * A subclass of SortTrait defines the type of the values to sort and the key by which
 * to sort them.
 *
 * NOTE(review): the strings returned by the accessors below are presumably substituted
 * into the sorting kernels (DATA_TYPE, KEY_TYPE, MIN_KEY, MAX_KEY, MAX_VALUE, SORT_KEY in
 * sort.cu) when they are compiled -- confirm against CudaSort.cpp.
 */
class CudaSort::SortTrait {
public:
    /**
     * CudaSort takes ownership of its trait and deletes it through a SortTrait pointer,
     * so the destructor must be virtual for subclasses to be destroyed correctly.
     */
    virtual ~SortTrait() {
    }
    /**
     * Get the size of each data value in bytes.
     */
    virtual int getDataSize() const = 0;
    /**
     * Get the size of each key value in bytes.
     */
    virtual int getKeySize() const = 0;
    /**
     * Get the data type of the values to sort.
     */
    virtual const char* getDataType() const = 0;
    /**
     * Get the data type of the sorting key.
     */
    virtual const char* getKeyType() const = 0;
    /**
     * Get the minimum value a key can take.
     */
    virtual const char* getMinKey() const = 0;
    /**
     * Get the maximum value a key can take.
     */
    virtual const char* getMaxKey() const = 0;
    /**
     * Get a value whose key is guaranteed to equal getMaxKey().
     */
    virtual const char* getMaxValue() const = 0;
    /**
     * Get the CUDA code to select the key from the data value.
     */
    virtual const char* getSortKey() const = 0;
};
}
// namespace OpenMM
#endif // __OPENMM_CUDASORT_H__
platforms/cuda2/src/kernels/sort.cu
0 → 100644
View file @
3e16cab9
/**
 * Extract the sorting key from a data value.
 *
 * SORT_KEY is an expression supplied from outside this file.  The documented example
 * trait uses "value" as that expression, so the parameter must keep the name "value".
 */
__device__ KEY_TYPE getValue(DATA_TYPE value) {
    return SORT_KEY;
}
extern
"C"
{
/**
 * Find the smallest and largest sort key in the array to be sorted, and store them in
 * range[0] and range[1] respectively.  This kernel is meant to be launched as a single
 * thread block, with one KEY_TYPE of dynamic shared memory per thread.
 */
__global__ void computeRange(const DATA_TYPE* __restrict__ data, unsigned int length, KEY_TYPE* __restrict__ range) {
    extern __shared__ KEY_TYPE rangeBuffer[];
    const unsigned int tid = threadIdx.x;
    const unsigned int numThreads = blockDim.x;

    // Each thread scans its own strided subset of the input.

    KEY_TYPE minimum = MAX_KEY;
    KEY_TYPE maximum = MIN_KEY;
    for (unsigned int index = tid; index < length; index += numThreads) {
        KEY_TYPE key = getValue(data[index]);
        minimum = min(minimum, key);
        maximum = max(maximum, key);
    }

    // Tree reduction of the per-thread minima; the result ends up in element 0.

    rangeBuffer[tid] = minimum;
    __syncthreads();
    for (unsigned int stride = 1; stride < numThreads; stride *= 2) {
        const bool combines = (tid%(2*stride) == 0 && tid+stride < numThreads);
        if (combines)
            rangeBuffer[tid] = min(rangeBuffer[tid], rangeBuffer[tid+stride]);
        __syncthreads();
    }
    minimum = rangeBuffer[0];

    // Same reduction for the per-thread maxima.  Only thread 0's copies of minimum and
    // maximum are written out below.

    rangeBuffer[tid] = maximum;
    __syncthreads();
    for (unsigned int stride = 1; stride < numThreads; stride *= 2) {
        const bool combines = (tid%(2*stride) == 0 && tid+stride < numThreads);
        if (combines)
            rangeBuffer[tid] = max(rangeBuffer[tid], rangeBuffer[tid+stride]);
        __syncthreads();
    }
    maximum = rangeBuffer[0];

    // A single thread publishes the result.

    if (tid == 0) {
        range[0] = minimum;
        range[1] = maximum;
    }
}
/**
 * Decide which bucket each element belongs to.  The key range [range[0], range[1]] is
 * divided into numBuckets intervals of equal width.  For every element this records the
 * bucket it falls in (bucketOfElement) and its slot within that bucket (offsetInBucket),
 * while bucketOffset[b] is atomically incremented once per element assigned to bucket b.
 */
__global__ void assignElementsToBuckets(const DATA_TYPE* __restrict__ data, unsigned int length, unsigned int numBuckets, const KEY_TYPE* __restrict__ range,
        unsigned int* bucketOffset, unsigned int* __restrict__ bucketOfElement, unsigned int* __restrict__ offsetInBucket) {
    const float minValue = (float) (range[0]);
    const float maxValue = (float) (range[1]);
    const float bucketWidth = (maxValue-minValue)/numBuckets;
    const unsigned int lastBucket = numBuckets-1;
    const unsigned int totalThreads = blockDim.x*gridDim.x;

    // NOTE(review): when every key is identical bucketWidth is 0, so the division below
    // is 0/0.  The bucket then depends on how the device converts that to unsigned int
    // (and on the clamp to lastBucket) -- confirm this is the intended behaviour.

    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += totalThreads) {
        const float key = (float) getValue(data[index]);
        const unsigned int rawBucket = (unsigned int) ((key-minValue)/bucketWidth);
        const unsigned int bucketIndex = min(rawBucket, lastBucket);

        // atomicAdd returns the previous count, which is this element's slot in the bucket.

        offsetInBucket[index] = atomicAdd(&bucketOffset[bucketIndex], 1);
        bucketOfElement[index] = bucketIndex;
    }
}
/**
 * Sum the bucket sizes to compute the position of each bucket.  On entry bucketOffset[b]
 * holds the number of elements in bucket b; on exit it holds the inclusive running total,
 * i.e. the index one past the last element of bucket b.  This kernel is meant to be
 * launched as a single thread block, with one unsigned int of dynamic shared memory per
 * thread.  The buckets are processed in chunks of blockDim.x.
 */
__global__ void computeBucketPositions(unsigned int numBuckets, unsigned int* __restrict__ bucketOffset) {
    extern __shared__ unsigned int posBuffer[];
    unsigned int globalOffset = 0;
    for (unsigned int startBucket = 0; startBucket < numBuckets; startBucket += blockDim.x) {
        // Every thread reads posBuffer[blockDim.x-1] at the end of the previous chunk.
        // Wait until all of them have done so before any thread overwrites the buffer,
        // otherwise a slow thread could pick up the wrong running total.

        __syncthreads();

        // Load the bucket sizes into shared memory.

        unsigned int globalIndex = startBucket+threadIdx.x;
        posBuffer[threadIdx.x] = (globalIndex < numBuckets ? bucketOffset[globalIndex] : 0);
        __syncthreads();

        // Perform a parallel prefix sum.  The value to add is read before the barrier and
        // applied after it, so no thread sees a partially updated neighbor.

        for (unsigned int step = 1; step < blockDim.x; step *= 2) {
            unsigned int add = (threadIdx.x >= step ? posBuffer[threadIdx.x-step] : 0);
            __syncthreads();
            posBuffer[threadIdx.x] += add;
            __syncthreads();
        }

        // Write the results back to global memory.

        if (globalIndex < numBuckets)
            bucketOffset[globalIndex] = posBuffer[threadIdx.x]+globalOffset;

        // Carry the total of this chunk over to the next one.

        globalOffset += posBuffer[blockDim.x-1];
    }
}
/**
 * Scatter the input data into the bucket array.  Each element goes to the start of its
 * bucket plus its slot within the bucket.  bucketOffset[b] is read as the end position of
 * bucket b, so the start of bucket b is bucketOffset[b-1] (or 0 for the first bucket).
 */
__global__ void copyDataToBuckets(const DATA_TYPE* __restrict__ data, DATA_TYPE* __restrict__ buckets, unsigned int length, const unsigned int* __restrict__ bucketOffset,
        const unsigned int* __restrict__ bucketOfElement, const unsigned int* __restrict__ offsetInBucket) {
    const unsigned int totalThreads = blockDim.x*gridDim.x;
    for (unsigned int index = blockDim.x*blockIdx.x+threadIdx.x; index < length; index += totalThreads) {
        DATA_TYPE element = data[index];
        const unsigned int bucketIndex = bucketOfElement[index];
        unsigned int bucketStart = 0;
        if (bucketIndex != 0)
            bucketStart = bucketOffset[bucketIndex-1];
        buckets[bucketStart+offsetInBucket[index]] = element;
    }
}
/**
 * Sort the data in each bucket, writing the result to the output array.  Each thread
 * block handles one bucket at a time.  A bucket with at most blockDim.x elements is
 * sorted in shared memory; a larger one is copied to the output array and sorted there.
 * Both paths use a bitonic sort.  The shared memory path runs the network over blockDim.x
 * slots, padding with MAX_VALUE.
 */
__global__ void sortBuckets(DATA_TYPE* __restrict__ data, const DATA_TYPE* __restrict__ buckets, unsigned int numBuckets, const unsigned int* __restrict__ bucketOffset) {
    extern __shared__ DATA_TYPE dataBuffer[];
    const unsigned int tid = threadIdx.x;
    for (unsigned int bucket = blockIdx.x; bucket < numBuckets; bucket += gridDim.x) {
        // bucketOffset holds the end position of each bucket.

        unsigned int startIndex = 0;
        if (bucket != 0)
            startIndex = bucketOffset[bucket-1];
        const unsigned int endIndex = bucketOffset[bucket];
        const unsigned int length = endIndex-startIndex;
        if (length > blockDim.x) {
            // The bucket is too large for one element per thread.  Copy it to the output
            // array and sort it there.

            for (unsigned int i = tid; i < length; i += blockDim.x)
                data[startIndex+i] = buckets[startIndex+i];
            __threadfence_block();
            __syncthreads();

            // Perform a bitonic sort in global memory.

            for (unsigned int k = 2; k < 2*length; k *= 2) {
                for (unsigned int j = k/2; j > 0; j /= 2) {
                    for (unsigned int i = tid; i < length; i += blockDim.x) {
                        int partner = i^j;
                        if (partner > i && partner < length) {
                            DATA_TYPE value1 = data[startIndex+i];
                            DATA_TYPE value2 = data[startIndex+partner];

                            // The direction is flipped once for every higher bit of i
                            // that is clear.

                            bool ascending = ((i&k) == 0);
                            for (unsigned int mask = k*2; mask < 2*length; mask *= 2) {
                                if ((i&mask) == 0)
                                    ascending = !ascending;
                            }
                            KEY_TYPE key1 = getValue(value1);
                            KEY_TYPE key2 = getValue(value2);
                            bool outOfOrder = (ascending ? key1 > key2 : key2 > key1);
                            if (outOfOrder) {
                                data[startIndex+i] = value2;
                                data[startIndex+partner] = value1;
                            }
                        }
                    }
                    __threadfence_block();
                    __syncthreads();
                }
            }
        }
        else {
            // Load the data into shared memory, padding unused slots with MAX_VALUE.

            if (tid >= length)
                dataBuffer[tid] = MAX_VALUE;
            else
                dataBuffer[tid] = buckets[startIndex+tid];
            __syncthreads();

            // Perform a bitonic sort in shared memory.

            for (unsigned int k = 2; k <= blockDim.x; k *= 2) {
                for (unsigned int j = k/2; j > 0; j /= 2) {
                    int partner = tid^j;
                    if (partner > tid) {
                        DATA_TYPE value1 = dataBuffer[tid];
                        DATA_TYPE value2 = dataBuffer[partner];
                        bool ascending = ((tid&k) == 0);
                        KEY_TYPE key1 = getValue(value1);
                        KEY_TYPE key2 = getValue(value2);
                        bool outOfOrder = (ascending ? key1 > key2 : key2 > key1);
                        if (outOfOrder) {
                            dataBuffer[tid] = value2;
                            dataBuffer[partner] = value1;
                        }
                    }
                    __syncthreads();
                }
            }

            // Write the data to the sorted array.

            if (tid < length)
                data[startIndex+tid] = dataBuffer[tid];
        }
    }
}
}
\ No newline at end of file
platforms/cuda2/src/kernels/utilities.cu
View file @
3e16cab9
extern
"C"
{
/**
/**
* This is called by the various functions below to clear a buffer.
* This is called by the various functions below to clear a buffer.
*/
*/
...
@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest
...
@@ -100,3 +102,5 @@ __global__ void reduceForces(const long* __restrict__ longBuffer, float4* __rest
buffer
[
index
]
=
sum
;
buffer
[
index
]
=
sum
;
}
}
}
}
}
\ No newline at end of file
platforms/cuda2/tests/TestCudaSort.cpp
0 → 100644
View file @
3e16cab9
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of sorting.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "../src/CudaArray.h"
#include "../src/CudaContext.h"
#include "../src/CudaSort.h"
#include "sfmt/SFMT.h"
#include "openmm/System.h"
#include <iostream>
#include <cmath>
#include <set>
using
namespace
OpenMM
;
using
namespace
std
;
/**
 * The trait used by these tests: the data values are floats, and each value is its own key.
 */
class SortTrait : public CudaSort::SortTrait {
    // Data values and keys are both 4 byte floats.
    int getDataSize() const {
        return 4;
    }
    int getKeySize() const {
        return 4;
    }
    const char* getDataType() const {
        return "float";
    }
    const char* getKeyType() const {
        return "float";
    }

    // Bounds on the keys, and a value whose key equals the upper bound.
    const char* getMinKey() const {
        return "-MAXFLOAT";
    }
    const char* getMaxKey() const {
        return "MAXFLOAT";
    }
    const char* getMaxValue() const {
        return "MAXFLOAT";
    }

    // The key is the value itself.
    const char* getSortKey() const {
        return "value";
    }
};
/**
 * Sort a copy of the given values on the device, then check that the result is in
 * nondecreasing order and contains exactly the same values as the input.
 */
void verifySorting(vector<float> array) {
    // Create a context to run the kernels in.

    System system;
    system.addParticle(0.0);
    CudaPlatform platform;
    const string& compilerProperty = platform.getPropertyDefaultValue(CudaPlatform::CudaCompiler());
    const string& tempProperty = platform.getPropertyDefaultValue(CudaPlatform::CudaTempDirectory());
    CudaPlatform::PlatformData platformData(system, "", "true", "single", compilerProperty, tempProperty);
    CudaContext& context = *platformData.contexts[0];
    context.initialize();

    // Upload the values, sort them, and fetch the result.

    CudaArray data(array.size(), 4, "sortData");
    data.upload(array);
    CudaSort sort(context, new SortTrait(), array.size());
    sort.sort(data);
    vector<float> sorted;
    data.download(sorted);

    // Every element must be no smaller than the one before it.

    const int numSorted = (int) sorted.size();
    for (int i = 1; i < numSorted; i++)
        ASSERT(sorted[i-1] <= sorted[i]);

    // Comparing as multisets checks that no value was lost, duplicated, or altered.

    multiset<float> elements1(array.begin(), array.end());
    multiset<float> elements2(sorted.begin(), sorted.end());
    ASSERT(elements1 == elements2);
}
void
testUniformValues
()
{
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
vector
<
float
>
array
(
10000
);
for
(
int
i
=
0
;
i
<
(
int
)
array
.
size
();
i
++
)
array
[
i
]
=
(
float
)
genrand_real2
(
sfmt
);
verifySorting
(
array
);
}
void
testLogValues
()
{
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
vector
<
float
>
array
(
10000
);
for
(
int
i
=
0
;
i
<
(
int
)
array
.
size
();
i
++
)
array
[
i
]
=
(
float
)
log
(
genrand_real2
(
sfmt
));
verifySorting
(
array
);
}
/**
 * Run all the tests.  Returns 0 after printing "Done" if they pass, or 1 after printing
 * the message of the exception that caused a failure.
 */
int main() {
    int status = 0;
    try {
        testUniformValues();
        testLogValues();
    }
    catch (const exception& e) {
        cout << "exception: " << e.what() << endl;
        status = 1;
    }
    if (status == 0)
        cout << "Done" << endl;
    return status;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment