Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
c3125421
Commit
c3125421
authored
Oct 06, 2009
by
Peter Eastman
Browse files
Continuing implementation of nonbonded forces
parent
d95e723d
Changes
12
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
778 additions
and
213 deletions
+778
-213
platforms/opencl/src/OpenCLArray.h
platforms/opencl/src/OpenCLArray.h
+2
-0
platforms/opencl/src/OpenCLCompact.cpp
platforms/opencl/src/OpenCLCompact.cpp
+70
-0
platforms/opencl/src/OpenCLCompact.h
platforms/opencl/src/OpenCLCompact.h
+46
-0
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+17
-5
platforms/opencl/src/OpenCLContext.h
platforms/opencl/src/OpenCLContext.h
+14
-4
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+8
-8
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+78
-14
platforms/opencl/src/OpenCLNonbondedUtilities.h
platforms/opencl/src/OpenCLNonbondedUtilities.h
+48
-1
platforms/opencl/src/kernels/compact.cl
platforms/opencl/src/kernels/compact.cl
+157
-0
platforms/opencl/src/kernels/findInteractingBlocks.cl
platforms/opencl/src/kernels/findInteractingBlocks.cl
+147
-0
platforms/opencl/src/kernels/nonbonded.cl
platforms/opencl/src/kernels/nonbonded.cl
+18
-18
platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
+173
-163
No files found.
platforms/opencl/src/OpenCLArray.h
View file @
c3125421
...
...
@@ -139,6 +139,8 @@ public:
* Copy the values in the Buffer to a vector.
*/
void
download
(
std
::
vector
<
T
>&
data
)
const
{
if
(
data
.
size
()
!=
size
)
data
.
resize
(
size
);
try
{
context
.
getQueue
().
enqueueReadBuffer
(
*
buffer
,
CL_TRUE
,
0
,
size
*
sizeof
(
T
),
&
data
[
0
]);
}
...
...
platforms/opencl/src/OpenCLCompact.cpp
0 → 100644
View file @
c3125421
/* Code for OpenCL stream compaction. Roughly based on:
Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
High Performance Graphics 2009.
Notes:
- paper recommends 128 threads/block, so this is hard coded.
- I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
complicated and performs poorly on current hardware
- I only implement the scattered- and staged-write variant of phase III as it they have reasonable
performance across most of the tested workloads in the paper. The selective variant is not
implemented.
- The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
manner. It is, however, done in a very easy to program manner, and integrated into the top of
phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
dgBlockCounts in stages.
Date: 23 Aug 2009
Author: CUDA version by Imran Haque (ihaque@cs.stanford.edu), converted to OpenCL by Peter Eastman
Affiliation: Stanford University
License: Public Domain
*/
#include "OpenCLCompact.h"
using
namespace
OpenMM
;
OpenCLCompact
::
OpenCLCompact
(
OpenCLContext
&
context
)
:
context
(
context
),
dgBlockCounts
(
NULL
)
{
dgBlockCounts
=
new
OpenCLArray
<
cl_uint
>
(
context
,
context
.
getNumThreadBlocks
(),
"dgBlockCounts"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
loadSourceFromFile
(
"compact.cl"
));
countKernel
=
cl
::
Kernel
(
program
,
"countElts"
);
moveValidKernel
=
cl
::
Kernel
(
program
,
"moveValidElementsStaged"
);
}
OpenCLCompact
::~
OpenCLCompact
()
{
if
(
dgBlockCounts
!=
NULL
)
delete
dgBlockCounts
;
}
void
OpenCLCompact
::
compactStream
(
OpenCLArray
<
cl_uint
>&
dOut
,
OpenCLArray
<
cl_uint
>&
dIn
,
OpenCLArray
<
cl_uint
>&
dValid
,
OpenCLArray
<
cl_uint
>&
numValid
)
{
// Figure out # elements per block
int
len
=
dIn
.
getSize
();
unsigned
int
numBlocks
=
context
.
getNumThreadBlocks
();
if
(
numBlocks
*
128
>
len
)
numBlocks
=
(
len
+
127
)
/
128
;
const
size_t
eltsPerBlock
=
len
/
numBlocks
+
((
len
%
numBlocks
)
?
1
:
0
);
// TODO: implement loop over blocks of 10M
// Phase 1: Calculate number of valid elements per thread block
countKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
dgBlockCounts
->
getDeviceBuffer
());
countKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
dValid
.
getDeviceBuffer
());
countKernel
.
setArg
<
cl_uint
>
(
2
,
len
);
countKernel
.
setArg
(
3
,
128
*
sizeof
(
cl_uint
),
NULL
);
context
.
executeKernel
(
countKernel
,
len
,
128
);
// Phase 2/3: Move valid elements using SIMD compaction
moveValidKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
dIn
.
getDeviceBuffer
());
moveValidKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
dOut
.
getDeviceBuffer
());
moveValidKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
dValid
.
getDeviceBuffer
());
moveValidKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
dgBlockCounts
->
getDeviceBuffer
());
moveValidKernel
.
setArg
<
cl_uint
>
(
4
,
len
);
moveValidKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
numValid
.
getDeviceBuffer
());
moveValidKernel
.
setArg
(
6
,
128
*
sizeof
(
cl_uint
),
NULL
);
moveValidKernel
.
setArg
(
7
,
128
*
sizeof
(
cl_uint
),
NULL
);
moveValidKernel
.
setArg
(
8
,
128
*
sizeof
(
cl_uint
),
NULL
);
context
.
executeKernel
(
moveValidKernel
,
len
,
128
);
}
platforms/opencl/src/OpenCLCompact.h
0 → 100644
View file @
c3125421
#ifndef __OPENMM_OPENCLCOMPACT_H__
#define __OPENMM_OPENCLCOMPACT_H__
/* Code for OPENCL stream compaction. Roughly based on:
Billeter M, Olsson O, Assarsson U. Efficient Stream Compaction on Wide SIMD Many-Core Architectures.
High Performance Graphics 2009.
Notes:
- paper recommends 128 threads/block, so this is hard coded.
- I only implement the prefix-sum based compact primitive, and not the POPC one, as that is more
complicated and performs poorly on current hardware
- I only implement the scattered- and staged-write variant of phase III as it they have reasonable
performance across most of the tested workloads in the paper. The selective variant is not
implemented.
- The prefix sum of per-block element counts (phase II) is not done in a particularly efficient
manner. It is, however, done in a very easy to program manner, and integrated into the top of
phase III, reducing the number of kernel invocations required. If one wanted to use existing code,
it'd be easy to take the CUDA SDK scanLargeArray sample, and do a prefix sum over dgBlockCounts in
a phase II kernel. You could also adapt the existing prescan128 to take an initial value, and scan
dgBlockCounts in stages.
Date: 23 Aug 2009
Author: CUDA version by Imran Haque (ihaque@cs.stanford.edu), converted to OpenCL by Peter Eastman
Affiliation: Stanford University
License: Public Domain
*/
#include "OpenCLArray.h"
namespace
OpenMM
{
class
OpenCLCompact
{
public:
OpenCLCompact
(
OpenCLContext
&
context
);
~
OpenCLCompact
();
void
compactStream
(
OpenCLArray
<
cl_uint
>&
dOut
,
OpenCLArray
<
cl_uint
>&
dIn
,
OpenCLArray
<
cl_uint
>&
dValid
,
OpenCLArray
<
cl_uint
>&
numValid
);
private:
OpenCLContext
&
context
;
OpenCLArray
<
cl_uint
>*
dgBlockCounts
;
cl
::
Kernel
countKernel
;
cl
::
Kernel
moveValidKernel
;
};
}
// namespace OpenMM
#endif // __OPENMM_OPENCLCOMPACT_H__
\ No newline at end of file
platforms/opencl/src/OpenCLContext.cpp
View file @
c3125421
...
...
@@ -58,6 +58,9 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
device
=
devices
[
deviceIndex
];
if
(
device
.
getInfo
<
CL_DEVICE_MAX_WORK_ITEM_SIZES
>
()[
0
]
<
minThreadBlockSize
)
throw
OpenMMException
(
"The specified OpenCL device is not compatible with OpenMM"
);
compilationOptions
=
"-cl-fast-relaxed-math"
;
if
(
device
.
getInfo
<
CL_DEVICE_VENDOR
>
()
==
"NVIDIA"
)
compilationOptions
+=
" -DWARPS_ARE_ATOMIC"
;
queue
=
cl
::
CommandQueue
(
context
,
device
);
numAtoms
=
numParticles
;
paddedNumAtoms
=
TileSize
*
((
numParticles
+
TileSize
-
1
)
/
TileSize
);
...
...
@@ -132,21 +135,30 @@ string OpenCLContext::loadSourceFromFile(const string& filename) const {
return
kernel
;
}
cl
::
Program
OpenCLContext
::
createProgram
(
const
std
::
string
source
)
{
cl
::
Program
OpenCLContext
::
createProgram
(
const
string
source
)
{
return
createProgram
(
source
,
map
<
string
,
string
>
());
}
cl
::
Program
OpenCLContext
::
createProgram
(
const
string
source
,
const
map
<
string
,
string
>&
defines
)
{
cl
::
Program
::
Sources
sources
(
1
,
make_pair
(
source
.
c_str
(),
source
.
size
()));
cl
::
Program
program
(
context
,
sources
);
string
options
=
compilationOptions
;
for
(
map
<
string
,
string
>::
const_iterator
iter
=
defines
.
begin
();
iter
!=
defines
.
end
();
++
iter
)
options
+=
" -D"
+
iter
->
first
+
"="
+
iter
->
second
;
try
{
program
.
build
(
vector
<
cl
::
Device
>
(
1
,
device
));
program
.
build
(
vector
<
cl
::
Device
>
(
1
,
device
)
,
options
.
c_str
()
);
}
catch
(
cl
::
Error
err
)
{
throw
OpenMMException
(
"Error compiling kernel: "
+
program
.
getBuildInfo
<
CL_PROGRAM_BUILD_LOG
>
(
device
));
}
return
program
;
}
void
OpenCLContext
::
executeKernel
(
cl
::
Kernel
&
kernel
,
int
workUnits
)
{
int
size
=
std
::
min
((
workUnits
+
ThreadBlockSize
-
1
)
/
ThreadBlockSize
,
numThreadBlocks
)
*
ThreadBlockSize
;
void
OpenCLContext
::
executeKernel
(
cl
::
Kernel
&
kernel
,
int
workUnits
,
int
workUnitSize
)
{
if
(
workUnitSize
==
-
1
)
workUnitSize
=
ThreadBlockSize
;
int
size
=
std
::
min
((
workUnits
+
workUnitSize
-
1
)
/
workUnitSize
,
numThreadBlocks
)
*
workUnitSize
;
try
{
queue
.
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
size
),
cl
::
NDRange
(
ThreadBlock
Size
));
queue
.
enqueueNDRangeKernel
(
kernel
,
cl
::
NullRange
,
cl
::
NDRange
(
size
),
cl
::
NDRange
(
workUnit
Size
));
}
catch
(
cl
::
Error
err
)
{
stringstream
str
;
...
...
platforms/opencl/src/OpenCLContext.h
View file @
c3125421
...
...
@@ -28,6 +28,8 @@
* -------------------------------------------------------------------------- */
#define __CL_ENABLE_EXCEPTIONS
#include <map>
#include <string>
#include <cl.hpp>
namespace
OpenMM
{
...
...
@@ -139,13 +141,20 @@ public:
* Create an OpenCL Program from source code.
*/
cl
::
Program
createProgram
(
const
std
::
string
source
);
/**
* Create an OpenCL Program from source code.
*
* @param defines a set of preprocessor definitions (name, value) to define when compiling the program
*/
cl
::
Program
createProgram
(
const
std
::
string
source
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
defines
);
/**
* Execute a kernel.
*
* @param kernel the kernel to execute
* @param workUnits the maximum number of work units that should be used
* @param kernel the kernel to execute
* @param workUnits the maximum number of work units that should be used
* @param workUnitSize the size of each work unit to use
*/
void
executeKernel
(
cl
::
Kernel
&
kernel
,
int
workUnits
);
void
executeKernel
(
cl
::
Kernel
&
kernel
,
int
workUnits
,
int
workUnitSize
=
-
1
);
/**
* Set all elements of an array to 0.
*/
...
...
@@ -226,7 +235,7 @@ public:
/**
* Get the OpenCLNonbondedUtilities for this context.
*/
OpenCLNonbondedUtilities
&
getNonbondedUtilties
()
{
OpenCLNonbondedUtilities
&
getNonbondedUtil
i
ties
()
{
return
*
nonbonded
;
}
private:
...
...
@@ -237,6 +246,7 @@ private:
int
numAtomBlocks
;
int
numThreadBlocks
;
int
numForceBuffers
;
std
::
string
compilationOptions
;
cl
::
Context
context
;
cl
::
Device
device
;
cl
::
CommandQueue
queue
;
...
...
platforms/opencl/src/OpenCLKernels.cpp
View file @
c3125421
...
...
@@ -47,17 +47,17 @@ void OpenCLCalcForcesAndEnergyKernel::initialize(const System& system) {
void
OpenCLCalcForcesAndEnergyKernel
::
beginForceComputation
(
ContextImpl
&
context
)
{
cl
.
clearBuffer
(
cl
.
getForceBuffers
());
cl
.
getNonbondedUtilties
().
prepareInteractions
();
cl
.
getNonbondedUtil
i
ties
().
prepareInteractions
();
}
void
OpenCLCalcForcesAndEnergyKernel
::
finishForceComputation
(
ContextImpl
&
context
)
{
cl
.
reduceBuffer
(
cl
.
getForceBuffers
(),
cl
.
getNumForceBuffers
());
cl
.
getNonbondedUtilties
().
prepareInteractions
();
cl
.
getNonbondedUtil
i
ties
().
prepareInteractions
();
}
void
OpenCLCalcForcesAndEnergyKernel
::
beginEnergyComputation
(
ContextImpl
&
context
)
{
cl
.
clearBuffer
(
cl
.
getEnergyBuffer
());
cl
.
getNonbondedUtilties
().
prepareInteractions
();
cl
.
getNonbondedUtil
i
ties
().
prepareInteractions
();
}
double
OpenCLCalcForcesAndEnergyKernel
::
finishEnergyComputation
(
ContextImpl
&
context
)
{
...
...
@@ -566,8 +566,8 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
// }
// data.nonbondedMethod = method;
// gpuSetCoulombParameters(gpu, 138.935485f, particle, c6, c12, q, symbol, exclusionList, method);
cl
.
getNonbondedUtilties
().
addInteraction
(
useCutoff
,
usePeriodic
,
force
.
getCutoffDistance
(),
exclusionList
);
cl
.
getNonbondedUtilties
().
addParameter
(
"sigmaEpsilon"
,
"float2"
,
8
,
sigmaEpsilon
->
getDeviceBuffer
());
cl
.
getNonbondedUtil
i
ties
().
addInteraction
(
useCutoff
,
usePeriodic
,
force
.
getCutoffDistance
(),
exclusionList
);
cl
.
getNonbondedUtil
i
ties
().
addParameter
(
"sigmaEpsilon"
,
"float2"
,
8
,
sigmaEpsilon
->
getDeviceBuffer
());
cutoffSquared
=
force
.
getCutoffDistance
()
*
force
.
getCutoffDistance
();
// Compute the Ewald self energy.
...
...
@@ -582,7 +582,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
// Initialize the exceptions.
int
numExceptions
=
exceptions
.
size
();
int
maxBuffers
=
1
;
int
maxBuffers
=
cl
.
getNonbondedUtilities
().
getNumForceBuffers
()
;
if
(
numExceptions
>
0
)
{
exceptionParams
=
new
OpenCLArray
<
mm_float4
>
(
cl
,
numExceptions
,
"exceptionParams"
);
exceptionIndices
=
new
OpenCLArray
<
mm_int4
>
(
cl
,
numExceptions
,
"exceptionIndices"
);
...
...
@@ -607,13 +607,13 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
}
void
OpenCLCalcNonbondedForceKernel
::
executeForces
(
ContextImpl
&
context
)
{
cl
.
getNonbondedUtilties
().
computeInteractions
();
cl
.
getNonbondedUtil
i
ties
().
computeInteractions
();
if
(
exceptionIndices
!=
NULL
)
{
int
numExceptions
=
exceptionIndices
->
getSize
();
exceptionsKernel
.
setArg
<
cl_int
>
(
0
,
cl
.
getPaddedNumAtoms
());
exceptionsKernel
.
setArg
<
cl_int
>
(
1
,
numExceptions
);
exceptionsKernel
.
setArg
<
cl_float
>
(
2
,
cutoffSquared
);
exceptionsKernel
.
setArg
<
mm_float4
>
(
3
,
cl
.
getNonbondedUtilties
().
getPeriodicBoxSize
());
exceptionsKernel
.
setArg
<
mm_float4
>
(
3
,
cl
.
getNonbondedUtil
i
ties
().
getPeriodicBoxSize
());
exceptionsKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
cl
.
getForceBuffers
().
getDeviceBuffer
());
exceptionsKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
exceptionsKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
cl
.
getPosq
().
getDeviceBuffer
());
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
c3125421
...
...
@@ -26,13 +26,25 @@
#include "OpenCLNonbondedUtilities.h"
#include "OpenCLArray.h"
#include "OpenCLCompact.h"
#include <map>
using
namespace
OpenMM
;
using
namespace
std
;
OpenCLNonbondedUtilities
::
OpenCLNonbondedUtilities
(
OpenCLContext
&
context
)
:
context
(
context
),
cutoff
(
-
1.0
),
useCutoff
(
false
),
numForceBuffers
(
0
),
tiles
(
NULL
),
exclusionIndex
(
NULL
),
exclusions
(
NULL
)
{
numForceBuffers
(
0
),
tiles
(
NULL
),
exclusionIndex
(
NULL
),
exclusions
(
NULL
),
interactingTiles
(
NULL
),
interactionFlags
(
NULL
),
interactionCount
(
NULL
),
blockCenter
(
NULL
),
blockBoundingBox
(
NULL
),
compact
(
NULL
)
{
// Decide how many force buffers to use.
forceBufferPerAtomBlock
=
false
;
numForceBuffers
=
context
.
getNumThreadBlocks
()
*
OpenCLContext
::
ThreadBlockSize
/
OpenCLContext
::
TileSize
;
if
(
numForceBuffers
>=
context
.
getNumAtomBlocks
())
{
// For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
forceBufferPerAtomBlock
=
true
;
numForceBuffers
=
context
.
getNumAtomBlocks
();
}
}
OpenCLNonbondedUtilities
::~
OpenCLNonbondedUtilities
()
{
...
...
@@ -42,6 +54,18 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete
exclusionIndex
;
if
(
exclusions
!=
NULL
)
delete
exclusions
;
if
(
interactingTiles
!=
NULL
)
delete
interactingTiles
;
if
(
interactionFlags
!=
NULL
)
delete
interactionFlags
;
if
(
interactionCount
!=
NULL
)
delete
interactionCount
;
if
(
blockCenter
!=
NULL
)
delete
blockCenter
;
if
(
blockBoundingBox
!=
NULL
)
delete
blockBoundingBox
;
if
(
compact
!=
NULL
)
delete
compact
;
}
void
OpenCLNonbondedUtilities
::
addInteraction
(
bool
usesCutoff
,
bool
usesPeriodic
,
double
cutoffDistance
,
const
std
::
vector
<
std
::
vector
<
int
>
>&
exclusionList
)
{
...
...
@@ -90,21 +114,17 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
for
(
unsigned
int
x
=
y
;
x
<
numAtomBlocks
;
x
++
)
tileVec
[
count
++
]
=
(
x
<<
17
)
|
(
y
<<
2
);
// Decide how many force buffers to use.
bool
forceBufferPerAtomBlock
=
false
;
numForceBuffers
=
context
.
getNumThreadBlocks
()
*
OpenCLContext
::
ThreadBlockSize
/
OpenCLContext
::
TileSize
;
if
(
numForceBuffers
>=
numAtomBlocks
)
{
// For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
forceBufferPerAtomBlock
=
true
;
numForceBuffers
=
numAtomBlocks
;
}
// Create kernels.
cl
::
Program
forceProgram
=
context
.
createProgram
(
context
.
loadSourceFromFile
(
"nonbonded.cl"
));
map
<
string
,
string
>
defines
;
if
(
forceBufferPerAtomBlock
)
defines
[
"USE_OUTPUT_BUFFER_PER_BLOCK"
]
=
"true"
;
cl
::
Program
forceProgram
=
context
.
createProgram
(
context
.
loadSourceFromFile
(
"nonbonded.cl"
),
defines
);
forceKernel
=
cl
::
Kernel
(
forceProgram
,
"computeNonbonded"
);
cl
::
Program
interactingBlocksProgram
=
context
.
createProgram
(
context
.
loadSourceFromFile
(
"findInteractingBlocks.cl"
),
defines
);
findBlockBoundsKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findBlockBounds"
);
findInteractingBlocksKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findBlocksWithInteractions"
);
findInteractionsWithinBlocksKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findInteractionsWithinBlocks"
);
// Mark which tiles have exclusions.
...
...
@@ -182,16 +202,60 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
tiles
->
upload
(
tileVec
);
exclusions
->
upload
(
exclusionVec
);
exclusionIndex
->
upload
(
exclusionIndexVec
);
// Record the periodic box size.
Vec3
boxVectors
[
3
];
system
.
getPeriodicBoxVectors
(
boxVectors
[
0
],
boxVectors
[
1
],
boxVectors
[
2
]);
periodicBoxSize
=
(
mm_float4
)
{(
float
)
boxVectors
[
0
][
0
],
(
float
)
boxVectors
[
1
][
1
],
(
float
)
boxVectors
[
2
][
2
],
0.0
f
};
// Create data structures for the neighbor list.
if
(
useCutoff
)
{
interactingTiles
=
new
OpenCLArray
<
cl_uint
>
(
context
,
numTiles
,
"interactingTiles"
);
interactionFlags
=
new
OpenCLArray
<
cl_uint
>
(
context
,
numTiles
,
"interactionFlags"
);
interactionCount
=
new
OpenCLArray
<
cl_uint
>
(
context
,
1
,
"interactionCount"
);
blockCenter
=
new
OpenCLArray
<
mm_float4
>
(
context
,
numAtomBlocks
,
"blockCenter"
);
blockBoundingBox
=
new
OpenCLArray
<
mm_float4
>
(
context
,
numAtomBlocks
,
"blockBoundingBox"
);
compact
=
new
OpenCLCompact
(
context
);
}
}
void
OpenCLNonbondedUtilities
::
prepareInteractions
()
{
hasComputedInteractions
=
false
;
if
(
!
useCutoff
)
return
;
// TODO compute the neighbor list
// Compute the neighbor list.
findBlockBoundsKernel
.
setArg
<
cl_int
>
(
0
,
context
.
getNumAtoms
());
findBlockBoundsKernel
.
setArg
<
mm_float4
>
(
1
,
periodicBoxSize
);
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
context
.
getPosq
().
getDeviceBuffer
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
blockCenter
->
getDeviceBuffer
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
blockBoundingBox
->
getDeviceBuffer
());
context
.
executeKernel
(
findBlockBoundsKernel
,
context
.
getNumAtoms
());
findInteractingBlocksKernel
.
setArg
<
cl_int
>
(
0
,
tiles
->
getSize
());
findInteractingBlocksKernel
.
setArg
<
cl_float
>
(
1
,
cutoff
*
cutoff
);
findInteractingBlocksKernel
.
setArg
<
mm_float4
>
(
2
,
periodicBoxSize
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
tiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
blockCenter
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
blockBoundingBox
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactionFlags
->
getDeviceBuffer
());
context
.
executeKernel
(
findInteractingBlocksKernel
,
context
.
getNumAtoms
());
compact
->
compactStream
(
*
interactingTiles
,
*
tiles
,
*
interactionFlags
,
*
interactionCount
);
findInteractionsWithinBlocksKernel
.
setArg
<
cl_float
>
(
0
,
cutoff
*
cutoff
);
findInteractionsWithinBlocksKernel
.
setArg
<
mm_float4
>
(
1
,
periodicBoxSize
);
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
context
.
getPosq
().
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
tiles
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
blockCenter
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
blockBoundingBox
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
interactionFlags
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionCount
->
getDeviceBuffer
());
findInteractionsWithinBlocksKernel
.
setArg
(
8
,
OpenCLContext
::
ThreadBlockSize
*
sizeof
(
cl_uint
),
NULL
);
context
.
executeKernel
(
findInteractionsWithinBlocksKernel
,
context
.
getNumAtoms
());
}
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.h
View file @
c3125421
...
...
@@ -34,6 +34,8 @@
namespace
OpenMM
{
class
OpenCLCompact
;
/**
* This class implements features that are used by several different force. It provides
* a generic interface for calculating nonbonded interactions.
...
...
@@ -86,17 +88,62 @@ public:
* prepareInteractions(). Additional calls return immediately without doing anything.
*/
void
computeInteractions
();
/**
* Get the array containing the center of each atom block.
*/
OpenCLArray
<
mm_float4
>&
getBlockCenters
()
{
return
*
blockCenter
;
}
/**
* Get the array containing the dimensions of each atom block.
*/
OpenCLArray
<
mm_float4
>&
getBlockBoundingBoxes
()
{
return
*
blockBoundingBox
;
}
/**
* Get the array containing the full set of tiles.
*/
OpenCLArray
<
cl_uint
>&
getTiles
()
{
return
*
tiles
;
}
/**
* Get the array whose first element contains the number of tiles with interactions.
*/
OpenCLArray
<
cl_uint
>&
getInteractionCount
()
{
return
*
interactionCount
;
}
/**
* Get the array containing tiles with interactions.
*/
OpenCLArray
<
cl_uint
>&
getInteractingTiles
()
{
return
*
interactingTiles
;
}
/**
* Get the array containing flags for tiles with interactions.
*/
OpenCLArray
<
cl_uint
>&
getInteractionFlags
()
{
return
*
interactionFlags
;
}
private:
class
ParameterInfo
;
OpenCLContext
&
context
;
cl
::
Kernel
forceKernel
;
cl
::
Kernel
findBlockBoundsKernel
;
cl
::
Kernel
findInteractingBlocksKernel
;
cl
::
Kernel
findInteractionsWithinBlocksKernel
;
OpenCLArray
<
cl_uint
>*
tiles
;
OpenCLArray
<
cl_uint
>*
exclusionIndex
;
OpenCLArray
<
cl_uint
>*
exclusions
;
OpenCLArray
<
cl_uint
>*
interactingTiles
;
OpenCLArray
<
cl_uint
>*
interactionFlags
;
OpenCLArray
<
cl_uint
>*
interactionCount
;
OpenCLArray
<
mm_float4
>*
blockCenter
;
OpenCLArray
<
mm_float4
>*
blockBoundingBox
;
std
::
vector
<
std
::
vector
<
int
>
>
atomExclusions
;
std
::
vector
<
ParameterInfo
>
parameters
;
OpenCLCompact
*
compact
;
double
cutoff
;
bool
useCutoff
,
usePeriodic
,
hasComputedInteractions
;
bool
useCutoff
,
usePeriodic
,
forceBufferPerAtomBlock
,
hasComputedInteractions
;
int
numForceBuffers
;
mm_float4
periodicBoxSize
;
};
...
...
platforms/opencl/src/kernels/compact.cl
0 → 100644
View file @
c3125421
/*
Code
for
CUDA
stream
compaction.
Roughly
based
on:
Billeter
M,
Olsson
O,
Assarsson
U.
Efficient
Stream
Compaction
on
Wide
SIMD
Many-Core
Architectures.
High
Performance
Graphics
2009.
Notes:
-
paper
recommends
128
threads/block,
so
this
is
hard
coded.
-
I
only
implement
the
prefix-sum
based
compact
primitive,
and
not
the
POPC
one,
as
that
is
more
complicated
and
performs
poorly
on
current
hardware
-
I
only
implement
the
scattered-
and
staged-write
variant
of
phase
III
as
it
they
have
reasonable
performance
across
most
of
the
tested
workloads
in
the
paper.
The
selective
variant
is
not
implemented.
-
The
prefix
sum
of
per-block
element
counts
(
phase
II
)
is
not
done
in
a
particularly
efficient
manner.
It
is,
however,
done
in
a
very
easy
to
program
manner,
and
integrated
into
the
top
of
phase
III,
reducing
the
number
of
kernel
invocations
required.
If
one
wanted
to
use
existing
code,
it
'd
be
easy
to
take
the
CUDA
SDK
scanLargeArray
sample,
and
do
a
prefix
sum
over
dgBlockCounts
in
a
phase
II
kernel.
You
could
also
adapt
the
existing
prescan128
to
take
an
initial
value,
and
scan
dgBlockCounts
in
stages.
Date:
23
Aug
2009
Author:
CUDA
version
by
Imran
Haque
(
ihaque@cs.stanford.edu
)
,
converted
to
OpenCL
by
Peter
Eastman
Affiliation:
Stanford
University
License:
Public
Domain
*/
//
Phase
1:
Count
valid
elements
per
thread
block
//
Hard-code
128
thd/blk
unsigned
int
sumReduce128
(
__local
unsigned
int*
arr
)
{
//
Parallel
reduce
element
counts
//
Assumes
128
thd/block
int
thread
=
get_local_id
(
0
)
;
if
(
thread
<
64
)
arr[thread]
+=
arr[thread+64]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
ifdef
WARPS_ARE_ATOMIC
if
(
thread
<
32
)
{
arr[thread]
+=
arr[thread+32]
;
if
(
thread
<
16
)
arr[thread]
+=
arr[thread+16]
;
if
(
thread
<
8
)
arr[thread]
+=
arr[thread+8]
;
if
(
thread
<
4
)
arr[thread]
+=
arr[thread+4]
;
if
(
thread
<
2
)
arr[thread]
+=
arr[thread+2]
;
if
(
thread
<
1
)
arr[thread]
+=
arr[thread+1]
;
}
#
else
if
(
thread
<
32
)
arr[thread]
+=
arr[thread+32]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
thread
<
16
)
arr[thread]
+=
arr[thread+16]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
thread
<
8
)
arr[thread]
+=
arr[thread+8]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
thread
<
4
)
arr[thread]
+=
arr[thread+4]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
thread
<
2
)
arr[thread]
+=
arr[thread+2]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
thread
<
1
)
arr[thread]
+=
arr[thread+1]
;
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
return
arr[0]
;
}
__kernel
void
countElts
(
__global
unsigned
int*
dgBlockCounts,
__global
unsigned
int*
dgValid,
const
unsigned
int
len,
__local
unsigned
int*
dsCount
)
{
dsCount[get_local_id
(
0
)
]
=
0
;
unsigned
int
ub
;
const
unsigned
int
eltsPerBlock
=
len/get_num_groups
(
0
)
+
((
len
%
get_num_groups
(
0
))
?
1
:
0
)
;
ub
=
(
len
<
(
get_group_id
(
0
)
+1
)
*eltsPerBlock
)
?
len
:
((
get_group_id
(
0
)
+
1
)
*eltsPerBlock
)
;
for
(
int
base
=
get_group_id
(
0
)
*
eltsPerBlock
; base < (get_group_id(0)+1)*eltsPerBlock; base += get_local_size(0)) {
if
((
base
+
get_local_id
(
0
))
<
ub
&&
dgValid[base+get_local_id
(
0
)
]
)
dsCount[get_local_id
(
0
)
]++
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
unsigned
int
blockCount
=
sumReduce128
(
dsCount
)
;
if
(
get_local_id
(
0
)
==
0
)
dgBlockCounts[get_group_id
(
0
)
]
=
blockCount
;
return
;
}
//
Phase
2/3:
Move
valid
elements
using
SIMD
compaction
(
phase
2
is
done
implicitly
at
top
of
__global__
method
)
//
Exclusive
prefix
scan
over
128
elements
//
Assumes
128
threads
//
Taken
from
cuda
SDK
"scan"
sample
for
naive
scan,
with
small
modifications
int
exclusivePrescan128
(
__local
const
unsigned
int*
in,
__local
unsigned
int*
outAndTemp
)
{
const
int
n=128
;
//TODO:
this
temp
storage
could
be
reduced
since
we
write
to
shared
memory
in
out
anyway,
and
n
is
hardcoded
//__shared__
int
temp[2*n]
;
__local
unsigned
int*
temp
=
outAndTemp
;
int
pout
=
1
,
pin
=
0
;
//
load
input
into
temp
//
This
is
exclusive
scan,
so
shift
right
by
one
and
set
first
elt
to
0
int
thread
=
get_local_id
(
0
)
;
temp[pout*n
+
get_local_id
(
0
)
]
=
(
get_local_id
(
0
)
>
0
)
?
in[get_local_id
(
0
)
-1]
:
0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
offset
=
1
; offset < n; offset *= 2)
{
pout
=
1
-
pout
; // swap double buffer indices
pin
=
1
-
pout
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
temp[pout*n+get_local_id
(
0
)
]
=
temp[pin*n+get_local_id
(
0
)
]
;
if
(
get_local_id
(
0
)
>=
offset
)
temp[pout*n+get_local_id
(
0
)
]
+=
temp[pin*n+get_local_id
(
0
)
-
offset]
;
}
//out[get_local_id
(
0
)
]
=
temp[pout*n+get_local_id
(
0
)
]
; // write output
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
return
outAndTemp[127]+in[127]
; // Return sum of all elements
}
int
compactSIMDPrefixSum
(
__local
const
unsigned
int*
dsData,
__local
const
unsigned
int*
dsValid,
__local
unsigned
int*
dsCompact,
__local
unsigned
int*
dsLocalIndex
)
{
int
numValid
=
exclusivePrescan128
(
dsValid,dsLocalIndex
)
;
int
thread
=
get_local_id
(
0
)
;
if
(
dsValid[get_local_id
(
0
)
]
)
dsCompact[dsLocalIndex[get_local_id
(
0
)
]]
=
dsData[get_local_id
(
0
)
]
;
return
numValid
;
}
__kernel
void
moveValidElementsStaged
(
__global
unsigned
int*
dgData,
__global
unsigned
int*
dgCompact,
__global
unsigned
int*
dgValid,
__global
unsigned
int*
dgBlockCounts,
unsigned
int
len,
__global
unsigned
int*
dNumValidElements,
__local
unsigned
int*
inBlock,
__local
unsigned
int*
validBlock,
__local
unsigned
int*
compactBlock
)
{
__local
unsigned
int
dsLocalIndex[256]
;
int
blockOutOffset=0
;
//
Sum
up
the
blockCounts
before
us
to
find
our
offset
//
This
is
totally
inefficient
-
lots
of
repeated
work
b/w
blocks,
and
uneven
balancing.
//
Paper
implements
this
as
a
prefix
sum
kernel
in
phase
II
//
May
still
be
faster
than
an
extra
kernel
invocation?
int
thread
=
get_local_id
(
0
)
;
for
(
int
base
=
0
; base < get_group_id(0); base += get_local_size(0)) {
//
Load
up
the
count
of
valid
elements
for
each
block
before
us
in
batches
of
128
if
((
base
+
get_local_id
(
0
))
<
get_group_id
(
0
))
{
validBlock[get_local_id
(
0
)
]
=
dgBlockCounts[base+get_local_id
(
0
)
]
;
}
else
{
validBlock[get_local_id
(
0
)
]
=
0
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Parallel
reduce
these
counts
//
Accumulate
in
the
final
offset
variable
blockOutOffset
+=
sumReduce128
(
validBlock
)
;
}
unsigned
int
ub
;
const
unsigned
int
eltsPerBlock
=
len/get_num_groups
(
0
)
+
((
len
%
get_num_groups
(
0
))
?
1
:
0
)
;
ub
=
(
len
<
(
get_group_id
(
0
)
+1
)
*eltsPerBlock
)
?
len
:
((
get_group_id
(
0
)
+
1
)
*eltsPerBlock
)
;
for
(
int
base
=
get_group_id
(
0
)
*
eltsPerBlock
; base < (get_group_id(0)+1)*eltsPerBlock; base += get_local_size(0)) {
if
((
base
+
get_local_id
(
0
))
<
ub
)
{
validBlock[get_local_id
(
0
)
]
=
dgValid[base+get_local_id
(
0
)
]
;
inBlock[get_local_id
(
0
)
]
=
dgData[base+get_local_id
(
0
)
]
;
}
else
{
validBlock[get_local_id
(
0
)
]
=
0
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
numValidBlock
=
compactSIMDPrefixSum
(
inBlock,validBlock,compactBlock,dsLocalIndex
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
numValidBlock
)
{
dgCompact[blockOutOffset
+
get_local_id
(
0
)
]
=
compactBlock[get_local_id
(
0
)
]
;
}
blockOutOffset
+=
numValidBlock
;
}
if
(
get_group_id
(
0
)
==
(
get_num_groups
(
0
)
-1
)
&&
get_local_id
(
0
)
==
0
)
{
*dNumValidElements
=
blockOutOffset
;
}
}
platforms/opencl/src/kernels/findInteractingBlocks.cl
0 → 100644
View file @
c3125421
const
int
BlockSize
=
32
;
/**
*
Find
a
bounding
box
for
the
atoms
in
each
block.
*/
__kernel
void
findBlockBounds
(
int
numAtoms,
float4
periodicBoxSize,
__global
float4*
posq,
__global
float4*
blockCenter,
__global
float4*
blockBoundingBox
)
{
int
index
=
get_global_id
(
0
)
;
int
base
=
index*BlockSize
;
while
(
base
<
numAtoms
)
{
float4
pos
=
posq[base]
;
#
ifdef
USE_PERIODIC
pos.x
-=
floor
(
pos.x/periodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y/periodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z/periodicBoxSize.z
)
*periodicBoxSize.z
;
float4
firstPoint
=
pos
;
#
endif
float4
minPos
=
pos
;
float4
maxPos
=
pos
;
int
last
=
min
(
base+BlockSize,
numAtoms
)
;
for
(
int
i
=
base+1
; i < last; i++) {
pos
=
posq[i]
;
#
ifdef
USE_PERIODIC
pos.x
-=
floor
((
pos.x-firstPoint.x
)
/periodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
pos.y
-=
floor
((
pos.y-firstPoint.y
)
/periodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
pos.z
-=
floor
((
pos.z-firstPoint.z
)
/periodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
minPos
=
min
(
minPos,
pos
)
;
maxPos
=
max
(
maxPos,
pos
)
;
}
blockBoundingBox[index]
=
0.5f*
(
maxPos-minPos
)
;
blockCenter[index]
=
0.5f*
(
maxPos+minPos
)
;
index
+=
get_global_size
(
0
)
;
base
=
index*BlockSize
;
}
}
/**
*
Compare
the
bounding
boxes
for
each
pair
of
blocks.
If
they
are
sufficiently
far
apart,
*
mark
them
as
non-interacting.
*/
__kernel
void
findBlocksWithInteractions
(
int
numTiles,
float
cutoffSquared,
float4
periodicBoxSize,
__global
unsigned
int*
tiles,
__global
float4*
blockCenter,
__global
float4*
blockBoundingBox,
__global
unsigned
int*
interactionFlag
)
{
int
index
=
get_global_id
(
0
)
;
while
(
index
<
numTiles
)
{
//
Extract
cell
coordinates
from
appropriate
work
unit
unsigned
int
x
=
tiles[index]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
x
=
(
x
>>
17
)
;
//
Find
the
distance
between
the
bounding
boxes
of
the
two
cells.
float4
delta
=
blockCenter[x]-blockCenter[y]
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/periodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y/periodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z/periodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
float4
boxSizea
=
blockBoundingBox[x]
;
float4
boxSizeb
=
blockBoundingBox[y]
;
delta.x
=
max
(
0.0f,
fabs
(
delta.x
)
-boxSizea.x-boxSizeb.x
)
;
delta.y
=
max
(
0.0f,
fabs
(
delta.y
)
-boxSizea.y-boxSizeb.y
)
;
delta.z
=
max
(
0.0f,
fabs
(
delta.z
)
-boxSizea.z-boxSizeb.z
)
;
interactionFlag[index]
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
>
cutoffSquared
?
0
:
1
)
;
index
+=
get_global_size
(
0
)
;
}
}
/**
*
Compare
each
atom
in
one
block
to
the
bounding
box
of
another
block,
and
set
*
flags
for
which
ones
are
interacting.
*/
__kernel
void
findInteractionsWithinBlocks
(
float
cutoffSquared,
float4
periodicBoxSize,
__global
float4*
posq,
__global
unsigned
int*
tiles,
__global
float4*
blockCenter,
__global
float4*
blockBoundingBox,
__global
unsigned
int*
interactionFlags,
__global
unsigned
int*
interactionCount,
__local
unsigned
int*
flags
)
{
unsigned
int
totalWarps
=
get_global_size
(
0
)
/BlockSize
;
unsigned
int
warp
=
get_global_id
(
0
)
/BlockSize
;
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
warp*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
index
=
get_local_id
(
0
)
&
(
BlockSize
-
1
)
;
unsigned
int
lasty
=
0xFFFFFFFF
;
float4
apos
;
while
(
pos
<
end
)
{
//
Extract
cell
coordinates
from
appropriate
work
unit
unsigned
int
x
=
tiles[pos]
;
unsigned
int
y
=
((
x
>>
2
)
&
0x7fff
)
;
bool
bExclusionFlag
=
(
x
&
0x1
)
;
x
=
(
x
>>
17
)
;
if
(
x
==
y
||
bExclusionFlag
)
{
//
Assume
this
block
will
be
dense.
if
(
index
==
0
)
interactionFlags[pos]
=
0xFFFFFFFF
;
}
else
{
//
Load
the
bounding
box
for
x
and
the
atom
positions
for
y.
float4
center
=
blockCenter[x]
;
float4
boxSize
=
blockBoundingBox[x]
;
if
(
y
!=
lasty
)
apos
=
posq[
(
y*BlockSize
)
+index]
;
//
Find
the
distance
of
the
atom
from
the
bounding
box.
float4
delta
=
apos-center
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x/periodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y/periodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z/periodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
delta
=
max
((
float4
)
0.0f,
fabs
(
delta
)
-boxSize
)
;
int
thread
=
get_local_id
(
0
)
;
flags[thread]
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
>
cutoffSquared
?
0
:
1
<<
index
)
;
//
Sum
the
flags.
if
(
index
%
2
==
0
)
flags[thread]
+=
flags[thread+1]
;
if
(
index
%
4
==
0
)
flags[thread]
+=
flags[thread+2]
;
if
(
index
%
8
==
0
)
flags[thread]
+=
flags[thread+4]
;
if
(
index
%
16
==
0
)
flags[thread]
+=
flags[thread+8]
;
if
(
index
==
0
)
{
unsigned
int
allFlags
=
flags[thread]
+
flags[thread+16]
;
//
Count
how
many
flags
are
set,
and
based
on
that
decide
whether
to
compute
all
interactions
//
or
only
a
fraction
of
them.
unsigned
int
bits
=
(
allFlags&0x55555555
)
+
((
allFlags>>1
)
&0x55555555
)
;
bits
=
(
bits&0x33333333
)
+
((
bits>>2
)
&0x33333333
)
;
bits
=
(
bits&0x0F0F0F0F
)
+
((
bits>>4
)
&0x0F0F0F0F
)
;
bits
=
(
bits&0x00FF00FF
)
+
((
bits>>8
)
&0x00FF00FF
)
;
bits
=
(
bits&0x0000FFFF
)
+
((
bits>>16
)
&0x0000FFFF
)
;
interactionFlags[pos]
=
(
bits
>
12
?
0xFFFFFFFF
:
allFlags
)
;
}
lasty
=
y
;
}
pos++
;
}
}
platforms/opencl/src/kernels/nonbonded.cl
View file @
c3125421
...
...
@@ -84,16 +84,16 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*paddedNumAtoms;
of = forceBuffers[offset];
of.xyz += af.xyz;
forceBuffers[offset] = of;
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
of.xyz = af.xyz;
of.w = 0.0f;
unsigned int offset = x + tgx + (x/TileSize) * paddedNumAtoms;
forceBuffers[offset] = of;
#else
unsigned int offset = x + tgx + warp*paddedNumAtoms;
of = forceBuffers[offset];
of.xyz += af.xyz;
forceBuffers[offset] = of;
#endif
}
else {
...
...
@@ -169,14 +169,14 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu
}
}
}
else
// bExclusion
else
#endif
{
// Read fixed atom data into registers and GRF
unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*paddedNumAtoms/TileSize-yi*(yi+1)/2;
unsigned int excl = exclusions[exclusionIndices[tile]+tgx];
unsigned int excl =
(hasExclusions ?
exclusions[exclusionIndices[tile]+tgx]
: 0xFFFFFFFF)
;
excl = (excl >> tgx) |
(
excl
<<
(
TileSize
-
tgx
))
;
for
(
unsigned
int
j
=
0
; j < TileSize; j++) {
bool
isExcluded
=
!
(
excl
&
0x1
)
;
...
...
@@ -222,16 +222,7 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu
//
Write
results
float4
of
;
#
ifdef
USE_OUTPUT_BUFFER_PER_WARP
unsigned
int
offset
=
x
+
tgx
+
warp*paddedNumAtoms
;
of
=
forceBuffers[offset]
;
of.xyz
+=
af.xyz
;
forceBuffers[offset]
=
of
;
offset
=
y
+
tgx
+
warp*paddedNumAtoms
;
of
=
forceBuffers[offset]
;
of.xyz
+=
local_foce[get_local_id
(
0
)
].xyz
;
forceBuffers[offset]
=
of
;
#
else
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
of.xyz
=
af.xyz
;
of.w
=
0.0f
;
unsigned
int
offset
=
x
+
tgx
+
(
y/TileSize
)
*
paddedNumAtoms
;
...
...
@@ -239,6 +230,15 @@ __kernel void computeNonbonded(int numTiles, int paddedNumAtoms, float cutoffSqu
of
=
local_force[get_local_id
(
0
)
]
;
offset
=
y
+
tgx
+
(
x/TileSize
)
*
paddedNumAtoms
;
forceBuffers[offset]
=
of
;
#
else
unsigned
int
offset
=
x
+
tgx
+
warp*paddedNumAtoms
;
of
=
forceBuffers[offset]
;
of.xyz
+=
af.xyz
;
forceBuffers[offset]
=
of
;
offset
=
y
+
tgx
+
warp*paddedNumAtoms
;
of
=
forceBuffers[offset]
;
of.xyz
+=
local_force[get_local_id
(
0
)
].xyz
;
forceBuffers[offset]
=
of
;
#
endif
lasty
=
y
;
}
...
...
platforms/opencl/tests/TestOpenCLNonbondedForce.cpp
View file @
c3125421
...
...
@@ -43,7 +43,8 @@
#include "openmm/LangevinIntegrator.h"
#include "openmm/VerletIntegrator.h"
#include "openmm/internal/ContextImpl.h"
//#include "kernels/gputypes.h"
#include "OpenCLArray.h"
#include "OpenCLNonbondedUtilities.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "../src/sfmt/SFMT.h"
#include <iostream>
...
...
@@ -438,167 +439,176 @@ void testLargeSystem() {
ASSERT_EQUAL_TOL
(
clState
.
getPotentialEnergy
(),
referenceState
.
getPotentialEnergy
(),
tol
);
}
//void testBlockInteractions(bool periodic) {
// const int blockSize = 32;
// const int numBlocks = 100;
// const int numParticles = blockSize*numBlocks;
// const double cutoff = 1.0;
// const double boxSize = (periodic ? 5.1 : 1.1);
// OpenCLPlatform cl;
// System system;
// VerletIntegrator integrator(0.01);
// NonbondedForce* nonbonded = new NonbondedForce();
// vector<Vec3> positions(numParticles);
// init_gen_rand(0);
// for (int i = 0; i < numParticles; i++) {
// system.addParticle(1.0);
// nonbonded->addParticle(1.0, 0.2, 0.2);
// positions[i] = Vec3(boxSize*(3*genrand_real2()-1), boxSize*(3*genrand_real2()-1), boxSize*(3*genrand_real2()-1));
// }
// nonbonded->setNonbondedMethod(periodic ? NonbondedForce::CutoffPeriodic : NonbondedForce::CutoffNonPeriodic);
// nonbonded->setCutoffDistance(cutoff);
// system.setPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
// system.addForce(nonbonded);
// Context context(system, integrator, cl);
// context.setPositions(positions);
// State state = context.getState(State::Positions | State::Velocities | State::Forces);
// ContextImpl* contextImpl = *reinterpret_cast<ContextImpl**>(&context);
// OpenCLPlatform::PlatformData& data = *static_cast<OpenCLPlatform::PlatformData*>(contextImpl->getPlatformData());
//
// // Verify that the bounds of each block were calculated correctly.
//
// data.gpu->psPosq4->Download();
// data.gpu->psGridBoundingBox->Download();
// data.gpu->psGridCenter->Download();
// for (int i = 0; i < numBlocks; i++) {
// float4 gridSize = (*data.gpu->psGridBoundingBox)[i];
// float4 center = (*data.gpu->psGridCenter)[i];
// if (periodic) {
// ASSERT(gridSize.x < 0.5*boxSize);
// ASSERT(gridSize.y < 0.5*boxSize);
// ASSERT(gridSize.z < 0.5*boxSize);
// }
// float minx = 0.0, maxx = 0.0, miny = 0.0, maxy = 0.0, minz = 0.0, maxz = 0.0, radius = 0.0;
// for (int j = 0; j < blockSize; j++) {
// float4 pos = (*data.gpu->psPosq4)[i*blockSize+j];
// float dx = pos.x-center.x;
// float dy = pos.y-center.y;
// float dz = pos.z-center.z;
// if (periodic) {
// dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
// dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
// dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
// }
// ASSERT(abs(dx) < gridSize.x+TOL);
// ASSERT(abs(dy) < gridSize.y+TOL);
// ASSERT(abs(dz) < gridSize.z+TOL);
// minx = min(minx, dx);
// maxx = max(maxx, dx);
// miny = min(miny, dy);
// maxy = max(maxy, dy);
// minz = min(minz, dz);
// maxz = max(maxz, dz);
// }
// ASSERT_EQUAL_TOL(-minx, gridSize.x, TOL);
// ASSERT_EQUAL_TOL(maxx, gridSize.x, TOL);
// ASSERT_EQUAL_TOL(-miny, gridSize.y, TOL);
// ASSERT_EQUAL_TOL(maxy, gridSize.y, TOL);
// ASSERT_EQUAL_TOL(-minz, gridSize.z, TOL);
// ASSERT_EQUAL_TOL(maxz, gridSize.z, TOL);
// }
//
// // Verify that interactions were identified correctly.
//
// data.gpu->psInteractionCount->Download();
// int numWithInteractions = (*data.gpu->psInteractionCount)[0];
// vector<bool> hasInteractions(data.gpu->sim.workUnits, false);
// data.gpu->psInteractingWorkUnit->Download();
// data.gpu->psInteractionFlag->Download();
// const unsigned int atoms = data.gpu->sim.paddedNumberOfAtoms;
// const unsigned int grid = data.gpu->grid;
// const unsigned int dim = (atoms+(grid-1))/grid;
// for (int i = 0; i < numWithInteractions; i++) {
// unsigned int workUnit = (*data.gpu->psInteractingWorkUnit)[i];
// unsigned int x = (workUnit >> 17);
// unsigned int y = ((workUnit >> 2) & 0x7fff);
// int tile = (x > y ? x+y*dim-y*(y+1)/2 : y+x*dim-x*(x+1)/2);
// hasInteractions[tile] = true;
//
// // Make sure this tile really should have been flagged based on bounding volumes.
//
// float4 gridSize1 = (*data.gpu->psGridBoundingBox)[x];
// float4 gridSize2 = (*data.gpu->psGridBoundingBox)[y];
// float4 center1 = (*data.gpu->psGridCenter)[x];
// float4 center2 = (*data.gpu->psGridCenter)[y];
// float dx = center1.x-center2.x;
// float dy = center1.y-center2.y;
// float dz = center1.z-center2.z;
// if (periodic) {
// dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
// dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
// dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
// }
// dx = max(0.0f, abs(dx)-gridSize1.x-gridSize2.x);
// dy = max(0.0f, abs(dy)-gridSize1.y-gridSize2.y);
// dz = max(0.0f, abs(dz)-gridSize1.z-gridSize2.z);
// ASSERT(sqrt(dx*dx+dy*dy+dz*dz) < cutoff+TOL);
//
// // Check the interaction flags.
//
// unsigned int flags = (*data.gpu->psInteractionFlag)[i];
// for (int atom2 = 0; atom2 < 32; atom2++) {
// if ((flags & 1) == 0) {
// float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
// for (int atom1 = 0; atom1 < blockSize; ++atom1) {
// float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
// float dx = pos2.x-pos1.x;
// float dy = pos2.y-pos1.y;
// float dz = pos2.z-pos1.z;
// if (periodic) {
// dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
// dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
// dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
// }
// if (dx*dx+dy*dy+dz*dz < cutoff*cutoff) {
// dx = pos2.x-center1.x;
// dy = pos2.y-center1.y;
// dz = pos2.z-center1.z;
// dx = max(0.0f, abs(dx)-gridSize1.x);
// dy = max(0.0f, abs(dy)-gridSize1.y);
// dz = max(0.0f, abs(dz)-gridSize1.z);
// }
// ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
// }
// }
// flags >>= 1;
// }
// }
//
// // Check the tiles that did not have interactions to make sure all atoms are beyond the cutoff.
//
// data.gpu->psWorkUnit->Download();
// for (int i = 0; i < (int)hasInteractions.size(); i++)
// if (!hasInteractions[i]) {
// unsigned int workUnit = (*data.gpu->psWorkUnit)[i];
// unsigned int x = (workUnit >> 17);
// unsigned int y = ((workUnit >> 2) & 0x7fff);
// for (int atom1 = 0; atom1 < blockSize; ++atom1) {
// float4 pos1 = (*data.gpu->psPosq4)[x*blockSize+atom1];
// for (int atom2 = 0; atom2 < blockSize; ++atom2) {
// float4 pos2 = (*data.gpu->psPosq4)[y*blockSize+atom2];
// float dx = pos1.x-pos2.x;
// float dy = pos1.y-pos2.y;
// float dz = pos1.z-pos2.z;
// if (periodic) {
// dx -= (float)(floor(0.5+dx/boxSize)*boxSize);
// dy -= (float)(floor(0.5+dy/boxSize)*boxSize);
// dz -= (float)(floor(0.5+dz/boxSize)*boxSize);
// }
// ASSERT(dx*dx+dy*dy+dz*dz > cutoff*cutoff);
// }
// }
// }
//}
void
testBlockInteractions
(
bool
periodic
)
{
const
int
blockSize
=
32
;
const
int
numBlocks
=
100
;
const
int
numParticles
=
blockSize
*
numBlocks
;
const
double
cutoff
=
1.0
;
const
double
boxSize
=
(
periodic
?
5.1
:
1.1
);
OpenCLPlatform
cl
;
System
system
;
VerletIntegrator
integrator
(
0.01
);
NonbondedForce
*
nonbonded
=
new
NonbondedForce
();
vector
<
Vec3
>
positions
(
numParticles
);
init_gen_rand
(
0
);
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
{
system
.
addParticle
(
1.0
);
nonbonded
->
addParticle
(
1.0
,
0.2
,
0.2
);
positions
[
i
]
=
Vec3
(
boxSize
*
(
3
*
genrand_real2
()
-
1
),
boxSize
*
(
3
*
genrand_real2
()
-
1
),
boxSize
*
(
3
*
genrand_real2
()
-
1
));
}
nonbonded
->
setNonbondedMethod
(
periodic
?
NonbondedForce
::
CutoffPeriodic
:
NonbondedForce
::
CutoffNonPeriodic
);
nonbonded
->
setCutoffDistance
(
cutoff
);
system
.
setPeriodicBoxVectors
(
Vec3
(
boxSize
,
0
,
0
),
Vec3
(
0
,
boxSize
,
0
),
Vec3
(
0
,
0
,
boxSize
));
system
.
addForce
(
nonbonded
);
Context
context
(
system
,
integrator
,
cl
);
context
.
setPositions
(
positions
);
State
state
=
context
.
getState
(
State
::
Positions
|
State
::
Velocities
|
State
::
Forces
);
ContextImpl
*
contextImpl
=
*
reinterpret_cast
<
ContextImpl
**>
(
&
context
);
OpenCLPlatform
::
PlatformData
&
data
=
*
static_cast
<
OpenCLPlatform
::
PlatformData
*>
(
contextImpl
->
getPlatformData
());
OpenCLContext
&
clcontext
=
*
data
.
context
;
// Verify that the bounds of each block were calculated correctly.
clcontext
.
getPosq
().
download
();
vector
<
mm_float4
>
blockCenters
(
numBlocks
);
vector
<
mm_float4
>
blockBoundingBoxes
(
numBlocks
);
OpenCLNonbondedUtilities
&
nb
=
clcontext
.
getNonbondedUtilities
();
nb
.
getBlockCenters
().
download
(
blockCenters
);
nb
.
getBlockBoundingBoxes
().
download
(
blockBoundingBoxes
);
for
(
int
i
=
0
;
i
<
numBlocks
;
i
++
)
{
mm_float4
gridSize
=
blockBoundingBoxes
[
i
];
mm_float4
center
=
blockCenters
[
i
];
if
(
periodic
)
{
ASSERT
(
gridSize
.
x
<
0.5
*
boxSize
);
ASSERT
(
gridSize
.
y
<
0.5
*
boxSize
);
ASSERT
(
gridSize
.
z
<
0.5
*
boxSize
);
}
float
minx
=
0.0
,
maxx
=
0.0
,
miny
=
0.0
,
maxy
=
0.0
,
minz
=
0.0
,
maxz
=
0.0
,
radius
=
0.0
;
for
(
int
j
=
0
;
j
<
blockSize
;
j
++
)
{
mm_float4
pos
=
clcontext
.
getPosq
()[
i
*
blockSize
+
j
];
float
dx
=
pos
.
x
-
center
.
x
;
float
dy
=
pos
.
y
-
center
.
y
;
float
dz
=
pos
.
z
-
center
.
z
;
if
(
periodic
)
{
dx
-=
(
float
)(
floor
(
0.5
+
dx
/
boxSize
)
*
boxSize
);
dy
-=
(
float
)(
floor
(
0.5
+
dy
/
boxSize
)
*
boxSize
);
dz
-=
(
float
)(
floor
(
0.5
+
dz
/
boxSize
)
*
boxSize
);
}
ASSERT
(
abs
(
dx
)
<
gridSize
.
x
+
TOL
);
ASSERT
(
abs
(
dy
)
<
gridSize
.
y
+
TOL
);
ASSERT
(
abs
(
dz
)
<
gridSize
.
z
+
TOL
);
minx
=
min
(
minx
,
dx
);
maxx
=
max
(
maxx
,
dx
);
miny
=
min
(
miny
,
dy
);
maxy
=
max
(
maxy
,
dy
);
minz
=
min
(
minz
,
dz
);
maxz
=
max
(
maxz
,
dz
);
}
ASSERT_EQUAL_TOL
(
-
minx
,
gridSize
.
x
,
TOL
);
ASSERT_EQUAL_TOL
(
maxx
,
gridSize
.
x
,
TOL
);
ASSERT_EQUAL_TOL
(
-
miny
,
gridSize
.
y
,
TOL
);
ASSERT_EQUAL_TOL
(
maxy
,
gridSize
.
y
,
TOL
);
ASSERT_EQUAL_TOL
(
-
minz
,
gridSize
.
z
,
TOL
);
ASSERT_EQUAL_TOL
(
maxz
,
gridSize
.
z
,
TOL
);
}
// Verify that interactions were identified correctly.
vector
<
cl_uint
>
interactionCount
;
vector
<
cl_uint
>
interactingTiles
;
vector
<
cl_uint
>
interactionFlags
;
nb
.
getInteractionCount
().
download
(
interactionCount
);
int
numWithInteractions
=
interactionCount
[
0
];
vector
<
bool
>
hasInteractions
(
nb
.
getTiles
().
getSize
(),
false
);
nb
.
getInteractingTiles
().
download
(
interactingTiles
);
nb
.
getInteractionFlags
().
download
(
interactionFlags
);
const
unsigned
int
atoms
=
clcontext
.
getPaddedNumAtoms
();
const
unsigned
int
grid
=
OpenCLContext
::
TileSize
;
const
unsigned
int
dim
=
clcontext
.
getNumAtomBlocks
();
printf
(
"%d of %d
\n
"
,
numWithInteractions
,
hasInteractions
.
size
());
for
(
int
i
=
0
;
i
<
numWithInteractions
;
i
++
)
{
unsigned
int
tile
=
interactingTiles
[
i
];
unsigned
int
x
=
(
tile
>>
17
);
unsigned
int
y
=
((
tile
>>
2
)
&
0x7fff
);
int
index
=
(
x
>
y
?
x
+
y
*
dim
-
y
*
(
y
+
1
)
/
2
:
y
+
x
*
dim
-
x
*
(
x
+
1
)
/
2
);
hasInteractions
[
index
]
=
true
;
// Make sure this tile really should have been flagged based on bounding volumes.
mm_float4
gridSize1
=
blockBoundingBoxes
[
x
];
mm_float4
gridSize2
=
blockBoundingBoxes
[
y
];
mm_float4
center1
=
blockCenters
[
x
];
mm_float4
center2
=
blockCenters
[
y
];
float
dx
=
center1
.
x
-
center2
.
x
;
float
dy
=
center1
.
y
-
center2
.
y
;
float
dz
=
center1
.
z
-
center2
.
z
;
if
(
periodic
)
{
dx
-=
(
float
)(
floor
(
0.5
+
dx
/
boxSize
)
*
boxSize
);
dy
-=
(
float
)(
floor
(
0.5
+
dy
/
boxSize
)
*
boxSize
);
dz
-=
(
float
)(
floor
(
0.5
+
dz
/
boxSize
)
*
boxSize
);
}
dx
=
max
(
0.0
f
,
abs
(
dx
)
-
gridSize1
.
x
-
gridSize2
.
x
);
dy
=
max
(
0.0
f
,
abs
(
dy
)
-
gridSize1
.
y
-
gridSize2
.
y
);
dz
=
max
(
0.0
f
,
abs
(
dz
)
-
gridSize1
.
z
-
gridSize2
.
z
);
ASSERT
(
sqrt
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
)
<
cutoff
+
TOL
);
// Check the interaction flags.
unsigned
int
flags
=
interactionFlags
[
i
];
for
(
int
atom2
=
0
;
atom2
<
32
;
atom2
++
)
{
if
((
flags
&
1
)
==
0
)
{
mm_float4
pos2
=
clcontext
.
getPosq
()[
y
*
blockSize
+
atom2
];
for
(
int
atom1
=
0
;
atom1
<
blockSize
;
++
atom1
)
{
mm_float4
pos1
=
clcontext
.
getPosq
()[
x
*
blockSize
+
atom1
];
float
dx
=
pos2
.
x
-
pos1
.
x
;
float
dy
=
pos2
.
y
-
pos1
.
y
;
float
dz
=
pos2
.
z
-
pos1
.
z
;
if
(
periodic
)
{
dx
-=
(
float
)(
floor
(
0.5
+
dx
/
boxSize
)
*
boxSize
);
dy
-=
(
float
)(
floor
(
0.5
+
dy
/
boxSize
)
*
boxSize
);
dz
-=
(
float
)(
floor
(
0.5
+
dz
/
boxSize
)
*
boxSize
);
}
if
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
<
cutoff
*
cutoff
)
{
dx
=
pos2
.
x
-
center1
.
x
;
dy
=
pos2
.
y
-
center1
.
y
;
dz
=
pos2
.
z
-
center1
.
z
;
dx
=
max
(
0.0
f
,
abs
(
dx
)
-
gridSize1
.
x
);
dy
=
max
(
0.0
f
,
abs
(
dy
)
-
gridSize1
.
y
);
dz
=
max
(
0.0
f
,
abs
(
dz
)
-
gridSize1
.
z
);
}
ASSERT
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
>
cutoff
*
cutoff
);
}
}
flags
>>=
1
;
}
}
// Check the tiles that did not have interactions to make sure all atoms are beyond the cutoff.
vector
<
cl_uint
>
tiles
;
nb
.
getTiles
().
download
(
tiles
);
for
(
int
i
=
0
;
i
<
(
int
)
hasInteractions
.
size
();
i
++
)
if
(
!
hasInteractions
[
i
])
{
unsigned
int
tile
=
tiles
[
i
];
unsigned
int
x
=
(
tile
>>
17
);
unsigned
int
y
=
((
tile
>>
2
)
&
0x7fff
);
for
(
int
atom1
=
0
;
atom1
<
blockSize
;
++
atom1
)
{
mm_float4
pos1
=
clcontext
.
getPosq
()[
x
*
blockSize
+
atom1
];
for
(
int
atom2
=
0
;
atom2
<
blockSize
;
++
atom2
)
{
mm_float4
pos2
=
clcontext
.
getPosq
()[
y
*
blockSize
+
atom2
];
float
dx
=
pos1
.
x
-
pos2
.
x
;
float
dy
=
pos1
.
y
-
pos2
.
y
;
float
dz
=
pos1
.
z
-
pos2
.
z
;
if
(
periodic
)
{
dx
-=
(
float
)(
floor
(
0.5
+
dx
/
boxSize
)
*
boxSize
);
dy
-=
(
float
)(
floor
(
0.5
+
dy
/
boxSize
)
*
boxSize
);
dz
-=
(
float
)(
floor
(
0.5
+
dz
/
boxSize
)
*
boxSize
);
}
ASSERT
(
dx
*
dx
+
dy
*
dy
+
dz
*
dz
>
cutoff
*
cutoff
);
}
}
}
}
int
main
()
{
try
{
...
...
@@ -609,7 +619,7 @@ int main() {
// testCutoff14();
// testPeriodic();
// testLargeSystem();
//
testBlockInteractions(false);
testBlockInteractions
(
false
);
// testBlockInteractions(true);
}
catch
(
const
exception
&
e
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment