Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
93c467b2
Commit
93c467b2
authored
Mar 22, 2013
by
Peter Eastman
Browse files
Merged 5.1Optimizations branch back to trunk
parent
f6d4557d
Changes
86
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3182 additions
and
2450 deletions
+3182
-2450
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+161
-0
platforms/opencl/src/OpenCLSort.h
platforms/opencl/src/OpenCLSort.h
+64
-136
platforms/opencl/src/kernels/andersenThermostat.cl
platforms/opencl/src/kernels/andersenThermostat.cl
+3
-3
platforms/opencl/src/kernels/brownian.cl
platforms/opencl/src/kernels/brownian.cl
+3
-3
platforms/opencl/src/kernels/ccma.cl
platforms/opencl/src/kernels/ccma.cl
+6
-1
platforms/opencl/src/kernels/coulombLennardJones.cl
platforms/opencl/src/kernels/coulombLennardJones.cl
+3
-3
platforms/opencl/src/kernels/customGBEnergyN2.cl
platforms/opencl/src/kernels/customGBEnergyN2.cl
+376
-0
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+300
-157
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+0
-258
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+0
-282
platforms/opencl/src/kernels/customGBValueN2.cl
platforms/opencl/src/kernels/customGBValueN2.cl
+324
-0
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+240
-141
platforms/opencl/src/kernels/customGBValueN2_default.cl
platforms/opencl/src/kernels/customGBValueN2_default.cl
+0
-237
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+0
-304
platforms/opencl/src/kernels/fft.cl
platforms/opencl/src/kernels/fft.cl
+2
-2
platforms/opencl/src/kernels/findInteractingBlocks.cl
platforms/opencl/src/kernels/findInteractingBlocks.cl
+224
-205
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+171
-106
platforms/opencl/src/kernels/gbsaObc.cl
platforms/opencl/src/kernels/gbsaObc.cl
+748
-0
platforms/opencl/src/kernels/gbsaObc_cpu.cl
platforms/opencl/src/kernels/gbsaObc_cpu.cl
+557
-126
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+0
-486
No files found.
platforms/opencl/src/OpenCLSort.cpp
0 → 100644
View file @
93c467b2
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "OpenCLSort.h"
#include "OpenCLKernelSources.h"
#include <map>
using
namespace
OpenMM
;
using
namespace
std
;
/**
 * Create an OpenCLSort object for sorting arrays of a fixed length.
 *
 * The constructor compiles the sort program (OpenCLKernelSources::sort) with the
 * strings supplied by the trait substituted in, picks work group sizes based on the
 * device's limits, and, when the list is too long for the single-kernel "short list"
 * path, allocates the workspace arrays used by the bucket based path.
 *
 * @param context    the context in which to perform calculations
 * @param trait      a SortTrait defining the type of data to sort and its key.  It is
 *                   deleted by ~OpenCLSort().
 * @param length     the length of the arrays this object will be used to sort
 *
 * NOTE(review): if anything in this constructor throws, the destructor does not run,
 * so the trait and any workspace arrays allocated so far are not released here.
 */
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
    // Create kernels.

    std::map<std::string, std::string> replacements;
    replacements["DATA_TYPE"] = trait->getDataType();
    replacements["KEY_TYPE"] = trait->getKeyType();
    replacements["SORT_KEY"] = trait->getSortKey();
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");
    assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
    computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
    copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
    sortBucketsKernel = cl::Kernel(program, "sortBuckets");

    // Work out the work group sizes for various kernels.

    unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
    int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();

    // Number of data elements that fit in half of the device's local memory.
    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    isShortList = (length <= maxLocalBuffer);

    // Largest power of two that does not exceed maxGroupSize.
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = rangeKernelSize;
    sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2);
    if (rangeKernelSize > length)
        rangeKernelSize = length;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;

    // On devices with a very small maximum work group size or very little local memory,
    // sortKernelSize can come out as 0 or 1.  Never let it reach 0: sort() uses it both
    // as a work group size and as a divisor.
    // NOTE(review): assumes the sort kernels tolerate a work group size of 1 - confirm.
    if (sortKernelSize < 1)
        sortKernelSize = 1;

    // Likewise keep the target bucket size at 1 or more, so the division below is well
    // defined (previously sortKernelSize < 2 caused an integer division by zero, even
    // for short lists that never use buckets).
    unsigned int targetBucketSize = sortKernelSize/2;
    if (targetBucketSize < 1)
        targetBucketSize = 1;
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.  They are only needed by the bucket based path.

    if (!isShortList) {
        dataRange = new OpenCLArray(context, 2, trait->getKeySize(), "sortDataRange");
        bucketOffset = OpenCLArray::create<cl_uint>(context, numBuckets, "bucketOffset");
        bucketOfElement = OpenCLArray::create<cl_uint>(context, length, "bucketOfElement");
        offsetInBucket = OpenCLArray::create<cl_uint>(context, length, "offsetInBucket");
        buckets = new OpenCLArray(context, length, trait->getDataSize(), "buckets");
    }
}
/**
 * Release the trait object and the workspace arrays owned by this sorter.
 */
OpenCLSort::~OpenCLSort() {
    delete trait;

    // The workspace pointers stay NULL when the short list path is in use.
    // Applying delete to a null pointer does nothing, so no explicit checks are needed.
    delete dataRange;
    delete bucketOfElement;
    delete offsetInBucket;
    delete bucketOffset;
    delete buckets;
}
/**
 * Sort an array.
 *
 * The array must have the length passed to the constructor and an element size equal
 * to the trait's data size; otherwise an OpenMMException is thrown.  An empty array
 * is a no-op.  Short lists are handled by a single kernel working in local memory;
 * longer lists go through the multi-kernel bucket based path.
 *
 * @param data    the array to sort
 */
void OpenCLSort::sort(OpenCLArray& data) {
    const bool matchesExpectedLayout = (data.getSize() == dataLength && data.getElementSize() == trait->getDataSize());
    if (!matchesExpectedLayout)
        throw OpenMMException("OpenCLSort called with different data size");
    if (data.getSize() == 0)
        return;

    if (isShortList) {
        // The whole list fits in local memory, so one kernel launch with a single
        // work group does the entire sort.

        shortListKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
        shortListKernel.setArg<cl_uint>(1, dataLength);
        shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
        context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
        return;
    }

    // Step 1: find the range of key values present in the data.

    computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
    computeRangeKernel.setArg<cl_uint>(1, data.getSize());
    computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
    computeRangeKernel.setArg(3, rangeKernelSize*trait->getKeySize(), NULL);
    context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);

    // Step 2: decide which bucket every element belongs to.  The bucket counters
    // are zeroed first.

    unsigned int bucketCount = bucketOffset->getSize();
    context.clearBuffer(*bucketOffset);
    assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
    assignElementsKernel.setArg<cl_int>(1, data.getSize());
    assignElementsKernel.setArg<cl_int>(2, bucketCount);
    assignElementsKernel.setArg<cl::Buffer>(3, dataRange->getDeviceBuffer());
    assignElementsKernel.setArg<cl::Buffer>(4, bucketOffset->getDeviceBuffer());
    assignElementsKernel.setArg<cl::Buffer>(5, bucketOfElement->getDeviceBuffer());
    assignElementsKernel.setArg<cl::Buffer>(6, offsetInBucket->getDeviceBuffer());
    context.executeKernel(assignElementsKernel, data.getSize());

    // Step 3: work out where each bucket is located.

    computeBucketPositionsKernel.setArg<cl_int>(0, bucketCount);
    computeBucketPositionsKernel.setArg<cl::Buffer>(1, bucketOffset->getDeviceBuffer());
    computeBucketPositionsKernel.setArg(2, positionsKernelSize*sizeof(cl_int), NULL);
    context.executeKernel(computeBucketPositionsKernel, positionsKernelSize, positionsKernelSize);

    // Step 4: move every element into its bucket.

    copyToBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
    copyToBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
    copyToBucketsKernel.setArg<cl_int>(2, data.getSize());
    copyToBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
    copyToBucketsKernel.setArg<cl::Buffer>(4, bucketOfElement->getDeviceBuffer());
    copyToBucketsKernel.setArg<cl::Buffer>(5, offsetInBucket->getDeviceBuffer());
    context.executeKernel(copyToBucketsKernel, data.getSize());

    // Step 5: sort the contents of each bucket.  The global size is the element
    // count rounded up to a whole number of work groups.

    sortBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
    sortBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
    sortBucketsKernel.setArg<cl_int>(2, bucketCount);
    sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
    sortBucketsKernel.setArg(4, sortKernelSize*trait->getDataSize(), NULL);
    context.executeKernel(sortBucketsKernel, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize);
}
platforms/opencl/src/OpenCLSort.h
View file @
93c467b2
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2010 Stanford University and the Authors.
*
* Portions copyright (c) 2010
-2013
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -28,9 +28,7 @@
...
@@ -28,9 +28,7 @@
* -------------------------------------------------------------------------- */
* -------------------------------------------------------------------------- */
#include "OpenCLArray.h"
#include "OpenCLArray.h"
#include "OpenCLKernelSources.h"
#include "windowsExportOpenCL.h"
#include "windowsExportOpenCL.h"
#include <map>
namespace
OpenMM
{
namespace
OpenMM
{
...
@@ -38,26 +36,19 @@ namespace OpenMM {
...
@@ -38,26 +36,19 @@ namespace OpenMM {
* This class sorts arrays of values. It supports any type of values, not just scalars,
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
* so long as an appropriate sorting key can be defined by which to sort them.
*
*
* The
class is templatiz
ed by a "trait" class that defines the type of data to
* The
sorting behavior is specifi
ed by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
* sorting floats:
*
*
* struct FloatTrait {
* class FloatTrait : public OpenCLSort::SortTrait {
* // The name of the data and key types being sorted.
* int getDataSize() const {return 4;}
* // Both the host type and OpenCL type is required.
* int getKeySize() const {return 4;}
* // For primitive types they will be the same.
* const char* getDataType() const {return "float";}
* typedef cl_float DataType;
* const char* getKeyType() const {return "float";}
* typedef cl_float KeyType;
* const char* getMinKey() const {return "-MAXFLOAT";}
* static const char* clDataType() {return "float";}
* const char* getMaxKey() const {return "MAXFLOAT";}
* static const char* clKeyType() {return "float";}
* const char* getMaxValue() const {return "MAXFLOAT";}
* // The minimum value a key can take.
* const char* getSortKey() const {return "value";}
* static const char* clMinKey() {return "-MAXFLOAT";}
* // The maximum value a key can take.
* static const char* clMaxKey() {return "MAXFLOAT";}
* // A value whose key is guaranteed to equal clMaxKey().
* static const char* clMaxValue() {return "MAXFLOAT";}
* // The OpenCL code to select the key from the data value.
* static const char* clSortKey() {return "value";}
* };
* };
*
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
...
@@ -74,139 +65,76 @@ namespace OpenMM {
...
@@ -74,139 +65,76 @@ namespace OpenMM {
* elements).
* elements).
*/
*/
template
<
class
TRAIT
>
class
OPENMM_EXPORT_OPENCL
OpenCLSort
{
class
OpenCLSort
{
public:
public:
class
SortTrait
;
/**
/**
* Create an OpenCLSort object for sorting data of a particular type.
* Create an OpenCLSort object for sorting data of a particular type.
*
*
* @param context the context in which to perform calculations
* @param context the context in which to perform calculations
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the OpenCLSort is deleted.
* @param length the length of the arrays this object will be used to sort
* @param length the length of the arrays this object will be used to sort
*/
*/
OpenCLSort
(
OpenCLContext
&
context
,
unsigned
int
length
)
:
context
(
context
),
OpenCLSort
(
OpenCLContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
);
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
~
OpenCLSort
();
// Create kernels.
std
::
map
<
std
::
string
,
std
::
string
>
replacements
;
replacements
[
"DATA_TYPE"
]
=
TRAIT
::
clDataType
();
replacements
[
"KEY_TYPE"
]
=
TRAIT
::
clKeyType
();
replacements
[
"SORT_KEY"
]
=
TRAIT
::
clSortKey
();
replacements
[
"MIN_KEY"
]
=
TRAIT
::
clMinKey
();
replacements
[
"MAX_KEY"
]
=
TRAIT
::
clMaxKey
();
replacements
[
"MAX_VALUE"
]
=
TRAIT
::
clMaxValue
();
replacements
[
"VALUE_IS_INT2"
]
=
(
TRAIT
::
clDataType
()
==
std
::
string
(
"int2"
)
?
"1"
:
"0"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
assignElementsKernel
=
cl
::
Kernel
(
program
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
cl
::
Kernel
(
program
,
"computeBucketPositions"
);
copyToBucketsKernel
=
cl
::
Kernel
(
program
,
"copyDataToBuckets"
);
sortBucketsKernel
=
cl
::
Kernel
(
program
,
"sortBuckets"
);
// Work out the work group sizes for various kernels.
unsigned
int
maxGroupSize
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxGroupSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
rangeKernelSize
;
sortKernelSize
=
rangeKernelSize
/
2
;
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
()
/
sizeof
(
typename
TRAIT
::
DataType
))
/
2
);
if
(
sortKernelSize
>
maxLocalBuffer
)
sortKernelSize
=
maxLocalBuffer
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
unsigned
int
numBuckets
=
length
/
targetBucketSize
;
if
(
numBuckets
<
1
)
numBuckets
=
1
;
if
(
positionsKernelSize
>
numBuckets
)
positionsKernelSize
=
numBuckets
;
// Create workspace arrays.
dataRange
=
OpenCLArray
::
create
<
typename
TRAIT
::
KeyType
>
(
context
,
2
,
"sortDataRange"
);
bucketOffset
=
OpenCLArray
::
create
<
cl_uint
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
OpenCLArray
::
create
<
cl_uint
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
=
OpenCLArray
::
create
<
cl_uint
>
(
context
,
length
,
"offsetInBucket"
);
buckets
=
OpenCLArray
::
create
<
typename
TRAIT
::
DataType
>
(
context
,
length
,
"buckets"
);
}
~
OpenCLSort
()
{
if
(
dataRange
!=
NULL
)
delete
dataRange
;
if
(
bucketOfElement
!=
NULL
)
delete
bucketOfElement
;
if
(
offsetInBucket
!=
NULL
)
delete
offsetInBucket
;
if
(
bucketOffset
!=
NULL
)
delete
bucketOffset
;
if
(
buckets
!=
NULL
)
delete
buckets
;
}
/**
/**
* Sort an array.
* Sort an array.
*/
*/
void
sort
(
OpenCLArray
&
data
)
{
void
sort
(
OpenCLArray
&
data
);
if
(
data
.
getSize
()
!=
bucketOfElement
->
getSize
())
throw
OpenMMException
(
"OpenCLSort called with different data size"
);
if
(
data
.
getSize
()
==
0
)
return
;
// Compute the range of data values.
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
computeRangeKernel
.
setArg
<
cl_uint
>
(
1
,
data
.
getSize
());
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
dataRange
->
getDeviceBuffer
());
computeRangeKernel
.
setArg
(
3
,
rangeKernelSize
*
sizeof
(
typename
TRAIT
::
KeyType
),
NULL
);
context
.
executeKernel
(
computeRangeKernel
,
rangeKernelSize
,
rangeKernelSize
);
// Assign array elements to buckets.
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
*
bucketOffset
);
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl_int
>
(
1
,
data
.
getSize
());
assignElementsKernel
.
setArg
<
cl_int
>
(
2
,
numBuckets
);
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
dataRange
->
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
bucketOffset
->
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
bucketOfElement
->
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
offsetInBucket
->
getDeviceBuffer
());
context
.
executeKernel
(
assignElementsKernel
,
data
.
getSize
());
// Compute the position of each bucket.
computeBucketPositionsKernel
.
setArg
<
cl_int
>
(
0
,
numBuckets
);
computeBucketPositionsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
bucketOffset
->
getDeviceBuffer
());
computeBucketPositionsKernel
.
setArg
(
2
,
positionsKernelSize
*
sizeof
(
cl_int
),
NULL
);
context
.
executeKernel
(
computeBucketPositionsKernel
,
positionsKernelSize
,
positionsKernelSize
);
// Copy the data into the buckets.
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
->
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl_int
>
(
2
,
data
.
getSize
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
bucketOffset
->
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
bucketOfElement
->
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
offsetInBucket
->
getDeviceBuffer
());
context
.
executeKernel
(
copyToBucketsKernel
,
data
.
getSize
());
// Sort each bucket.
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl_int
>
(
2
,
numBuckets
);
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
bucketOffset
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
(
4
,
sortKernelSize
*
sizeof
(
typename
TRAIT
::
DataType
),
NULL
);
context
.
executeKernel
(
sortBucketsKernel
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
);
}
private:
private:
OpenCLContext
&
context
;
OpenCLContext
&
context
;
SortTrait
*
trait
;
OpenCLArray
*
dataRange
;
OpenCLArray
*
dataRange
;
OpenCLArray
*
bucketOfElement
;
OpenCLArray
*
bucketOfElement
;
OpenCLArray
*
offsetInBucket
;
OpenCLArray
*
offsetInBucket
;
OpenCLArray
*
bucketOffset
;
OpenCLArray
*
bucketOffset
;
OpenCLArray
*
buckets
;
OpenCLArray
*
buckets
;
cl
::
Kernel
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
cl
::
Kernel
shortListKernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
bool
isShortList
;
};
/**
* A subclass of SortTrait defines the type of values to sort and the key by which to sort them.
* OpenCLSort queries these methods when it builds its kernels and sizes its buffers.
*/
class
OpenCLSort
::
SortTrait
{
public:
virtual
~
SortTrait
()
{
}
/**
* Get the size of each data value in bytes.
*/
virtual
int
getDataSize
()
const
=
0
;
/**
* Get the size of each key value in bytes.
*/
virtual
int
getKeySize
()
const
=
0
;
/**
* Get the name of the data type of the values to sort, as it is written in the kernel source.
*/
virtual
const
char
*
getDataType
()
const
=
0
;
/**
* Get the name of the data type of the sorting key, as it is written in the kernel source.
*/
virtual
const
char
*
getKeyType
()
const
=
0
;
/**
* Get the minimum value a key can take.
*/
virtual
const
char
*
getMinKey
()
const
=
0
;
/**
* Get the maximum value a key can take.
*/
virtual
const
char
*
getMaxKey
()
const
=
0
;
/**
* Get a value whose key is guaranteed to equal getMaxKey().
*/
virtual
const
char
*
getMaxValue
()
const
=
0
;
/**
* Get the OpenCL code to select the key from the data value.
*/
virtual
const
char
*
getSortKey
()
const
=
0
;
};
};
}
// namespace OpenMM
}
// namespace OpenMM
...
...
platforms/opencl/src/kernels/andersenThermostat.cl
View file @
93c467b2
...
@@ -4,14 +4,14 @@
...
@@ -4,14 +4,14 @@
__kernel
void
applyAndersenThermostat
(
float
collisionFrequency,
float
kT,
__global
mixed4*
velm,
__global
const
mixed2*
restrict
stepSize,
__global
const
float4*
restrict
random,
__kernel
void
applyAndersenThermostat
(
float
collisionFrequency,
float
kT,
__global
mixed4*
velm,
__global
const
mixed2*
restrict
stepSize,
__global
const
float4*
restrict
random,
unsigned
int
randomIndex,
__global
const
int*
restrict
atomGroups
)
{
unsigned
int
randomIndex,
__global
const
int*
restrict
atomGroups
)
{
float
collisionProbability
=
1.0f-
exp
(
-collisionFrequency*stepSize[0].y
)
;
float
collisionProbability
=
1.0f-
EXP
(
-collisionFrequency*stepSize[0].y
)
;
float
randomRange
=
erf
(
collisionProbability/
sqrt
(
2.0f
))
;
float
randomRange
=
erf
(
collisionProbability/
SQRT
(
2.0f
))
;
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
mixed4
velocity
=
velm[index]
;
mixed4
velocity
=
velm[index]
;
float4
selectRand
=
random[randomIndex+atomGroups[index]]
;
float4
selectRand
=
random[randomIndex+atomGroups[index]]
;
float4
velRand
=
random[randomIndex+index]
;
float4
velRand
=
random[randomIndex+index]
;
real
scale
=
(
selectRand.w
>
-randomRange
&&
selectRand.w
<
randomRange
?
0
:
1
)
;
real
scale
=
(
selectRand.w
>
-randomRange
&&
selectRand.w
<
randomRange
?
0
:
1
)
;
real
add
=
(
1-scale
)
*
sqrt
(
kT*velocity.w
)
;
real
add
=
(
1-scale
)
*
SQRT
(
kT*velocity.w
)
;
velocity.x
=
scale*velocity.x
+
add*velRand.x
;
velocity.x
=
scale*velocity.x
+
add*velRand.x
;
velocity.y
=
scale*velocity.y
+
add*velRand.y
;
velocity.y
=
scale*velocity.y
+
add*velRand.y
;
velocity.z
=
scale*velocity.z
+
add*velRand.z
;
velocity.z
=
scale*velocity.z
+
add*velRand.z
;
...
...
platforms/opencl/src/kernels/brownian.cl
View file @
93c467b2
...
@@ -8,9 +8,9 @@ __kernel void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, __gl
...
@@ -8,9 +8,9 @@ __kernel void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, __gl
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
mixed
invMass
=
velm[index].w
;
mixed
invMass
=
velm[index].w
;
if
(
invMass
!=
0
)
{
if
(
invMass
!=
0
)
{
posDelta[index]
=
(
mixed4
)
(
tauDeltaT*invMass*force[index].x
+
noiseAmplitude*
sqrt
(
invMass
)
*random[randomIndex].x,
posDelta[index]
=
(
mixed4
)
(
tauDeltaT*invMass*force[index].x
+
noiseAmplitude*
SQRT
(
invMass
)
*random[randomIndex].x,
tauDeltaT*invMass*force[index].y
+
noiseAmplitude*
sqrt
(
invMass
)
*random[randomIndex].y,
tauDeltaT*invMass*force[index].y
+
noiseAmplitude*
SQRT
(
invMass
)
*random[randomIndex].y,
tauDeltaT*invMass*force[index].z
+
noiseAmplitude*
sqrt
(
invMass
)
*random[randomIndex].z,
0
)
;
tauDeltaT*invMass*force[index].z
+
noiseAmplitude*
SQRT
(
invMass
)
*random[randomIndex].z,
0
)
;
}
}
randomIndex
+=
get_global_size
(
0
)
;
randomIndex
+=
get_global_size
(
0
)
;
}
}
...
...
platforms/opencl/src/kernels/ccma.cl
View file @
93c467b2
...
@@ -10,7 +10,8 @@ mixed4 loadPos(__global const real4* restrict posq, __global const real4* restri
...
@@ -10,7 +10,8 @@ mixed4 loadPos(__global const real4* restrict posq, __global const real4* restri
/**
/**
*
Compute
the
direction
each
constraint
is
pointing
in.
This
is
called
once
at
the
beginning
of
constraint
evaluation.
*
Compute
the
direction
each
constraint
is
pointing
in.
This
is
called
once
at
the
beginning
of
constraint
evaluation.
*/
*/
__kernel
void
computeConstraintDirections
(
__global
const
int2*
restrict
constraintAtoms,
__global
mixed4*
restrict
constraintDistance,
__global
const
real4*
restrict
atomPositions,
__global
const
real4*
restrict
posCorrection
)
{
__kernel
void
computeConstraintDirections
(
__global
const
int2*
restrict
constraintAtoms,
__global
mixed4*
restrict
constraintDistance,
__global
const
real4*
restrict
atomPositions,
__global
const
real4*
restrict
posCorrection,
__global
int*
restrict
converged
)
{
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_CONSTRAINTS; index += get_global_size(0)) {
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_CONSTRAINTS; index += get_global_size(0)) {
//
Compute
the
direction
for
this
constraint.
//
Compute
the
direction
for
this
constraint.
...
@@ -23,6 +24,10 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
...
@@ -23,6 +24,10 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
dir.z
=
oldPos1.z-oldPos2.z
;
dir.z
=
oldPos1.z-oldPos2.z
;
constraintDistance[index]
=
dir
;
constraintDistance[index]
=
dir
;
}
}
if
(
get_global_id
(
0
)
==
0
)
{
converged[0]
=
1
;
converged[1]
=
0
;
}
}
}
/**
/**
...
...
platforms/opencl/src/kernels/coulombLennardJones.cl
View file @
93c467b2
#
if
USE_EWALD
#
if
USE_EWALD
bool
needCorrection
=
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
bool
needCorrection
=
hasExclusions
&&
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
if
(
!isExcluded
|
| needCorrection) {
if
(
!isExcluded
|
| needCorrection) {
real tempForce = 0;
if (r2 < CUTOFF_SQUARED |
|
needCorrection
)
{
if (r2 < CUTOFF_SQUARED |
|
needCorrection
)
{
const
real
alphaR
=
EWALD_ALPHA*r
;
const
real
alphaR
=
EWALD_ALPHA*r
;
const
real
expAlphaRSqr
=
EXP
(
-alphaR*alphaR
)
;
const
real
expAlphaRSqr
=
EXP
(
-alphaR*alphaR
)
;
...
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
...
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
t
*=
t
;
t
*=
t
;
t
*=
t
;
t
*=
t
;
const
real
erfcAlphaR
=
RECIP
(
t*t
)
;
const
real
erfcAlphaR
=
RECIP
(
t*t
)
;
real
tempForce
=
0
;
if
(
needCorrection
)
{
if
(
needCorrection
)
{
//
Subtract
off
the
part
of
this
interaction
that
was
included
in
the
reciprocal
space
contribution.
//
Subtract
off
the
part
of
this
interaction
that
was
included
in
the
reciprocal
space
contribution.
...
@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
...
@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
tempEnergy
+=
prefactor*erfcAlphaR
;
tempEnergy
+=
prefactor*erfcAlphaR
;
#
endif
#
endif
}
}
dEdR
+=
tempForce*invR*invR
;
}
}
dEdR
+=
tempForce*invR*invR
;
}
}
#
else
#
else
{
{
...
...
platforms/opencl/src/kernels/customGBEnergyN2.cl
0 → 100644
View file @
93c467b2
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
define
STORE_DERIVATIVE_1
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
deriv##INDEX##_1*0x100000000
))
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
local_deriv##INDEX[get_local_id
(
0
)
]*0x100000000
))
;
#
else
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset]
+=
deriv##INDEX##_1
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset]
+=
local_deriv##INDEX[get_local_id
(
0
)
]
;
#
endif
/**
 * Compute a force based on pair interactions.
 *
 * Work items are grouped into runs of TILE_SIZE consecutive ids (called "warps" in this
 * code).  Each warp processes whole tiles: thread tgx of the warp owns atom x*TILE_SIZE+tgx
 * ("atom1") and the warp shares the TILE_SIZE atoms of block y through local memory.
 *
 * The kernel makes two passes:
 *   1. tiles listed in exclusionTiles (range FIRST_EXCLUSION_TILE..LAST_EXCLUSION_TILE,
 *      split evenly between warps), applying the per-pair exclusion bits;
 *   2. all remaining tiles, taken either from the neighbor list (USE_CUTOFF and
 *      numTiles <= maxTiles) or by enumerating every tile of the triangular block matrix
 *      and skipping those already handled in pass 1.
 *
 * The interaction itself, per-atom parameter loading and derivative bookkeeping are provided
 * by macros that are not defined in this part of the file (COMPUTE_INTERACTION,
 * LOAD_ATOM1_PARAMETERS, LOAD_ATOM2_PARAMETERS, LOAD_LOCAL_PARAMETERS_FROM_1,
 * LOAD_LOCAL_PARAMETERS_FROM_GLOBAL, DECLARE_ATOM1_DERIVATIVES, CLEAR_LOCAL_DERIVATIVES,
 * RECORD_DERIVATIVE_2, STORE_DERIVATIVES_1, STORE_DERIVATIVES_2, SYNC_WARPS, PARAMETER_ARGUMENTS).
 * NOTE(review): those macros presumably refer to the local names used below (atom1, atom2, r,
 * r2, invR, delta, dEdR, tempEnergy, isExcluded, force, offset, localAtomIndex, j, ...), so
 * the locals must not be renamed -- confirm against the code that generates the macros.
 *
 * @param forceBuffers      force accumulator.  With SUPPORTS_64_BIT_ATOMICS it is a long buffer
 *                          updated with atom_add: x, y and z live at index, index+PADDED_NUM_ATOMS
 *                          and index+2*PADDED_NUM_ATOMS, and values are scaled by 0x100000000.
 *                          Otherwise it is a real4 buffer and each warp writes to its own slice
 *                          starting at warp*PADDED_NUM_ATOMS.
 * @param energyBuffer      one energy accumulator per work item (indexed by get_global_id(0))
 * @param local_force       local scratch holding the force accumulated on the atoms of block y,
 *                          one entry per work item of the work group
 * @param posq              global per-atom real4 array; only the xyz components are read here
 * @param local_posq        local scratch holding the posq entries of the block-y atoms
 * @param exclusions        one bit mask per (exclusion tile, thread) at pos*TILE_SIZE+tgx;
 *                          a cleared bit marks the pair as excluded
 * @param exclusionTiles    (x, y) block coordinates of the tiles that have exclusions
 * @param tiles             (USE_CUTOFF) neighbor list entries: .x is the block of atom1, .y is
 *                          used as the "single periodic copy" flag
 * @param interactionCount  (USE_CUTOFF) element 0 is the number of neighbor list tiles
 * @param periodicBoxSize   (USE_CUTOFF) periodic box dimensions
 * @param invPeriodicBoxSize (USE_CUTOFF) reciprocal of the periodic box dimensions
 * @param maxTiles          (USE_CUTOFF) if interactionCount[0] exceeds this value the neighbor
 *                          list is ignored and every tile is enumerated instead
 * @param blockCenter       (USE_CUTOFF) per-block center, used to shift atoms into one periodic copy
 * @param interactingAtoms  (USE_CUTOFF) for each neighbor list tile, the TILE_SIZE atom indices
 *                          to interact with; entries >= PADDED_NUM_ATOMS are treated as empty
 * @param numTiles          (no cutoff) total number of tiles to enumerate
 */
__kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers,
#else
        __global real4* restrict forceBuffers,
#endif
        __global real* restrict energyBuffer, __local real4* restrict local_force,
        __global const real4* restrict posq, __local real4* restrict local_posq,
        __global const unsigned int* restrict exclusions, __global const ushort2* exclusionTiles,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles,
        __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    const unsigned int warp = get_global_id(0)/TILE_SIZE;
    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index of this thread within its warp
    const unsigned int tbx = get_local_id(0) - tgx;           // local id of the first thread of this warp
    real energy = 0;

    // First loop: process tiles that contain exclusions.

    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real4 force = 0;
        DECLARE_ATOM1_DERIVATIVES
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
        if (x == y) {
            // This tile is on the diagonal.

            const unsigned int localAtomIndex = get_local_id(0);
            local_posq[localAtomIndex] = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+j;
                    real dEdR = 0;
                    real tempEnergy = 0;
#ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
#endif
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }

                    // Every pair of a diagonal tile is visited once from each of its two atoms
                    // (only atom1 receives force here), so each visit contributes half the energy.

                    energy += 0.5f*tempEnergy;
                    delta.xyz *= dEdR;
                    force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
                }
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
                SYNC_WARPS;
            }
        }
        else {
            // This is an off-diagonal tile.

            const unsigned int localAtomIndex = get_local_id(0);
            unsigned int j = y*TILE_SIZE + tgx;
            local_posq[localAtomIndex] = posq[j];
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            local_force[localAtomIndex] = 0;
            CLEAR_LOCAL_DERIVATIVES
            SYNC_WARPS;
#ifdef USE_EXCLUSIONS
            // Rotate the mask so that bit 0 corresponds to the first atom this thread visits (tj == tgx).
            // NOTE(review): for tgx == 0 this shifts left by TILE_SIZE; OpenCL C reduces the shift
            // count modulo the operand width, so this is harmless if TILE_SIZE is 32 -- confirm.
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
            // Each thread starts at a different atom of block y and walks the tile cyclically, so
            // that within one iteration no two threads of the warp update the same local_force entry.
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+tj;
                    real dEdR = 0;
                    real tempEnergy = 0;
#ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
#endif
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }
                    energy += tempEnergy;
                    delta.xyz *= dEdR;
                    force.xyz -= delta.xyz;
                    atom2 = tbx+tj;
                    local_force[atom2].xyz += delta.xyz;
                    RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                }
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
                tj = (tj + 1) & (TILE_SIZE - 1);
                SYNC_WARPS;
            }
        }

        // Write results.

#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
        atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
        atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
        atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
        STORE_DERIVATIVES_1
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
            atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0x100000000));
            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
            STORE_DERIVATIVES_2
        }
#else
        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        unsigned int offset = offset1;
        forceBuffers[offset1].xyz += force.xyz;
        STORE_DERIVATIVES_1
        if (x != y) {
            offset = offset2;
            forceBuffers[offset2] += (real4) (local_force[get_local_id(0)].x, local_force[get_local_id(0)].y, local_force[get_local_id(0)].z, 0.0f);
            STORE_DERIVATIVES_2
        }
#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
    int pos = warp*numTiles/totalWarps;
    int end = (warp+1)*numTiles/totalWarps;
#endif
    int skipBase = 0;             // index into exclusionTiles of the next batch to load into skipTiles
    int currentSkipIndex = tbx;   // cursor into this warp's section of skipTiles
    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
    __local int skipTiles[FORCE_WORK_GROUP_SIZE];
    skipTiles[get_local_id(0)] = -1;

    while (pos < end) {
        const bool isExcluded = false;
        real4 force = 0;
        DECLARE_ATOM1_DERIVATIVES
        bool includeTile = true;

        // Extract the coordinates of this tile.

        unsigned int x, y;
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            // Neighbor list entry: y is not needed because the partner atoms come from interactingAtoms.
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            singlePeriodicCopy = tileIndices.y;
        }
        else
#endif
        {
            // Invert the linear index pos = x + y*NUM_BLOCKS - y*(y+1)/2 of the triangular tile matrix.
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }

            // Skip over tiles that have exclusions, since they were already processed.

            SYNC_WARPS;
            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
                // The cached batch ends before pos: load the next TILE_SIZE exclusion tiles,
                // converted to linear indices (entries past the list are set to end).
                SYNC_WARPS;
                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                    ushort2 tile = exclusionTiles[skipBase+tgx];
                    skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
                else
                    skipTiles[get_local_id(0)] = end;
                skipBase += TILE_SIZE;
                currentSkipIndex = tbx;
                SYNC_WARPS;
            }
            while (skipTiles[currentSkipIndex] < pos)
                currentSkipIndex++;
            includeTile = (skipTiles[currentSkipIndex] != pos);
        }
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

            // Load atom data for this tile.

            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
            const unsigned int localAtomIndex = get_local_id(0);
#ifdef USE_CUTOFF
            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
            atomIndices[get_local_id(0)] = j;
            if (j < PADDED_NUM_ATOMS) {
                local_posq[localAtomIndex] = posq[j];
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                local_force[localAtomIndex] = 0;
                CLEAR_LOCAL_DERIVATIVES
            }
            SYNC_WARPS;
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
                local_posq[get_local_id(0)].x -= floor((local_posq[get_local_id(0)].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                local_posq[get_local_id(0)].y -= floor((local_posq[get_local_id(0)].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                local_posq[get_local_id(0)].z -= floor((local_posq[get_local_id(0)].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
                    real4 posq2 = local_posq[atom2];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
                        real tempEnergy = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
                        energy += tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
                        atom2 = tbx+tj;
                        local_force[atom2].xyz += delta.xyz;
                        RECORD_DERIVATIVE_2
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
                    real4 posq2 = local_posq[atom2];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real dEdR = 0;
                        real tempEnergy = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
                        energy += tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
                        atom2 = tbx+tj;
                        local_force[atom2].xyz += delta.xyz;
                        RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                    }
#endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }

            // Write results.

#ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[get_local_id(0)];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
            atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
            atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
            atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
            unsigned int offset = atom1;
            STORE_DERIVATIVES_1
            if (atom2 < PADDED_NUM_ATOMS) {
                atom_add(&forceBuffers[atom2], (long) (local_force[get_local_id(0)].x*0x100000000));
                atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
                atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
                offset = atom2;
                STORE_DERIVATIVES_2
            }
#else
            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
            forceBuffers[offset1].xyz += force.xyz;
            unsigned int offset = offset1;
            STORE_DERIVATIVES_1
            if (atom2 < PADDED_NUM_ATOMS) {
                forceBuffers[offset2] += (real4) (local_force[get_local_id(0)].x, local_force[get_local_id(0)].y, local_force[get_local_id(0)].z, 0.0f);
                offset = offset2;
                STORE_DERIVATIVES_2
            }
#endif
        }
        pos++;
    }
    energyBuffer[get_global_id(0)] += energy;
}
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
View file @
93c467b2
#
define
TILE_SIZE
32
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset1]
+=
deriv##INDEX##_1
;
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset2]
+=
local_deriv##INDEX[tgx]
;
#
define
STORE_DERIVATIVE_1
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
deriv##INDEX##_1*0x100000000
))
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
local_deriv##INDEX[tgx]*0x100000000
))
;
#
else
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset]
+=
deriv##INDEX##_1
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset]
+=
local_deriv##INDEX[tgx]
;
#
endif
/**
/**
*
Compute
a
force
based
on
pair
interactions.
*
Compute
a
force
based
on
pair
interactions.
*/
*/
__kernel
void
computeN2Energy
(
__kernel
void
computeN2Energy
(
__global
real4*
restrict
forceBuffers,
__global
real*
restrict
energyBuffer,
__local
real4*
restrict
local_force,
#
ifdef
SUPPORTS_64_BIT_ATOMICS
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
long*
restrict
forceBuffers,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
real4*
restrict
tempBuffer,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
#
else
#
else
unsigned
int
numTiles
__global
real4*
restrict
forceBuffers,
#
endif
#
endif
PARAMETER_ARGUMENTS
)
{
__global
real*
restrict
energyBuffer,
__local
real4*
restrict
local_force,
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
ushort2*
exclusionTiles,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
int*
restrict
interactingAtoms
unsigned
int
pos
=
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
#
else
#
else
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
numTiles
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
#
endif
PARAMETER_ARGUMENTS
)
{
real
energy
=
0
;
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
//
First
loop:
process
tiles
that
contain
exclusions.
//
Extract
the
coordinates
of
this
tile
unsigned
int
x,
y
;
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+get_group_id
(
0
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
#
ifdef
USE_CUTOFF
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
get_group_id
(
0
)
+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
if
(
numTiles
<=
maxTiles
)
{
for
(
int
pos
=
firstExclusionTile
; pos < lastExclusionTile; pos++) {
ushort2
tileIndices
=
tiles[pos]
;
const
ushort2
tileIndices
=
exclusionTiles[pos]
;
x
=
tileIndices.x
;
const
unsigned
int
x
=
tileIndices.x
;
y
=
tileIndices.y
;
const
unsigned
int
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
// Lo
cate the exclusion
data for this tile.
//
Lo
ad
the
data
for
this
tile.
#ifdef USE_EXCLUSIONS
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int exclusionStart = exclusionRowIndices[x];
unsigned
int
j
=
y*TILE_SIZE
+
localAtomIndex
;
unsigned int exclusionEnd = exclusionRowIndices[x+1];
local_posq[localAtomIndex]
=
posq[j]
;
int exclusionIndex = -1;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
for (int i = exclusionStart; i < exclusionEnd; i++)
if (exclusionIndices[i] == y) {
exclusionIndex = i*TILE_SIZE;
break;
}
bool hasExclusions = (exclusionIndex > -1);
#else
bool hasExclusions = false;
#endif
// Load the data for this tile if we don't already have it cached.
if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
}
}
if
(
x
==
y
)
{
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
//
This
tile
is
on
the
diagonal.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
#
ifdef
USE_EXCLUSIONS
#
ifdef
USE_EXCLUSIONS
unsigned int excl = exclusions[
exclusionIndex
+tgx];
unsigned
int
excl
=
exclusions[
pos*TILE_SIZE
+tgx]
;
#
endif
#
endif
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real4
force
=
0
;
real4
force
=
0
;
...
@@ -84,9 +56,6 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
...
@@ -84,9 +56,6 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
real4
posq1
=
posq[atom1]
;
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
real4
posq2
=
local_posq[j]
;
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
...
@@ -96,20 +65,23 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
...
@@ -96,20 +65,23 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
#
endif
real invR = RSQRT(r2);
real
invR
=
RSQRT
(
r2
)
;
real r = RECIP(invR);
real
r
=
RECIP
(
invR
)
;
unsigned int atom2 = j;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
atom2
=
y*TILE_SIZE+j
;
real dEdR = 0;
real
dEdR
=
0
;
real tempEnergy = 0;
real
tempEnergy
=
0
;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#
ifdef
USE_EXCLUSIONS
COMPUTE_INTERACTION
bool
isExcluded
=
!
(
excl
&
0x1
)
;
dEdR /= -r;
#
endif
}
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
energy += 0.5f*tempEnergy;
COMPUTE_INTERACTION
delta.xyz *= dEdR;
dEdR
/=
-r
;
force.xyz -= delta.xyz;
}
energy
+=
0.5f*tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
}
}
#
endif
#
endif
...
@@ -118,11 +90,19 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
...
@@ -118,11 +90,19 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
#
endif
#
endif
}
}
// Write results
//
Write
results
.
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#
ifdef
SUPPORTS_64_BIT_ATOMICS
forceBuffers[offset1].xyz += force.xyz;
unsigned
int
offset
=
atom1
;
atom_add
(
&forceBuffers[offset],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[offset+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[offset+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
STORE_DERIVATIVES_1
STORE_DERIVATIVES_1
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
STORE_DERIVATIVES_1
#
endif
}
}
}
}
else
{
else
{
...
@@ -132,60 +112,212 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
...
@@ -132,60 +112,212 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
local_force[localAtomIndex]
=
0
;
local_force[localAtomIndex]
=
0
;
CLEAR_LOCAL_DERIVATIVES
CLEAR_LOCAL_DERIVATIVES
}
}
#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
#
ifdef
USE_EXCLUSIONS
unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
unsigned
int
excl
=
exclusions[pos*TILE_SIZE+tgx]
;
if (!hasExclusions && (flags1 != 0xFFFFFFFF |
|
flags2
!=
0xFFFFFFFF
)
)
{
#
endif
//
Compute
only
a
subset
of
the
interactions
in
this
tile.
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real4
force
=
0
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
DECLARE_ATOM1_DERIVATIVES
if
((
flags2&
(
1<<tgx
))
!=
0
)
{
real4
posq1
=
posq[atom1]
;
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
LOAD_ATOM1_PARAMETERS
real
value
=
0
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
DECLARE_ATOM1_DERIVATIVES
real4
posq2
=
local_posq[j]
;
real4
posq1
=
posq[atom1]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
if
((
flags&
(
1<<j
))
!=
0
)
{
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
#
endif
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
if
(
r2
<
CUTOFF_SQUARED
)
{
#
ifdef
USE_CUTOFF
real
invR
=
RSQRT
(
r2
)
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
r
=
RECIP
(
invR
)
;
#
endif
unsigned
int
atom2
=
j
;
real
invR
=
RSQRT
(
r2
)
;
LOAD_ATOM2_PARAMETERS
real
r
=
RECIP
(
invR
)
;
atom2
=
y*TILE_SIZE+j
;
unsigned
int
atom2
=
j
;
real
dEdR
=
0
;
LOAD_ATOM2_PARAMETERS
real
tempEnergy
=
0
;
atom2
=
y*TILE_SIZE+j
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real
dEdR
=
0
;
COMPUTE_INTERACTION
real
tempEnergy
=
0
;
dEdR
/=
-r
;
#
ifdef
USE_EXCLUSIONS
}
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
|
| atom2 >= NUM_ATOMS || !(excl & 0x1));
energy
+=
tempEnergy
;
if (!isExcluded) {
delta.xyz
*=
dEdR
;
#else
force.xyz
-=
delta.xyz
;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
atom2
=
j
;
#endif
local_force[atom2].xyz
+=
delta.xyz
;
COMPUTE_INTERACTION
RECORD_DERIVATIVE_2
dEdR /= -r;
}
}
}
}
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
atom2 = j;
local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
}
// Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE+tgx;
atom_add(&forceBuffers[offset], (long) (local_force[tgx].x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000));
STORE_DERIVATIVES_2
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[tgx].xyz;
STORE_DERIVATIVES_2
#endif
}
}
}
//
Write
results
for
atom1.
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#ifdef USE_CUTOFF
global_value[offset]
+=
value
;
const unsigned int numTiles = interactionCount[0];
int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
int pos = get_group_id(0)*numTiles/get_num_groups(0);
int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
int nextToSkip = -1;
int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE];
while (pos < end) {
const bool isExcluded = false;
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y
+=
(
x
<
y
?
-1
:
1
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
//
Skip
over
tiles
that
have
exclusions,
since
they
were
already
processed.
while
(
nextToSkip
<
pos
)
{
if
(
currentSkipIndex
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles[currentSkipIndex++]
;
nextToSkip
=
tile.x
+
tile.y*NUM_BLOCKS
-
tile.y*
(
tile.y+1
)
/2
;
}
else
nextToSkip
=
end
;
}
includeTile
=
(
nextToSkip
!=
pos
)
;
}
if
(
includeTile
)
{
//
Load
the
data
for
this
tile.
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#
ifdef
USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms[pos*TILE_SIZE+localAtomIndex]
:
y*TILE_SIZE+localAtomIndex
)
;
#
else
unsigned
int
j
=
y*TILE_SIZE+localAtomIndex
;
#
endif
atomIndices[localAtomIndex]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
local_posq[localAtomIndex]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex]
=
0
;
CLEAR_LOCAL_DERIVATIVES
}
}
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
real4
blockCenterX
=
blockCenter[x]
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++)
local_posq[tgx].xyz
-=
floor
((
local_posq[tgx].xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real4
force
=
0
;
DECLARE_ATOM1_DERIVATIVES
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
if
(
atom1
<
NUM_ATOMS
&&
atomIndices[j]
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices[j]
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
COMPUTE_INTERACTION
dEdR
/=
-r
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
atom2
=
j
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
}
}
}
//
Write
results
for
atom1.
#
ifdef
SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
atom1
;
atom_add
(
&forceBuffers[offset],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[offset+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[offset+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
STORE_DERIVATIVES_1
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
STORE_DERIVATIVES_1
#
endif
}
}
}
}
else
else
#
endif
#
endif
{
{
//
Compute
the
full
set
of
interactions
in
this
tile
.
//
We
need
to
apply
periodic
boundary
conditions
separately
for
each
interaction
.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
...
@@ -193,13 +325,7 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
...
@@ -193,13 +325,7 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
DECLARE_ATOM1_DERIVATIVES
DECLARE_ATOM1_DERIVATIVES
real4
posq1
=
posq[atom1]
;
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionIndex+tgx]
:
0xFFFFFFFF
)
;
#
endif
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
real4
posq2
=
local_posq[j]
;
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
...
@@ -207,50 +333,67 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
...
@@ -207,50 +333,67 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
#
endif
#
endif
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
atomIndices[j]
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
atomIndices[j]
<
NUM_ATOMS
)
{
#
endif
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
atom2
=
atomIndices[j]
;
real
dEdR
=
0
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
COMPUTE_INTERACTION
dEdR
/=
-r
;
dEdR
/=
-r
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
atom2
=
j
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
}
}
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
atom2
=
j
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
#
ifdef
USE_CUTOFF
}
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
}
}
//
Write
results
for
atom1.
//
Write
results
for
atom1.
unsigned
int
offset1
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
ifdef
SUPPORTS_64_BIT_ATOMICS
forceBuffers[offset1].xyz
+=
force.xyz
;
unsigned
int
offset
=
atom1
;
atom_add
(
&forceBuffers[offset],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[offset+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[offset+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
STORE_DERIVATIVES_1
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
STORE_DERIVATIVES_1
STORE_DERIVATIVES_1
#
endif
}
}
}
}
//
Write
results
//
Write
results
.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
offset2
=
y*TILE_SIZE+tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
ifdef
USE_CUTOFF
forceBuffers[offset2].xyz
+=
local_force[tgx].xyz
;
unsigned
int
atom2
=
atomIndices[tgx]
;
STORE_DERIVATIVES_2
#
else
unsigned
int
atom2
=
y*TILE_SIZE
+
tgx
;
#
endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom2],
(
long
)
(
local_force[tgx].x*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+PADDED_NUM_ATOMS],
(
long
)
(
local_force[tgx].y*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+2*PADDED_NUM_ATOMS],
(
long
)
(
local_force[tgx].z*0x100000000
))
;
unsigned
int
offset
=
atom2
;
STORE_DERIVATIVES_2
#
else
unsigned
int
offset
=
atom2
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
local_force[tgx].xyz
;
STORE_DERIVATIVES_2
#
endif
}
}
}
}
}
lasty
=
y
;
pos++
;
pos++
;
}
}
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
...
...
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
deleted
100644 → 0
View file @
f6d4557d
// Number of atoms along each side of a tile.
#define TILE_SIZE 32

// STORE_DERIVATIVE_1(INDEX) / STORE_DERIVATIVE_2(INDEX) add the derivative accumulated for
// derivative number INDEX to the output buffers.  They expand inside the kernel and rely on
// names from the expansion site: offset1 / offset2 (destination indices), deriv<INDEX>_1
// (value held by the thread itself) and local_deriv<INDEX> (values held in a per-work-item array).
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
// Atomic variant: a single long buffer derivBuffers holds all derivatives, derivative INDEX
// starting at (INDEX-1)*PADDED_NUM_ATOMS; values are scaled by 0x100000000 before conversion to long.
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset1+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset2+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
#else
// Non-atomic variant: one buffer per derivative (derivBuffers<INDEX>).  Each store also folds in
// the entry TILE_SIZE positions further on (tempDerivBuffer<INDEX> / local_deriv<INDEX>).
// NOTE(review): presumably that entry is the partial sum of the partner thread in the second
// half of the work group -- confirm against the kernel body.
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1+tempDerivBuffer##INDEX[get_local_id(0)+TILE_SIZE];
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)]+local_deriv##INDEX[get_local_id(0)+TILE_SIZE];
#endif
/**
 * Compute a force based on pair interactions.
 *
 * Each work-group processes a contiguous range [pos, end) of tiles. A tile pairs the
 * TILE_SIZE atoms of block x (one "atom1" per value of tgx) with the TILE_SIZE atoms of
 * block y. Work-items with get_local_id(0) < TILE_SIZE handle the first TILE_SIZE/2
 * block-y atoms and the remaining work-items handle the second TILE_SIZE/2
 * (see baseLocalAtom); the two partial results are combined before being written.
 * Presumably WORK_GROUP_SIZE == 2*TILE_SIZE -- TODO confirm against the host code.
 *
 * forceBuffers         force accumulator: fixed-point longs (value * 0x100000000) updated
 *                      with atom_add when SUPPORTS_64_BIT_ATOMICS is defined, otherwise
 *                      real4 entries updated with +=
 * energyBuffer         energy accumulator, one entry per work-item (get_global_id(0))
 * local_force          local scratch for the forces on the block-y atoms
 * posq                 global per-atom data; only .xyz is read here as a position
 * local_posq           local copy of the posq entries of the block-y atoms
 * exclusions           exclusion bit masks, TILE_SIZE entries per tile with exclusions
 * exclusionIndices     for each stored exclusion tile, its y block index
 * exclusionRowIndices  for block x, entries [x] and [x+1] bound its range in exclusionIndices
 * tempForceBuffer      local scratch used to pass the second half's force to the first half
 * tiles                (USE_CUTOFF) list of (x, y) tiles to process
 * interactionCount     (USE_CUTOFF) element 0 is the number of tiles in `tiles`
 * periodicBoxSize      (USE_CUTOFF) box size used for periodic wrapping
 * invPeriodicBoxSize   (USE_CUTOFF) reciprocal box size used for periodic wrapping
 * maxTiles             (USE_CUTOFF) if interactionCount[0] exceeds this, `tiles` is ignored
 *                      and all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are enumerated instead
 * numTiles             (no cutoff) total number of tiles to enumerate
 */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers,
#else
        __global real4* restrict forceBuffers,
#endif
        __global real* restrict energyBuffer, __local real4* restrict local_force,
        __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
        __local real4* restrict tempForceBuffer,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
    // Split the tile list evenly between work-groups.
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
    real energy = 0;
    // y block of the previously processed tile; lets an off-diagonal tile reuse local_posq.
    unsigned int lasty = 0xFFFFFFFF;
    __local unsigned int exclusionRange[2];
    __local int exclusionIndex[1];
    DECLARE_TEMP_BUFFERS

    while (pos < end) {
        // Extract the coordinates of this tile
        unsigned int x, y;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            y = tileIndices.y;
        }
        else
#endif
        {
            // Invert the linear index pos into (x, y) with y <= x < NUM_BLOCKS.
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }
        }
        // First block-y atom handled by this half of the work-group.
        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
        // Selects which half of local_force this work-item accumulates into.
        unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 force = 0;
        DECLARE_ATOM1_DERIVATIVES
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS

        // Locate the exclusion data for this tile.

#ifdef USE_EXCLUSIONS
        if (get_local_id(0) < 2)
            exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
        if (tgx == 0)
            exclusionIndex[0] = -1;
        barrier(CLK_LOCAL_MEM_FENCE);
        // Search row x for an entry whose y matches; exclusionIndex[0] stays -1 if none exists.
        for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE)
            if (exclusionIndices[i] == y)
                exclusionIndex[0] = i*TILE_SIZE;
        barrier(CLK_LOCAL_MEM_FENCE);
        bool hasExclusions = (exclusionIndex[0] > -1);
#endif
        if (x == y) {
            // This tile is on the diagonal.

            const unsigned int localAtomIndex = get_local_id(0);
            local_posq[localAtomIndex] = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
            barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
            // Shift so that bit 0 corresponds to the first atom handled by this half.
            unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
#endif
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
                bool isExcluded = !(excl & 0x1);
#endif
                int atom2 = baseLocalAtom+j;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                real invR = RSQRT(r2);
                real r = RECIP(invR);
                LOAD_ATOM2_PARAMETERS
                // From here on atom2 is the global atom index rather than the local one.
                atom2 = y*TILE_SIZE+baseLocalAtom+j;
                real dEdR = 0;
                real tempEnergy = 0;
                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                    COMPUTE_INTERACTION
                    dEdR /= -r;
                }
                // Presumably halved because each pair of a diagonal tile is visited once from each atom.
                energy += 0.5f*tempEnergy;
                delta.xyz *= dEdR;
                force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
                }
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
            }

            // Sum the forces and write results.

            if (get_local_id(0) >= TILE_SIZE) {
                tempForceBuffer[get_local_id(0)] = force;
                SET_TEMP_BUFFERS
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
                const unsigned int offset1 = x*TILE_SIZE + tgx;
                atom_add(&forceBuffers[offset1], (long) ((force.x + tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0x100000000));
                atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0x100000000));
                atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                const unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
                forceBuffers[offset1].xyz += force.xyz + tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
#endif
                STORE_DERIVATIVES_1
            }
        }
        else {
            // This is an off-diagonal tile.

            const unsigned int localAtomIndex = get_local_id(0);
            // Reload the block-y atoms only if the previous tile used a different y.
            if (lasty != y && get_local_id(0) < TILE_SIZE) {
                unsigned int j = y*TILE_SIZE + tgx;
                local_posq[localAtomIndex] = posq[j];
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            }
            local_force[localAtomIndex] = 0;
            CLEAR_LOCAL_DERIVATIVES
            barrier(CLK_LOCAL_MEM_FENCE);

            // Compute the full set of interactions in this tile.

#ifdef USE_EXCLUSIONS
            unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
            // Keep the 16 bits for this half, duplicate them into the upper 16 bits, then
            // rotate by tgx so that bit 0 corresponds to the starting value of tj.
            // NOTE(review): for tgx == 0 the left shift count equals TILE_SIZE; this relies on
            // OpenCL C masking shift counts to the operand width -- confirm TILE_SIZE is 32.
            excl = (excl >> baseLocalAtom) & 0xFFFF;
            excl += excl << 16;
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
            // Each work-item starts at a different block-y atom and advances cyclically.
            unsigned int tj = tgx%(TILE_SIZE/2);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
                bool isExcluded = !(excl & 0x1);
#endif
                int atom2 = baseLocalAtom+tj;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                real invR = RSQRT(r2);
                real r = RECIP(invR);
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+baseLocalAtom+tj;
                real dEdR = 0;
                real tempEnergy = 0;
                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                    COMPUTE_INTERACTION
                    dEdR /= -r;
                }
                energy += tempEnergy;
                delta.xyz *= dEdR;
                force.xyz -= delta.xyz;
                // atom2 now indexes the local accumulation slot (possibly read by RECORD_DERIVATIVE_2).
                atom2 = baseLocalAtom+tj+forceBufferOffset;
                local_force[baseLocalAtom+tj+forceBufferOffset].xyz += delta.xyz;
                RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                }
#endif
                barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
                tj = (tj+1)%(TILE_SIZE/2);
            }

            // Sum the forces and write results.

            if (get_local_id(0) >= TILE_SIZE) {
                tempForceBuffer[get_local_id(0)] = force;
                SET_TEMP_BUFFERS
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
                const unsigned int offset1 = x*TILE_SIZE + tgx;
                const unsigned int offset2 = y*TILE_SIZE + tgx;
                atom_add(&forceBuffers[offset1], (long) ((force.x+tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0x100000000));
                atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y+tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0x100000000));
                atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z+tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0x100000000));
                atom_add(&forceBuffers[offset2], (long) ((local_force[get_local_id(0)].x+local_force[get_local_id(0)+TILE_SIZE].x)*0x100000000));
                atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].y+local_force[get_local_id(0)+TILE_SIZE].y)*0x100000000));
                atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].z+local_force[get_local_id(0)+TILE_SIZE].z)*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
                forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
                forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
#endif
                STORE_DERIVATIVES_1
                STORE_DERIVATIVES_2
            }
        }
        lasty = y;
        pos++;
    }
    energyBuffer[get_global_id(0)] += energy;
}
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
deleted
100644 → 0
View file @
f6d4557d
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
// Atomic variants: derivatives for every parameter share one `derivBuffers` array of
// longs, with parameter INDEX (1-based) stored at offset (INDEX-1)*PADDED_NUM_ATOMS.
// Values are converted to fixed point by multiplying by 0x100000000 before atom_add.
// `offset` must be in scope where the macro is expanded.
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
#else
// Non-atomic variants: each parameter INDEX has its own derivBuffers<INDEX> array,
// updated with a plain += at `offset` (which must be in scope at the expansion site).
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
#endif
// Number of atoms per block; the kernel below also uses it as the number of
// consecutive work-items that cooperate on one tile.
#define TILE_SIZE 32
/**
 * Compute a force based on pair interactions.
 *
 * Work-items are grouped into "warps" of TILE_SIZE consecutive work-items. Each warp
 * processes its own contiguous range [pos, end) of tiles; a tile pairs the TILE_SIZE
 * atoms of block x (one "atom1" per work-item) with the TILE_SIZE atoms of block y.
 *
 * forceBuffers         force accumulator: fixed-point longs (value * 0x100000000) updated
 *                      with atom_add when SUPPORTS_64_BIT_ATOMICS is defined, otherwise
 *                      real4 entries updated with +=
 * energyBuffer         energy accumulator, one entry per work-item (get_global_id(0))
 * local_force          local scratch for the forces on the block-y atoms
 * posq                 global per-atom data; only .xyz is read here as a position
 * local_posq           local copy of the posq entries of the block-y atoms
 * exclusions           exclusion bit masks, TILE_SIZE entries per tile with exclusions
 * exclusionIndices     for each stored exclusion tile, its y block index
 * exclusionRowIndices  for block x, entries [x] and [x+1] bound its range in exclusionIndices
 * tempBuffer           not referenced directly in this kernel body
 * tiles                (USE_CUTOFF) list of (x, y) tiles to process
 * interactionCount     (USE_CUTOFF) element 0 is the number of tiles in `tiles`
 * periodicBoxSize      (USE_CUTOFF) box size used for periodic wrapping
 * invPeriodicBoxSize   (USE_CUTOFF) reciprocal box size used for periodic wrapping
 * maxTiles             (USE_CUTOFF) if interactionCount[0] exceeds this, `tiles` is ignored
 *                      and all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are enumerated instead
 * interactionFlags     (USE_CUTOFF) per-tile flags; a tile with flags == 0 and no
 *                      exclusions is skipped
 * numTiles             (no cutoff) total number of tiles to enumerate
 */
__kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers,
#else
        __global real4* restrict forceBuffers,
#endif
        __global real* restrict energyBuffer, __local real4* restrict local_force,
        __global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
        __local real4* restrict tempBuffer,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
    // Split the tile list evenly between warps.
    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
    unsigned int pos = warp*numTiles/totalWarps;
    unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
    real energy = 0;
    // y block of the previously processed tile; lets an off-diagonal tile reuse local_posq.
    unsigned int lasty = 0xFFFFFFFF;
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];
    // The same local storage is reused as one (x, y) reservation per warp in the
    // non-atomic write-back phase below.
    __local int2* reservedBlocks = (__local int2*) exclusionRange;

    do {
        // Extract the coordinates of this tile
        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);   // index within the warp
        const unsigned int tbx = get_local_id(0) - tgx;              // local id of the warp's first work-item
        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE; // warp index within the work-group
        unsigned int x, y;
        real4 force = 0;
        DECLARE_ATOM1_DERIVATIVES
        if (pos < end) {
#ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                ushort2 tileIndices = tiles[pos];
                x = tileIndices.x;
                y = tileIndices.y;
            }
            else
#endif
            {
                // Invert the linear index pos into (x, y) with y <= x < NUM_BLOCKS.
                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                    y += (x < y ? -1 : 1);
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
            unsigned int atom1 = x*TILE_SIZE + tgx;
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS

            // Locate the exclusion data for this tile.

#ifdef USE_EXCLUSIONS
            // No barrier is issued between these local writes and reads; the code relies on
            // the work-items of one warp seeing each other's local-memory writes.
            if (tgx < 2)
                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
            if (tgx == 0)
                exclusionIndex[localGroupIndex] = -1;
            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
                if (exclusionIndices[i] == y)
                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
            bool hasExclusions = false;
#endif
            if (pos >= end)
                ; // This warp is done.
            else if (x == y) {
                // This tile is on the diagonal.

                const unsigned int localAtomIndex = get_local_id(0);
                local_posq[localAtomIndex] = posq1;
                LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
                    bool isExcluded = !(excl & 0x1);
#endif
                    int atom2 = tbx+j;
                    real4 posq2 = local_posq[atom2];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
                    // From here on atom2 is the global atom index rather than the local one.
                    atom2 = y*TILE_SIZE+j;
                    real dEdR = 0;
                    real tempEnergy = 0;
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }
                    // Presumably halved because each pair of a diagonal tile is visited once from each atom.
                    energy += 0.5f*tempEnergy;
                    delta.xyz *= dEdR;
                    force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
                    }
#endif
#ifdef USE_EXCLUSIONS
                    excl >>= 1;
#endif
                }
            }
            else {
                // This is an off-diagonal tile.

                const unsigned int localAtomIndex = get_local_id(0);
                // Reload the block-y atoms only if the previous tile used a different y.
                if (lasty != y) {
                    unsigned int j = y*TILE_SIZE + tgx;
                    local_posq[localAtomIndex] = posq[j];
                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                }
                local_force[localAtomIndex] = 0;
                CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
                if (!hasExclusions && flags == 0) {
                    // No interactions in this tile.
                }
                else
#endif
                {
                    // Compute the full set of interactions in this tile.

#ifdef USE_EXCLUSIONS
                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
                    // Rotate by tgx so that bit 0 corresponds to the starting value of tj.
                    // NOTE(review): for tgx == 0 the left shift count equals TILE_SIZE (32); this
                    // relies on OpenCL C masking shift counts to the operand width.
                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
                    // Each work-item starts at a different block-y atom and advances cyclically.
                    unsigned int tj = tgx;
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
                        bool isExcluded = !(excl & 0x1);
#endif
                        int atom2 = tbx+tj;
                        real4 posq2 = local_posq[atom2];
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                        if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+tj;
                        real dEdR = 0;
                        real tempEnergy = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
                        energy += tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
                        // atom2 switches back to the local index for the accumulation below.
                        atom2 = tbx+tj;
                        local_force[atom2].xyz += delta.xyz;
                        RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                        }
#endif
#ifdef USE_EXCLUSIONS
                        excl >>= 1;
#endif
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
            }
        }
        lasty = y;

        // Write results.  We need to coordinate between warps to make sure no two of them
        // ever try to write to the same piece of memory at the same time.

#ifdef SUPPORTS_64_BIT_ATOMICS
        if (pos < end) {
            const unsigned int offset = x*TILE_SIZE + tgx;
            atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
            STORE_DERIVATIVES_1
        }
        if (pos < end && x != y) {
            const unsigned int offset = y*TILE_SIZE + tgx;
            atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0x100000000));
            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
            STORE_DERIVATIVES_2
        }
#else
        // Publish the blocks this warp wants to write; -1 means "nothing to write".
        int writeX = (pos < end ? x : -1);
        int writeY = (pos < end && x != y ? y : -1);
        if (tgx == 0)
            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
        bool done = false;
        int doneIndex = 0;
        int checkIndex = 0;
        // NOTE(review): the barrier below is reached only by warps still inside the enclosing
        // do/while; confirm that all warps of a work-group execute the same number of outer
        // iterations, since [pos, end) is computed per warp.
        while (true) {
            // See if any warp still needs to write its data.

            bool allDone = true;
            barrier(CLK_LOCAL_MEM_FENCE);
            while (doneIndex < WARPS_PER_GROUP && allDone) {
                if (reservedBlocks[doneIndex].x != -1)
                    allDone = false;
                else
                    doneIndex++;
            }
            if (allDone)
                break;
            if (!done) {
                // See whether this warp can write its data.  This requires that no previous warp
                // is trying to write to the same block of the buffer.

                bool canWrite = (writeX != -1);
                while (checkIndex < localGroupIndex && canWrite) {
                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
                        canWrite = false;
                    else
                        checkIndex++;
                }
                if (canWrite) {
                    // Write the data to global memory, then mark this warp as done.

                    if (writeX > -1) {
                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                        forceBuffers[offset].xyz += force.xyz;
                        STORE_DERIVATIVES_1
                    }
                    if (writeY > -1) {
                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                        forceBuffers[offset].xyz += local_force[get_local_id(0)].xyz;
                        STORE_DERIVATIVES_2
                    }
                    done = true;
                    if (tgx == 0)
                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
                }
            }
        }
#endif
        pos++;
    } while (pos < end);
    energyBuffer[get_global_id(0)] += energy;
}
platforms/opencl/src/kernels/customGBValueN2.cl
0 → 100644
View file @
93c467b2
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
#endif

/**
 * Compute a value based on pair interactions.
 *
 * Work-items are grouped into "warps" of TILE_SIZE consecutive work-items, and each warp
 * processes whole tiles (TILE_SIZE atoms of block x against TILE_SIZE atoms of block y).
 * The kernel runs two loops:
 *   1. tiles listed in exclusionTiles, which carry an exclusion mask per atom1;
 *   2. all other tiles, taken from the neighbor list (`tiles`) when USE_CUTOFF is defined
 *      and the list did not overflow, or enumerated by linear index otherwise.
 *
 * posq               global per-atom data; only .xyz is read here as a position
 * local_posq         local copy of the posq entries of the second set of atoms
 * exclusions         exclusion bit masks, TILE_SIZE entries per entry of exclusionTiles
 * exclusionTiles     (x, y) block indices of the tiles that have exclusions
 * global_value       output accumulator: fixed-point longs (value * 0x100000000) updated
 *                    with atom_add when SUPPORTS_64_BIT_ATOMICS is defined, otherwise
 *                    reals updated with += in a per-warp slice of PADDED_NUM_ATOMS entries
 * local_value        local accumulator for the contributions to the second set of atoms
 * tiles              (USE_CUTOFF) neighbor list; .x is the block index, .y is read as the
 *                    singlePeriodicCopy flag
 * interactionCount   (USE_CUTOFF) element 0 is the number of entries in `tiles`
 * periodicBoxSize    (USE_CUTOFF) box size used for periodic wrapping
 * invPeriodicBoxSize (USE_CUTOFF) reciprocal box size used for periodic wrapping
 * maxTiles           (USE_CUTOFF) if interactionCount[0] exceeds this, the neighbor list is
 *                    ignored and all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are enumerated
 * blockCenter        (USE_CUTOFF) per-block center used when translating atoms into a
 *                    single periodic copy
 * interactingAtoms   (USE_CUTOFF) TILE_SIZE atom indices per neighbor-list entry; an index
 *                    >= PADDED_NUM_ATOMS is treated as "no atom"
 * numTiles           (no cutoff) total number of tiles to enumerate
 */
__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
        __global const ushort2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict global_value,
#else
        __global real* restrict global_value,
#endif
        __local real* restrict local_value,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    const unsigned int warp = get_global_id(0)/TILE_SIZE;
    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);   // index within the warp
    const unsigned int tbx = get_local_id(0) - tgx;              // local id of the warp's first work-item

    // First loop: process tiles that contain exclusions.

    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real value = 0;
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
        if (x == y) {
            // This tile is on the diagonal.

            const unsigned int localAtomIndex = get_local_id(0);
            local_posq[localAtomIndex] = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                real invR = RSQRT(r2);
                real r = RECIP(invR);
                LOAD_ATOM2_PARAMETERS
                // From here on atom2 is the global atom index rather than the local one.
                atom2 = y*TILE_SIZE+j;
                real tempValue1 = 0;
                real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
                if (!isExcluded && atom1 != atom2) {
#else
                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
                    COMPUTE_VALUE
                }
                value += tempValue1;
#ifdef USE_CUTOFF
                }
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
                SYNC_WARPS;
            }
        }
        else {
            // This is an off-diagonal tile.

            const unsigned int localAtomIndex = get_local_id(0);
            unsigned int j = y*TILE_SIZE + tgx;
            local_posq[localAtomIndex] = posq[j];
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            local_value[localAtomIndex] = 0;
            SYNC_WARPS;
#ifdef USE_EXCLUSIONS
            // Rotate by tgx so that bit 0 corresponds to the starting value of tj.
            // NOTE(review): for tgx == 0 the left shift count equals TILE_SIZE; this relies on
            // OpenCL C masking shift counts to the operand width -- confirm TILE_SIZE is 32.
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
            // Each work-item starts at a different atom and advances cyclically.
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                real invR = RSQRT(r2);
                real r = RECIP(invR);
                LOAD_ATOM2_PARAMETERS
                atom2 = y*TILE_SIZE+tj;
                real tempValue1 = 0;
                real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
                bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
                if (!isExcluded) {
#else
                if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                    COMPUTE_VALUE
                }
                value += tempValue1;
                local_value[tbx+tj] += tempValue2;
#ifdef USE_CUTOFF
                }
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
                tj = (tj + 1) & (TILE_SIZE - 1);
                SYNC_WARPS;
            }
        }

        // Write results.

#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
        atom_add(&global_value[offset], (long) (value*0x100000000));
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
            atom_add(&global_value[offset], (long) (local_value[get_local_id(0)]*0x100000000));
        }
#else
        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        global_value[offset1] += value;
        if (x != y)
            global_value[offset2] += local_value[get_local_id(0)];
#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
    int pos = warp*numTiles/totalWarps;
    int end = (warp+1)*numTiles/totalWarps;
#endif
    // skipTiles caches, per warp, the linear indices of the next TILE_SIZE exclusion tiles;
    // skipBase is the position in exclusionTiles of the next batch to load.
    int skipBase = 0;
    int currentSkipIndex = tbx;
    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
    __local int skipTiles[FORCE_WORK_GROUP_SIZE];
    skipTiles[get_local_id(0)] = -1;

    while (pos < end) {
        real value = 0;
        bool includeTile = true;

        // Extract the coordinates of this tile.

        unsigned int x, y;
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            // Neighbor-list entry: y is left unset; the second set of atoms comes from
            // interactingAtoms below.
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            singlePeriodicCopy = tileIndices.y;
        }
        else
#endif
        {
            // Invert the linear index pos into (x, y) with y <= x < NUM_BLOCKS.
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }

            // Skip over tiles that have exclusions, since they were already processed.

            SYNC_WARPS;
            // Refill the cache until its last entry reaches pos; slots past the end of
            // exclusionTiles are filled with `end` so the scan below terminates.
            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
                SYNC_WARPS;
                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                    ushort2 tile = exclusionTiles[skipBase+tgx];
                    skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
                else
                    skipTiles[get_local_id(0)] = end;
                skipBase += TILE_SIZE;
                currentSkipIndex = tbx;
                SYNC_WARPS;
            }
            while (skipTiles[currentSkipIndex] < pos)
                currentSkipIndex++;
            includeTile = (skipTiles[currentSkipIndex] != pos);
        }
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

            // Load atom data for this tile.

            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
            const unsigned int localAtomIndex = get_local_id(0);
#ifdef USE_CUTOFF
            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
            atomIndices[get_local_id(0)] = j;
            if (j < PADDED_NUM_ATOMS) {
                local_posq[localAtomIndex] = posq[j];
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                local_value[localAtomIndex] = 0;
            }
            SYNC_WARPS;
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
                local_posq[get_local_id(0)].x -= floor((local_posq[get_local_id(0)].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                local_posq[get_local_id(0)].y -= floor((local_posq[get_local_id(0)].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                local_posq[get_local_id(0)].z -= floor((local_posq[get_local_id(0)].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
                    real4 posq2 = local_posq[atom2];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real tempValue1 = 0;
                        real tempValue2 = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_VALUE
                        }
                        value += tempValue1;
                        local_value[tbx+tj] += tempValue2;
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
                    real4 posq2 = local_posq[atom2];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = atomIndices[tbx+tj];
                    real tempValue1 = 0;
                    real tempValue2 = 0;
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                        COMPUTE_VALUE
                    }
                    value += tempValue1;
                    local_value[tbx+tj] += tempValue2;
#ifdef USE_CUTOFF
                    }
#endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }

            // Write results.

#ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[get_local_id(0)];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
            atom_add(&global_value[atom1], (long) (value*0x100000000));
            if (atom2 < PADDED_NUM_ATOMS)
                atom_add(&global_value[atom2], (long) (local_value[get_local_id(0)]*0x100000000));
#else
            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
            global_value[offset1] += value;
            if (atom2 < PADDED_NUM_ATOMS)
                global_value[offset2] += local_value[get_local_id(0)];
#endif
        }
        pos++;
    }
}
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
View file @
93c467b2
#
define
TILE_SIZE
32
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
/**
/**
*
Compute
a
value
based
on
pair
interactions.
*
Compute
a
value
based
on
pair
interactions.
*/
*/
__kernel
void
computeN2Value
(
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__kernel
void
computeN2Value
(
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__global
real*
restrict
global_value,
__local
real*
restrict
local_value,
__global
const
ushort2*
exclusionTiles,
__local
real*
restrict
tempBuffer,
#
ifdef
SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
global_value,
#
else
__global
real*
restrict
global_value,
#
endif
__local
real*
restrict
local_value,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interacti
onFlag
s
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
int*
restrict
interacti
ngAtom
s
#
else
#
else
unsigned
int
numTiles
unsigned
int
numTiles
#
endif
#
endif
PARAMETER_ARGUMENTS
)
{
PARAMETER_ARGUMENTS
)
{
//
First
loop:
process
tiles
that
contain
exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+get_group_id
(
0
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
get_group_id
(
0
)
+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
for
(
int
pos
=
firstExclusionTile
; pos < lastExclusionTile; pos++) {
const
ushort2
tileIndices
=
exclusionTiles[pos]
;
const
unsigned
int
x
=
tileIndices.x
;
const
unsigned
int
y
=
tileIndices.y
;
//
Load
the
data
for
this
tile.
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned
int
j
=
y*TILE_SIZE
+
localAtomIndex
;
local_posq[localAtomIndex]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[pos*TILE_SIZE+tgx]
;
#
endif
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real
value
=
0
;
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
if
(
r2
<
CUTOFF_SQUARED
)
{
unsigned
int
pos
=
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
#
endif
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
|
| atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded && atom1 != atom2) {
#else
#else
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#endif
#endif
unsigned
int
lasty
=
0xFFFFFFFF
;
COMPUTE_VALUE
}
while
(
pos
<
end
)
{
value += tempValue1;
//
Extract
the
coordinates
of
this
tile
unsigned
int
x,
y
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
}
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#endif
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
unsigned int exclusionStart = exclusionRowIndices[x];
excl >>= 1;
unsigned int exclusionEnd = exclusionRowIndices[x+1];
int exclusionIndex = -1;
for (int i = exclusionStart; i < exclusionEnd; i++)
if (exclusionIndices[i] == y) {
exclusionIndex = i*TILE_SIZE;
break;
}
bool hasExclusions = (exclusionIndex > -1);
#else
bool hasExclusions = false;
#endif
#endif
}
// Load the data for this tile if we don't already have it cached
.
// Write results
.
if (lasty != y) {
#ifdef SUPPORTS_64_BIT_ATOMICS
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
atom_add(&global_value[atom1], (long) (value*0x100000000));
unsigned int j = y*TILE_SIZE + localAtomIndex;
#else
local_posq[localAtomIndex] = posq[j];
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
global_value[offset] += value;
#endif
}
}
}
}
if (x == y)
{
else
{
// This
tile
is
o
n
the
diagonal.
// This is
a
n
off-
diagonal
tile
.
for (int tgx = 0; tgx < TILE_SIZE; tgx++)
local_value[tgx] = 0;
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[
exclusionIndex
+tgx];
unsigned int excl = exclusions[
pos*TILE_SIZE
+tgx];
#endif
#endif
unsigned int atom1 = x*TILE_SIZE+tgx;
unsigned int atom1 = x*TILE_SIZE+tgx;
real value = 0;
real value = 0;
real4 posq1 = posq[atom1];
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
real4 posq2 = local_posq[j];
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
...
@@ -92,21 +114,23 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
...
@@ -92,21 +114,23 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
if (r2 < CUTOFF_SQUARED) {
#endif
#endif
real invR = RSQRT(r2);
real invR = RSQRT(r2);
real r = RECIP(invR);
real r = RECIP(invR);
unsigned int atom2 = j;
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
atom2 = y*TILE_SIZE+j;
real tempValue1 = 0;
real tempValue1 = 0;
real tempValue2 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded) {
#else
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
#endif
COMPUTE_VALUE
COMPUTE_VALUE
}
}
value += tempValue1;
value += tempValue1;
local_value[j] += tempValue2;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
}
}
#endif
#endif
...
@@ -115,78 +139,148 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
...
@@ -115,78 +139,148 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif
#endif
}
}
// Write results
// Write results
for atom1.
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_value[atom1], (long) (value*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
global_value[offset] += value;
#endif
}
// Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE+tgx;
atom_add(&global_value[offset], (long) (local_value[tgx]*0x100000000));
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += local_value[tgx];
#endif
}
}
}
}
else {
}
// This is an off-diagonal tile.
for (int tgx = 0; tgx < TILE_SIZE; tgx++)
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
local_value[tgx] = 0;
// of them (no cutoff).
#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
if (!hasExclusions && (flags1 != 0xFFFFFFFF |
|
flags2
!=
0xFFFFFFFF
)
)
{
//
Compute
only
a
subset
of
the
interactions
in
this
tile.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
#ifdef USE_CUTOFF
if
((
flags2&
(
1<<tgx
))
!=
0
)
{
const unsigned int numTiles = interactionCount[0];
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
real
value
=
0
;
int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
real4
posq1
=
posq[atom1]
;
#else
LOAD_ATOM1_PARAMETERS
int pos = get_group_id(0)*numTiles/get_num_groups(0);
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
if
((
flags&
(
1<<j
))
!=
0
)
{
#endif
real4
posq2
=
local_posq[j]
;
int nextToSkip = -1;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE];
while (pos < end) {
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y
+=
(
x
<
y
?
-1
:
1
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
//
Skip
over
tiles
that
have
exclusions,
since
they
were
already
processed.
while
(
nextToSkip
<
pos
)
{
if
(
currentSkipIndex
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles[currentSkipIndex++]
;
nextToSkip
=
tile.x
+
tile.y*NUM_BLOCKS
-
tile.y*
(
tile.y+1
)
/2
;
}
else
nextToSkip
=
end
;
}
includeTile
=
(
nextToSkip
!=
pos
)
;
}
if
(
includeTile
)
{
//
Load
the
data
for
this
tile.
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#
ifdef
USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms[pos*TILE_SIZE+localAtomIndex]
:
y*TILE_SIZE+localAtomIndex
)
;
#
else
unsigned
int
j
=
y*TILE_SIZE+localAtomIndex
;
#
endif
atomIndices[localAtomIndex]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
local_posq[localAtomIndex]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_value[localAtomIndex]
=
0
;
}
}
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
if
(
singlePeriodicCopy
)
{
#
endif
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
real
tempValue1
=
0
;
real
tempValue2
=
0
;
real4
blockCenterX
=
blockCenter[x]
;
if
(
r2
<
CUTOFF_SQUARED
)
{
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++)
real
invR
=
RSQRT
(
r2
)
;
local_posq[tgx].xyz
-=
floor
((
local_posq[tgx].xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
real
r
=
RECIP
(
invR
)
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom2
=
j
;
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
LOAD_ATOM2_PARAMETERS
real
value
=
0
;
atom2
=
y*TILE_SIZE+j
;
real4
posq1
=
posq[atom1]
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
LOAD_ATOM1_PARAMETERS
COMPUTE_VALUE
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
}
real4
posq2
=
local_posq[j]
;
value
+=
tempValue1
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
local_value[j]
+=
tempValue2
;
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
}
if
(
atom1
<
NUM_ATOMS
&&
atomIndices[j]
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
}
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices[j]
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
COMPUTE_VALUE
value
+=
tempValue1
;
local_value[j]
+=
tempValue2
;
}
}
}
//
Write
results
for
atom1.
//
Write
results
for
atom1.
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
ifdef
SUPPORTS_64_BIT_ATOMICS
global_value[offset]
+=
value
;
atom_add
(
&global_value[atom1],
(
long
)
(
value*0x100000000
))
;
}
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
value
;
#
endif
}
}
}
}
else
else
#
endif
#
endif
{
{
//
Compute
the
full
set
of
interactions
in
this
tile
.
//
We
need
to
apply
periodic
boundary
conditions
separately
for
each
interaction
.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real
value
=
0
;
real
value
=
0
;
real4
posq1
=
posq[atom1]
;
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
LOAD_ATOM1_PARAMETERS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionIndex+tgx]
:
0xFFFFFFFF
)
;
#
endif
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
real4
posq2
=
local_posq[j]
;
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
...
@@ -194,47 +288,52 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
...
@@ -194,47 +288,52 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#
endif
#
endif
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
atomIndices[j]
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#
ifdef
USE_EXCLUSIONS
if
(
!isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#
else
#
else
if
(
atom1
<
NUM_ATOMS
&&
atom
2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom
Indices[j]
<
NUM_ATOMS
)
{
#
endif
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices[j]
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
COMPUTE_VALUE
COMPUTE_VALUE
value
+=
tempValue1
;
local_value[j]
+=
tempValue2
;
}
}
value
+=
tempValue1
;
local_value[j]
+=
tempValue2
;
#
ifdef
USE_CUTOFF
}
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
}
}
//
Write
results
for
atom1.
//
Write
results
for
atom1.
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&global_value[atom1],
(
long
)
(
value*0x100000000
))
;
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
value
;
global_value[offset]
+=
value
;
#
endif
}
}
}
}
//
Write
results
//
Write
results
.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
offset
=
y*TILE_SIZE+tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
ifdef
USE_CUTOFF
global_value[offset]
+=
local_value[tgx]
;
unsigned
int
atom2
=
atomIndices[tgx]
;
#
else
unsigned
int
atom2
=
y*TILE_SIZE
+
tgx
;
#
endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&global_value[atom2],
(
long
)
(
local_value[tgx]*0x100000000
))
;
#
else
unsigned
int
offset
=
atom2
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
local_value[tgx]
;
#
endif
}
}
}
}
}
lasty
=
y
;
pos++
;
pos++
;
}
}
}
}
platforms/opencl/src/kernels/customGBValueN2_default.cl
deleted
100644 → 0
View file @
f6d4557d
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#
define
TILE_SIZE
32
/**
*
Compute
a
value
based
on
pair
interactions.
*/
__kernel
__attribute__
((
reqd_work_group_size
(
WORK_GROUP_SIZE,
1
,
1
)))
void
computeN2Value
(
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
#
ifdef
SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
global_value,
#
else
__global
real*
restrict
global_value,
#
endif
__local
real*
restrict
local_value,
__local
real*
restrict
tempBuffer,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles
#
else
unsigned
int
numTiles
#
endif
PARAMETER_ARGUMENTS
)
{
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
#
else
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2]
;
__local
int
exclusionIndex[1]
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x,
y
;
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int valueBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
unsigned int atom1 = x*TILE_SIZE + tgx;
real value = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (get_local_id(0) < 2)
exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
if (tgx == 0)
exclusionIndex[0] = -1;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[0] = i*TILE_SIZE;
barrier(CLK_LOCAL_MEM_FENCE);
bool hasExclusions = (exclusionIndex[0] > -1);
#endif
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
#endif
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+j;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+j;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
// Sum the values and write results.
if (get_local_id(0) >= TILE_SIZE)
tempBuffer[get_local_id(0)] = value;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&global_value[offset], (long) ((value + tempBuffer[get_local_id(0)+TILE_SIZE])*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_value[offset] += value + tempBuffer[get_local_id(0)+TILE_SIZE];
#endif
}
}
else {
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_value[get_local_id(0)] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> baseLocalAtom) & 0xFFFF;
excl += excl << 16;
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#
endif
unsigned
int
tj
=
tgx%
(
TILE_SIZE/2
)
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE/2; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
int
atom2
=
baseLocalAtom+tj
;
real4
posq2
=
local_posq[atom2]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+baseLocalAtom+tj
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#
ifdef
USE_EXCLUSIONS
if
(
!isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#
endif
COMPUTE_VALUE
}
value
+=
tempValue1
;
local_value[baseLocalAtom+tj+valueBufferOffset]
+=
tempValue2
;
#
ifdef
USE_CUTOFF
}
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
tj
=
(
tj+1
)
%
(
TILE_SIZE/2
)
;
}
//
Sum
the
values
and
write
results.
if
(
get_local_id
(
0
)
>=
TILE_SIZE
)
tempBuffer[get_local_id
(
0
)
]
=
value
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
#
ifdef
SUPPORTS_64_BIT_ATOMICS
const
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
;
const
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
;
atom_add
(
&global_value[offset1],
(
long
)
((
value
+
tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
)
*0x100000000
))
;
atom_add
(
&global_value[offset2],
(
long
)
((
local_value[get_local_id
(
0
)
]
+
local_value[get_local_id
(
0
)
+TILE_SIZE]
)
*0x100000000
))
;
#
else
#
ifdef
USE_OUTPUT_BUFFER_PER_BLOCK
const
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
y*PADDED_NUM_ATOMS
;
const
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
x*PADDED_NUM_ATOMS
;
#
else
const
unsigned
int
offset1
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
const
unsigned
int
offset2
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#
endif
global_value[offset1]
+=
value
+
tempBuffer[get_local_id
(
0
)
+TILE_SIZE]
;
global_value[offset2]
+=
local_value[get_local_id
(
0
)
]
+
local_value[get_local_id
(
0
)
+TILE_SIZE]
;
#
endif
}
}
lasty
=
y
;
pos++
;
}
}
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
deleted
100644 → 0
View file @
f6d4557d
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#
define
TILE_SIZE
32
/**
*
Compute
a
value
based
on
pair
interactions.
*/
__kernel
void
computeN2Value
(
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
#
ifdef
SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
global_value,
#
else
__global
real*
restrict
global_value,
#
endif
__local
real*
restrict
local_value,
__local
real*
restrict
tempBuffer,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
#
else
unsigned
int
numTiles
#
endif
PARAMETER_ARGUMENTS
)
{
unsigned
int
totalWarps
=
get_global_size
(
0
)
/TILE_SIZE
;
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
warp*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/totalWarps
;
#
else
unsigned
int
pos
=
warp*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
#
endif
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
unsigned
int
exclusionRange[2*WARPS_PER_GROUP]
;
__local
int
exclusionIndex[WARPS_PER_GROUP]
;
__local
int2*
reservedBlocks
=
(
__local
int2*
)
exclusionRange
;
do
{
//
Extract
the
coordinates
of
this
tile
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
const
unsigned
int
localGroupIndex
=
get_local_id
(
0
)
/TILE_SIZE
;
unsigned
int
x,
y
;
real
value
=
0
;
if
(
pos
<
end
)
{
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
}
else {
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_value[get_local_id(0)] = 0;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real tempValue1 = 0;
real tempValue2 = 0;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
}
value += tempValue1;
}
tempBuffer[get_local_id(0)] = tempValue2;
// Sum the forces on atom2.
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
if (tgx == 0)
local_value[tbx+j] += tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
}
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) |
(
excl
<<
(
TILE_SIZE
-
tgx
))
;
#
endif
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
int
atom2
=
tbx+tj
;
real4
posq2
=
local_posq[atom2]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+tj
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#
ifdef
USE_EXCLUSIONS
if
(
!isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#
endif
COMPUTE_VALUE
}
value
+=
tempValue1
;
local_value[tbx+tj]
+=
tempValue2
;
#
ifdef
USE_CUTOFF
}
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
}
}
}
}
//
Write
results.
We
need
to
coordinate
between
warps
to
make
sure
no
two
of
them
//
ever
try
to
write
to
the
same
piece
of
memory
at
the
same
time.
#
ifdef
SUPPORTS_64_BIT_ATOMICS
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
;
atom_add
(
&global_value[offset],
(
long
)
(
value*0x100000000
))
;
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
;
atom_add
(
&global_value[offset],
(
long
)
(
local_value[get_local_id
(
0
)
]*0x100000000
))
;
}
#
else
int
writeX
=
(
pos
<
end
?
x
:
-1
)
;
int
writeY
=
(
pos
<
end
&&
x
!=
y
?
y
:
-1
)
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
writeX,
writeY
)
;
bool
done
=
false
;
int
doneIndex
=
0
;
int
checkIndex
=
0
;
while
(
true
)
{
//
See
if
any
warp
still
needs
to
write
its
data.
bool
allDone
=
true
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
while
(
doneIndex
<
WARPS_PER_GROUP
&&
allDone
)
{
if
(
reservedBlocks[doneIndex].x
!=
-1
)
allDone
=
false
;
else
doneIndex++
;
}
if
(
allDone
)
break
;
if
(
!done
)
{
//
See
whether
this
warp
can
write
its
data.
This
requires
that
no
previous
warp
//
is
trying
to
write
to
the
same
block
of
the
buffer.
bool
canWrite
=
(
writeX
!=
-1
)
;
while
(
checkIndex
<
localGroupIndex
&&
canWrite
)
{
if
((
reservedBlocks[checkIndex].x
==
x
|
| reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y |
|
reservedBlocks[checkIndex].y
==
y
)))
canWrite
=
false
;
else
checkIndex++
;
}
if
(
canWrite
)
{
//
Write
the
data
to
global
memory,
then
mark
this
warp
as
done.
if
(
writeX
>
-1
)
{
const
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
value
;
}
if
(
writeY
>
-1
)
{
const
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
local_value[get_local_id
(
0
)
]
;
}
done
=
true
;
if
(
tgx
==
0
)
reservedBlocks[localGroupIndex]
=
(
int2
)(
-1
,
-1
)
;
}
}
}
#
endif
lasty
=
y
;
pos++
;
}
while
(
pos
<
end
)
;
}
platforms/opencl/src/kernels/fft.cl
View file @
93c467b2
...
@@ -11,14 +11,14 @@ __kernel void execFFT(__global const real2* restrict in, __global real2* restric
...
@@ -11,14 +11,14 @@ __kernel void execFFT(__global const real2* restrict in, __global real2* restric
for
(
int
i
=
get_local_id
(
0
)
; i < ZSIZE; i += get_local_size(0))
for
(
int
i
=
get_local_id
(
0
)
; i < ZSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
cos
(
-sign*i*2*M_PI/ZSIZE
)
,
sin
(
-sign*i*2*M_PI/ZSIZE
))
;
w[i]
=
(
real2
)
(
cos
(
-sign*i*2*M_PI/ZSIZE
)
,
sin
(
-sign*i*2*M_PI/ZSIZE
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
index
=
get_group_id
(
0
)
; index < XSIZE*YSIZE; index += get_num_groups(0)) {
for
(
int
index
=
get_group_id
(
0
)
*BLOCKS_PER_GROUP+get_local_id
(
0
)
/ZSIZE
; index < XSIZE*YSIZE; index += get_num_groups(0)
*BLOCKS_PER_GROUP
) {
int
x
=
index/YSIZE
;
int
x
=
index/YSIZE
;
int
y
=
index-x*YSIZE
;
int
y
=
index-x*YSIZE
;
#
if
LOOP_REQUIRED
#
if
LOOP_REQUIRED
for
(
int
z
=
get_local_id
(
0
)
; z < ZSIZE; z += get_local_size(0))
for
(
int
z
=
get_local_id
(
0
)
; z < ZSIZE; z += get_local_size(0))
data0[z]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z]
;
data0[z]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z]
;
#
else
#
else
data0[get_local_id
(
0
)
]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
]
;
data0[get_local_id
(
0
)
]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE
]
;
#
endif
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
COMPUTE_FFT
COMPUTE_FFT
...
...
platforms/opencl/src/kernels/findInteractingBlocks.cl
View file @
93c467b2
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_byte_addressable_store
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_byte_addressable_store
:
enable
#
define
TILE_SIZE
32
#
define
GROUP_SIZE
64
#
define
BUFFER_GROUPS
4
#
define
BUFFER_SIZE
BUFFER_GROUPS*GROUP_SIZE
#
define
BUFFER_SIZE
BUFFER_GROUPS*GROUP_SIZE
#
define
WARP_SIZE
32
#
define
INVALID
0xFFFF
/**
/**
*
Find
a
bounding
box
for
the
atoms
in
each
block.
*
Find
a
bounding
box
for
the
atoms
in
each
block.
*/
*/
__kernel
void
findBlockBounds
(
int
numAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
posq,
__global
real4*
restrict
blockCenter,
__global
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount
)
{
__kernel
void
findBlockBounds
(
int
numAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
posq,
__global
real4*
restrict
blockCenter,
__global
real4*
restrict
blockBoundingBox,
__global
int*
restrict
rebuildNeighborList,
__global
real2*
restrict
sortedBlocks
)
{
int
index
=
get_global_id
(
0
)
;
int
index
=
get_global_id
(
0
)
;
int
base
=
index*TILE_SIZE
;
int
base
=
index*TILE_SIZE
;
while
(
base
<
numAtoms
)
{
while
(
base
<
numAtoms
)
{
real4
pos
=
posq[base]
;
real4
pos
=
posq[base]
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.xyz
-=
floor
(
pos.xyz*invPeriodicBoxSize.xyz
)
*periodicBoxSize.xyz
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
real4
firstPoint
=
pos
;
real4
firstPoint
=
pos
;
#
endif
#
endif
real4
minPos
=
pos
;
real4
minPos
=
pos
;
...
@@ -25,146 +24,229 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
...
@@ -25,146 +24,229 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
for
(
int
i
=
base+1
; i < last; i++) {
for
(
int
i
=
base+1
; i < last; i++) {
pos
=
posq[i]
;
pos
=
posq[i]
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
pos.x
-=
floor
((
pos.x-firstPoint.x
)
*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
pos.xyz
-=
floor
((
pos.xyz-firstPoint.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
pos.y
-=
floor
((
pos.y-firstPoint.y
)
*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
pos.z
-=
floor
((
pos.z-firstPoint.z
)
*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
#
endif
minPos
=
min
(
minPos,
pos
)
;
minPos
=
min
(
minPos,
pos
)
;
maxPos
=
max
(
maxPos,
pos
)
;
maxPos
=
max
(
maxPos,
pos
)
;
}
}
blockBoundingBox[index]
=
0.5f*
(
maxPos-minPos
)
;
real4
blockSize
=
0.5f*
(
maxPos-minPos
)
;
blockBoundingBox[index]
=
blockSize
;
blockCenter[index]
=
0.5f*
(
maxPos+minPos
)
;
blockCenter[index]
=
0.5f*
(
maxPos+minPos
)
;
sortedBlocks[index]
=
(
real2
)
(
blockSize.x+blockSize.y+blockSize.z,
index
)
;
index
+=
get_global_size
(
0
)
;
index
+=
get_global_size
(
0
)
;
base
=
index*TILE_SIZE
;
base
=
index*TILE_SIZE
;
}
}
if
(
get_global_id
(
0
)
==
0
)
if
(
get_global_id
(
0
)
==
0
)
interactionCoun
t[0]
=
0
;
rebuildNeighborLis
t[0]
=
0
;
}
}
/**
/**
*
This
is
called
by
findBlocksWithInteractions
()
.
It
compacts
the
list
of
blocks
and
writes
them
*
Sort
the
data
about
bounding
boxes
so
it
can
be
accessed
more
efficiently
in
the
next
kernel.
*
to
global
memory.
*/
*/
void
storeInteractionData
(
__local
ushort2*
buffer,
__local
int*
valid,
__local
short*
sum,
__local
ushort2*
temp,
__local
int*
baseIndex,
__kernel
void
sortBoxData
(
__global
const
real2*
restrict
sortedBlock,
__global
const
real4*
restrict
blockCenter,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
real
cutoffSquared,
real4
periodicBoxSize,
__global
const
real4*
restrict
blockBoundingBox,
__global
real4*
restrict
sortedBlockCenter,
real4
invPeriodicBoxSize,
__global
const
real4*
posq,
__global
const
real4*
blockCenter,
__global
const
real4*
blockBoundingBox,
unsigned
int
maxTiles
)
{
__global
real4*
restrict
sortedBlockBoundingBox,
__global
const
real4*
restrict
posq,
__global
const
real4*
restrict
oldPositions,
//
The
buffer
is
full,
so
we
need
to
compact
it
and
write
out
results.
Start
by
doing
a
parallel
prefix
sum.
__global
unsigned
int*
restrict
interactionCount,
__global
int*
restrict
rebuildNeighborList
)
{
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_BLOCKS; i += get_global_size(0)) {
int
index
=
(
int
)
sortedBlock[i].y
;
sortedBlockCenter[i]
=
blockCenter[index]
;
sortedBlockBoundingBox[i]
=
blockBoundingBox[index]
;
}
//
Also
check
whether
any
atom
has
moved
enough
so
that
we
really
need
to
rebuild
the
neighbor
list.
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += GROUP_SIZE)
bool
rebuild
=
false
;
temp[i].x
=
(
valid[i]
?
1
:
0
)
;
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
real4
delta
=
oldPositions[i]-posq[i]
;
if
(
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
>
0.25f*PADDING*PADDING
)
rebuild
=
true
;
}
if
(
rebuild
)
{
rebuildNeighborList[0]
=
1
;
interactionCount[0]
=
0
;
}
}
/**
*
Perform
a
parallel
prefix
sum
over
an
array.
The
input
values
are
all
assumed
to
be
0
or
1.
*/
void
prefixSum
(
__local
short*
sum,
__local
ushort2*
temp
)
{
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
temp[i].x
=
sum[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
whichBuffer
=
0
;
int
whichBuffer
=
0
;
for
(
int
offset
=
1
; offset < BUFFER_SIZE; offset *= 2) {
for
(
int
offset
=
1
; offset < BUFFER_SIZE; offset *= 2) {
if
(
whichBuffer
==
0
)
if
(
whichBuffer
==
0
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
GROUP_SIZE
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
get_local_size(0)
)
temp[i].y
=
(
i
<
offset
?
temp[i].x
:
temp[i].x+temp[i-offset].x
)
;
temp[i].y
=
(
i
<
offset
?
temp[i].x
:
temp[i].x+temp[i-offset].x
)
;
else
else
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
GROUP_SIZE
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
get_local_size(0)
)
temp[i].x
=
(
i
<
offset
?
temp[i].y
:
temp[i].y+temp[i-offset].y
)
;
temp[i].x
=
(
i
<
offset
?
temp[i].y
:
temp[i].y+temp[i-offset].y
)
;
whichBuffer
=
1-whichBuffer
;
whichBuffer
=
1-whichBuffer
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
if
(
whichBuffer
==
0
)
if
(
whichBuffer
==
0
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
GROUP_SIZE
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
get_local_size(0)
)
sum[i]
=
temp[i].x
;
sum[i]
=
temp[i].x
;
else
else
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
GROUP_SIZE
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
get_local_size(0)
)
sum[i]
=
temp[i].y
;
sum[i]
=
temp[i].y
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
numValid
=
sum[BUFFER_SIZE-1]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compact
the
buffer.
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += GROUP_SIZE)
/**
if
(
valid[i]
)
{
*
This
is
called
by
findBlocksWithInteractions
()
.
It
compacts
the
list
of
blocks,
identifies
interactions
temp[sum[i]-1]
=
buffer[i]
;
*
in
them,
and
writes
the
result
to
global
memory.
sum[i]
=
valid[i]
;
*/
valid[i]
=
false
;
void
storeInteractionData
(
unsigned
short
x,
__local
unsigned
short*
buffer,
__local
short*
sum,
__local
ushort2*
temp,
__local
int*
atoms,
__local
int*
numAtoms,
buffer[i]
=
(
ushort2
)
1
;
__local
int*
baseIndex,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
__global
unsigned
int*
interactingAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
posq,
__local
real4*
posBuffer,
real4
blockCenterX,
real4
blockSizeX,
unsigned
int
maxTiles,
bool
finish
)
{
const
bool
singlePeriodicCopy
=
(
0.5f*periodicBoxSize.x-blockSizeX.x
>=
PADDED_CUTOFF
&&
0.5f*periodicBoxSize.y-blockSizeX.y
>=
PADDED_CUTOFF
&&
0.5f*periodicBoxSize.z-blockSizeX.z
>=
PADDED_CUTOFF
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
real4
pos
=
posq[x*TILE_SIZE+get_local_id
(
0
)
]
;
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
pos.xyz
-=
floor
((
pos.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
}
}
#
endif
posBuffer[get_local_id
(
0
)
]
=
pos
;
}
//
The
buffer
is
full,
so
we
need
to
compact
it
and
write
out
results.
Start
by
doing
a
parallel
prefix
sum.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
sum[i]
=
(
buffer[i]
==
INVALID
?
0
:
1
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
prefixSum
(
sum,
temp
)
;
int
numValid
=
sum[BUFFER_SIZE-1]
;
#
ifndef
WARPS_ARE_ATOMIC
//
Compact
the
buffer.
//
Filter
the
list
of
tiles
by
comparing
the
distance
from
each
atom
to
the
other
bounding
box.
//
We
only
do
this
if
we
aren
't
already
optimizing
the
computation
using
flags.
int
index
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
int
group
=
get_local_id
(
0
)
/TILE_SIZE
;
if
(
buffer[i]
!=
INVALID
)
real4
center,
boxSize,
pos
;
temp[sum[i]-1].x
=
buffer[i]
;
for
(
int
tile
=
0
; tile < numValid; tile++) {
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
x
=
temp[tile].x
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
int
y
=
temp[tile].y
;
buffer[i]
=
temp[i].x
;
if
(
x
==
y
)
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
continue
;
//
Lo
ad
an
atom
position
and
the
bounding
box
the
other
block
.
//
Lo
op
over
the
tiles
and
find
specific
interactions
in
them
.
#
ifdef
MAC_AMD_WORKAROUND
const
int
indexInWarp
=
get_local_id
(
0
)
%WARP_SIZE
;
int
box
=
(
group
==
0
?
x
:
y
)
;
for
(
int
base
=
0
; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
int
atom
=
(
group
==
0
?
y
:
x
)
*TILE_SIZE+index
;
for
(
int
i
=
get_local_id
(
0
)
/WARP_SIZE
; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
__global
real*
bc
=
(
__global
real*
)
blockCenter
;
//
Check
each
atom
in
block
Y
for
interactions.
__global
real*
bb
=
(
__global
real*
)
blockBoundingBox
;
__global
real*
ps
=
(
__global
real*
)
posq
;
real4
pos
=
posq[buffer[base+i]*TILE_SIZE+indexInWarp]
;
center
=
(
real4
)
(
bc[4*box],
bc[4*box+1],
bc[4*box+2],
0
)
;
#
ifdef
USE_PERIODIC
boxSize
=
(
real4
)
(
bb[4*box],
bb[4*box+1],
bb[4*box+2],
0
)
;
if
(
singlePeriodicCopy
)
pos
=
(
real4
)
(
ps[4*atom],
ps[4*atom+1],
ps[4*atom+2],
0
)
;
pos.xyz
-=
floor
((
pos.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
else
center
=
blockCenter[
(
group
==
0
?
x
:
y
)
]
;
boxSize
=
blockBoundingBox[
(
group
==
0
?
x
:
y
)
]
;
pos
=
posq[
(
group
==
0
?
y
:
x
)
*TILE_SIZE+index]
;
#
endif
#
endif
bool
interacts
=
false
;
//
Find
the
distance
of
the
atom
from
the
bounding
box.
real4
delta
=
pos-center
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
if
(
!singlePeriodicCopy
)
{
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
for
(
int
j
=
0
; j < TILE_SIZE; j++) {
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
real4
delta
=
pos-posBuffer[j]
;
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
interacts
|= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
}
else {
#endif
#endif
delta
=
max
((
real4
)
0
,
fabs
(
delta
)
-boxSize
)
;
for (int j = 0; j < TILE_SIZE; j++) {
__local
ushort*
flag
=
(
__local
ushort*
)
&buffer[tile]
;
real4 delta = pos-posBuffer[j];
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
cutoffSquared
)
interacts |
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
)
;
flag[group]
=
false
;
}
#
ifdef
USE_PERIODIC
}
#
endif
sum[i*WARP_SIZE+indexInWarp]
=
(
interacts
?
1
:
0
)
;
}
for
(
int
i
=
numValid-base+get_local_id
(
0
)
/WARP_SIZE
; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
sum[i*WARP_SIZE+indexInWarp]
=
0
;
//
Compact
the
list
of
atoms.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
flag[0]
|
| flag[1]) {
prefixSum
(
sum,
temp
)
;
// This tile contains no interactions.
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
if
(
sum[i]
!=
(
i
==
0
?
0
:
sum[i-1]
))
atoms[*numAtoms+sum[i]-1]
=
buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp
;
//
Store
them
to
global
memory.
numValid--;
int
atomsToStore
=
*numAtoms+sum[BUFFER_SIZE-1]
;
bool
storePartialTile
=
(
finish
&&
base
>=
numValid-BUFFER_SIZE/WARP_SIZE
)
;
int
tilesToStore
=
(
storePartialTile
?
(
atomsToStore+TILE_SIZE-1
)
/TILE_SIZE
:
atomsToStore/TILE_SIZE
)
;
if
(
tilesToStore
>
0
)
{
if
(
get_local_id
(
0
)
==
0
)
*baseIndex
=
atom_add
(
interactionCount,
tilesToStore
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
==
0
)
*numAtoms
=
atomsToStore-tilesToStore*TILE_SIZE
;
if
(
*baseIndex+tilesToStore
<=
maxTiles
)
{
if
(
get_local_id
(
0
)
<
tilesToStore
)
interactingTiles[*baseIndex+get_local_id
(
0
)
]
=
(
ushort2
)
(
x,
singlePeriodicCopy
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < tilesToStore*TILE_SIZE; i += get_local_size(0))
interactingAtoms[*baseIndex*TILE_SIZE+i]
=
(
i
<
atomsToStore
?
atoms[i]
:
NUM_ATOMS
)
;
}
}
else
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
==
0
)
if
(
get_local_id
(
0
)
==
0
)
temp[tile] = temp[numValid];
*numAtoms
+=
sum[BUFFER_SIZE-1]
;
tile--;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
*numAtoms
&&
!storePartialTile
)
atoms[get_local_id
(
0
)
]
=
atoms[tilesToStore*TILE_SIZE+get_local_id
(
0
)
]
;
}
}
#endif
// Store it to global memory.
if
(
numValid
==
0
&&
*numAtoms
>
0
&&
finish
)
{
//
We
didn
't
have
any
more
tiles
to
process,
but
there
were
some
atoms
left
over
from
a
//
previous
call
to
this
function.
Save
them
now.
if (get_local_id(0) == 0)
if
(
get_local_id
(
0
)
==
0
)
*baseIndex = atom_add(interactionCount, numValid);
*baseIndex
=
atom_add
(
interactionCount,
1
)
;
barrier(CLK_LOCAL_MEM_FENCE);
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if (*baseIndex+numValid <= maxTiles)
if
(
*baseIndex
<
maxTiles
)
{
for (int i = get_local_id(0); i < numValid; i += GROUP_SIZE)
if
(
get_local_id
(
0
)
==
0
)
interactingTiles[*baseIndex+i] = temp[i];
interactingTiles[*baseIndex]
=
(
ushort2
)
(
x,
singlePeriodicCopy
)
;
barrier(CLK_LOCAL_MEM_FENCE);
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
interactingAtoms[*baseIndex*TILE_SIZE+get_local_id
(
0
)
]
=
(
get_local_id
(
0
)
<
*numAtoms
?
atoms[get_local_id
(
0
)
]
:
NUM_ATOMS
)
;
}
}
//
Reset
the
buffer
for
processing
more
tiles.
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
buffer[i]
=
INVALID
;
}
}
/**
/**
*
Compare
the
bounding
boxes
for
each
pair
of
blocks.
If
they
are
sufficiently
far
apart,
*
Compare
the
bounding
boxes
for
each
pair
of
blocks.
If
they
are
sufficiently
far
apart,
*
mark
them
as
non-interacting.
*
mark
them
as
non-interacting.
*/
*/
__kernel void findBlocksWithInteractions(
real cutoffSquared,
real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
__kernel
void
findBlocksWithInteractions
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount,
__global
ushort2*
restrict
interactingTiles,
__global
const
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount,
__global
ushort2*
restrict
interactingTiles,
__global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
__global
unsigned
int*
restrict
interactingAtoms,
__global
const
real4*
restrict
posq,
unsigned
int
maxTiles,
unsigned
int
startBlockIndex,
unsigned int endTileIndex) {
unsigned
int
numBlocks,
__global
real2*
restrict
sortedBlocks,
__global
const
real4*
restrict
sortedBlockCenter,
__global
const
real4*
restrict
sortedBlockBoundingBox,
__local ushort2 buffer[BUFFER_SIZE];
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__global
real4*
restrict
oldPositions,
__local int valid[BUFFER_SIZE];
__global
const
int*
restrict
rebuildNeighborList
)
{
__local
unsigned
short
buffer[BUFFER_SIZE]
;
__local
short
sum[BUFFER_SIZE]
;
__local
short
sum[BUFFER_SIZE]
;
__local
ushort2
temp[BUFFER_SIZE]
;
__local
ushort2
temp[BUFFER_SIZE]
;
__local
int
atoms[BUFFER_SIZE+TILE_SIZE]
;
__local
real4
posBuffer[TILE_SIZE]
;
__local
int
exclusionsForX[MAX_EXCLUSIONS]
;
__local
int
bufferFull
;
__local
int
bufferFull
;
__local
int
globalIndex
;
__local
int
globalIndex
;
__local
int
numAtoms
;
#
ifdef
AMD_ATOMIC_WORK_AROUND
#
ifdef
AMD_ATOMIC_WORK_AROUND
//
Do
a
byte
write
to
force
all
memory
accesses
to
interactionCount
to
use
the
complete
path.
//
Do
a
byte
write
to
force
all
memory
accesses
to
interactionCount
to
use
the
complete
path.
//
This
avoids
the
atomic
access
from
causing
all
word
accesses
to
other
buffers
from
using
the
slow
complete
path.
//
This
avoids
the
atomic
access
from
causing
all
word
accesses
to
other
buffers
from
using
the
slow
complete
path.
...
@@ -173,142 +255,79 @@ __kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSi
...
@@ -173,142 +255,79 @@ __kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSi
if
(
get_global_id
(
0
)
==
get_local_id
(
0
)
+1
)
if
(
get_global_id
(
0
)
==
get_local_id
(
0
)
+1
)
((
__global
char*
)
interactionCount
)
[sizeof
(
unsigned
int
)
+1]
=
0
;
((
__global
char*
)
interactionCount
)
[sizeof
(
unsigned
int
)
+1]
=
0
;
#
endif
#
endif
if
(
rebuildNeighborList[0]
==
0
)
return
; // The neighbor list doesn't need to be rebuilt.
int
valuesInBuffer
=
0
;
int
valuesInBuffer
=
0
;
if
(
get_local_id
(
0
)
==
0
)
if
(
get_local_id
(
0
)
==
0
)
bufferFull
=
false
;
bufferFull
=
false
;
for
(
int
i
=
0
; i < BUFFER_GROUPS; ++i)
for
(
int
i
=
0
; i < BUFFER_GROUPS; ++i)
valid
[i*GROUP_SIZE+get_local_id(0)] =
false
;
buffer
[i*GROUP_SIZE+get_local_id
(
0
)
]
=
INVALID
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for (int baseIndex = startTileIndex+get_group_id(0)*get_local_size(0); baseIndex < endTileIndex; baseIndex += get_global_size(0)) {
// Identify the pair of blocks to compare.
//
Loop
over
blocks
sorted
by
size.
int index = baseIndex+get_local_id(0);
for
(
int
i
=
startBlockIndex+get_group_id
(
0
)
; i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
if (index < endTileIndex) {
if
(
get_local_id
(
0
)
==
get_local_size
(
0
)
-1
)
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
numAtoms
=
0
;
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
real2
sortedKey
=
sortedBlocks[i]
;
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
unsigned
short
x
=
(
unsigned
short
)
sortedKey.y
;
y
+=
(
x
<
y
?
-1
:
1
)
;
real4
blockCenterX
=
blockCenter[x]
;
x
=
(
index-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
real4
blockSizeX
=
blockBoundingBox[x]
;
}
//
Find
the
distance
between
the
bounding
boxes
of
the
two
cells.
#
ifdef
MAC_AMD_WORKAROUND
//
Load
exclusion
data
for
block
x.
__global
real*
bc
=
(
__global
real*
)
blockCenter
;
__global
real*
bb
=
(
__global
real*
)
blockBoundingBox
;
const
int
exclusionStart
=
exclusionRowIndices[x]
;
real4
bcx
=
(
real4
)
(
bc[4*x],
bc[4*x+1],
bc[4*x+2],
0
)
;
const
int
exclusionEnd
=
exclusionRowIndices[x+1]
;
real4
bcy
=
(
real4
)
(
bc[4*y],
bc[4*y+1],
bc[4*y+2],
0
)
;
const
int
numExclusions
=
exclusionEnd-exclusionStart
;
real4
delta
=
bcx-bcy
;
for
(
int
j
=
get_local_id
(
0
)
; j < numExclusions; j += get_local_size(0))
real4
boxSizea
=
(
real4
)
(
bb[4*x],
bb[4*x+1],
bb[4*x+2],
0
)
;
exclusionsForX[j]
=
exclusionIndices[exclusionStart+j]
;
real4
boxSizeb
=
(
real4
)
(
bb[4*y],
bb[4*y+1],
bb[4*y+2],
0
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
else
real4
delta
=
blockCenter[x]-blockCenter[y]
;
//
Compare
it
to
other
blocks
after
this
one
in
sorted
order.
real4
boxSizea
=
blockBoundingBox[x]
;
real4
boxSizeb
=
blockBoundingBox[y]
;
for
(
int
base
=
i+1
; base < NUM_BLOCKS; base += get_local_size(0)) {
#
endif
int
j
=
base+get_local_id
(
0
)
;
real2
sortedKey2
=
(
j
<
NUM_BLOCKS
?
sortedBlocks[j]
:
(
real2
)
0
)
;
real4
blockCenterY
=
(
j
<
NUM_BLOCKS
?
sortedBlockCenter[j]
:
(
real4
)
0
)
;
real4
blockSizeY
=
(
j
<
NUM_BLOCKS
?
sortedBlockBoundingBox[j]
:
(
real4
)
0
)
;
unsigned
short
y
=
(
unsigned
short
)
sortedKey2.y
;
real4
delta
=
blockCenterX-blockCenterY
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
#
endif
delta.x
=
max
((
real
)
0
,
fabs
(
delta.x
)
-boxSizea.x-boxSizeb.x
)
;
delta.x
=
max
((
real
)
0
,
fabs
(
delta.x
)
-blockSizeX.x-blockSizeY.x
)
;
delta.y
=
max
((
real
)
0
,
fabs
(
delta.y
)
-boxSizea.y-boxSizeb.y
)
;
delta.y
=
max
((
real
)
0
,
fabs
(
delta.y
)
-blockSizeX.y-blockSizeY.y
)
;
delta.z
=
max
((
real
)
0
,
fabs
(
delta.z
)
-boxSizea.z-boxSizeb.z
)
;
delta.z
=
max
((
real
)
0
,
fabs
(
delta.z
)
-blockSizeX.z-blockSizeY.z
)
;
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
cutoffSquared
)
{
bool
hasExclusions
=
false
;
for
(
int
k
=
0
; k < numExclusions; k++)
hasExclusions
|
=
(
exclusionsForX[k]
==
y
)
;
if
(
j
<
NUM_BLOCKS
&&
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
&&
!hasExclusions
)
{
//
Add
this
tile
to
the
buffer.
//
Add
this
tile
to
the
buffer.
int
bufferIndex
=
valuesInBuffer*GROUP_SIZE+get_local_id
(
0
)
;
int
bufferIndex
=
valuesInBuffer*GROUP_SIZE+get_local_id
(
0
)
;
valid[bufferIndex]
=
true
;
buffer[bufferIndex]
=
y
;
buffer[bufferIndex]
=
(
ushort2
)
(
x,
y
)
;
valuesInBuffer++
;
valuesInBuffer++
;
if
(
!bufferFull
&&
valuesInBuffer
==
BUFFER_GROUPS
)
if
(
!bufferFull
&&
valuesInBuffer
==
BUFFER_GROUPS
)
bufferFull
=
true
;
bufferFull
=
true
;
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
bufferFull
)
{
storeInteractionData
(
buffer,
valid,
sum,
temp,
&globalIndex,
interactionCount,
interactingTiles,
cutoffSquared,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenter,
blockBoundingBox,
maxTiles
)
;
valuesInBuffer
=
0
;
if
(
get_local_id
(
0
)
==
0
)
bufferFull
=
false
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
bufferFull
)
{
}
storeInteractionData
(
x,
buffer,
sum,
temp,
atoms,
&numAtoms,
&globalIndex,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
posq,
posBuffer,
blockCenterX,
blockSizeX,
maxTiles,
false
)
;
storeInteractionData
(
buffer,
valid,
sum,
temp,
&globalIndex,
interactionCount,
interactingTiles,
cutoffSquared,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenter,
blockBoundingBox,
maxTiles
)
;
valuesInBuffer
=
0
;
}
if
(
get_local_id
(
0
)
==
0
)
bufferFull
=
false
;
/**
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
*
Compare
each
atom
in
one
block
to
the
bounding
box
of
another
block,
and
set
*
flags
for
which
ones
are
interacting.
*/
__kernel
void
findInteractionsWithinBlocks
(
real
cutoffSquared,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
posq,
__global
const
ushort2*
restrict
tiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionFlags,
__global
const
unsigned
int*
restrict
interactionCount,
__local
volatile
unsigned
int*
restrict
flags,
unsigned
int
maxTiles
)
{
unsigned
int
totalWarps
=
get_global_size
(
0
)
/TILE_SIZE
;
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
warp*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
index
=
get_local_id
(
0
)
&
(
TILE_SIZE
-
1
)
;
if
(
numTiles
>
maxTiles
)
return
;
unsigned
int
lasty
=
0xFFFFFFFF
;
real4
apos
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
ushort2
tileIndices
=
tiles[pos]
;
unsigned
int
x
=
tileIndices.x
;
unsigned
int
y
=
tileIndices.y
;
if
(
x
==
y
)
{
if
(
index
==
0
)
interactionFlags[pos]
=
0xFFFFFFFF
;
}
else
{
//
Load
the
bounding
box
for
x
and
the
atom
positions
for
y.
real4
center
=
blockCenter[x]
;
real4
boxSize
=
blockBoundingBox[x]
;
if
(
y
!=
lasty
)
apos
=
posq[y*TILE_SIZE+index]
;
//
Find
the
distance
of
the
atom
from
the
bounding
box.
real4
delta
=
apos-center
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
delta
=
max
((
real4
)
0
,
fabs
(
delta
)
-boxSize
)
;
int
thread
=
get_local_id
(
0
)
;
flags[thread]
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
>
cutoffSquared
?
0
:
1
<<
index
)
;
//
Sum
the
flags.
#
ifdef
WARPS_ARE_ATOMIC
if
(
index
%
4
==
0
)
flags[thread]
+=
flags[thread+1]+flags[thread+2]+flags[thread+3]
;
#
else
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
index
%
4
==
0
)
flags[thread]
+=
flags[thread+1]+flags[thread+2]+flags[thread+3]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
if
(
index
==
0
)
{
unsigned
int
allFlags
=
flags[thread]+flags[thread+4]+flags[thread+8]+flags[thread+12]+flags[thread+16]+flags[thread+20]+flags[thread+24]+flags[thread+28]
;
//
Count
how
many
flags
are
set,
and
based
on
that
decide
whether
to
compute
all
interactions
//
or
only
a
fraction
of
them.
unsigned
int
bits
=
(
allFlags&0x55555555
)
+
((
allFlags>>1
)
&0x55555555
)
;
bits
=
(
bits&0x33333333
)
+
((
bits>>2
)
&0x33333333
)
;
bits
=
(
bits&0x0F0F0F0F
)
+
((
bits>>4
)
&0x0F0F0F0F
)
;
bits
=
(
bits&0x00FF00FF
)
+
((
bits>>8
)
&0x00FF00FF
)
;
bits
=
(
bits&0x0000FFFF
)
+
((
bits>>16
)
&0x0000FFFF
)
;
interactionFlags[pos]
=
(
bits
>
12
?
0xFFFFFFFF
:
allFlags
)
;
}
}
lasty
=
y
;
}
}
pos++
;
storeInteractionData
(
x,
buffer,
sum,
temp,
atoms,
&numAtoms,
&globalIndex,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
posq,
posBuffer,
blockCenterX,
blockSizeX,
maxTiles,
true
)
;
}
}
//
Record
the
positions
the
neighbor
list
is
based
on.
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0))
oldPositions[i]
=
posq[i]
;
}
}
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
View file @
93c467b2
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_byte_addressable_store
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_byte_addressable_store
:
enable
#
define
TILE_SIZE
32
#
define
GROUP_SIZE
64
#
define
BUFFER_GROUPS
4
#
define
BUFFER_SIZE
BUFFER_GROUPS*GROUP_SIZE
#
define
BUFFER_SIZE
BUFFER_GROUPS*GROUP_SIZE
/**
/**
*
Find
a
bounding
box
for
the
atoms
in
each
block.
*
Find
a
bounding
box
for
the
atoms
in
each
block.
*/
*/
__kernel
void
findBlockBounds
(
int
numAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
posq,
__global
real4*
restrict
blockCenter,
__global
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount
)
{
__kernel
void
findBlockBounds
(
int
numAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
posq,
__global
real4*
restrict
blockCenter,
__global
real4*
restrict
blockBoundingBox,
__global
int*
restrict
rebuildNeighborList,
__global
real2*
restrict
sortedBlocks
)
{
int
index
=
get_global_id
(
0
)
;
int
index
=
get_global_id
(
0
)
;
int
base
=
index*TILE_SIZE
;
int
base
=
index*TILE_SIZE
;
while
(
base
<
numAtoms
)
{
while
(
base
<
numAtoms
)
{
real4
pos
=
posq[base]
;
real4
pos
=
posq[base]
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.xyz
-=
floor
(
pos.xyz*invPeriodicBoxSize.xyz
)
*periodicBoxSize.xyz
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
real4
firstPoint
=
pos
;
real4
firstPoint
=
pos
;
#
endif
#
endif
real4
minPos
=
pos
;
real4
minPos
=
pos
;
...
@@ -25,143 +22,211 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
...
@@ -25,143 +22,211 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
for
(
int
i
=
base+1
; i < last; i++) {
for
(
int
i
=
base+1
; i < last; i++) {
pos
=
posq[i]
;
pos
=
posq[i]
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
pos.x
-=
floor
((
pos.x-firstPoint.x
)
*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
pos.xyz
-=
floor
((
pos.xyz-firstPoint.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
pos.y
-=
floor
((
pos.y-firstPoint.y
)
*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
pos.z
-=
floor
((
pos.z-firstPoint.z
)
*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
#
endif
minPos
=
min
(
minPos,
pos
)
;
minPos
=
min
(
minPos,
pos
)
;
maxPos
=
max
(
maxPos,
pos
)
;
maxPos
=
max
(
maxPos,
pos
)
;
}
}
blockBoundingBox[index]
=
0.5f*
(
maxPos-minPos
)
;
real4
blockSize
=
0.5f*
(
maxPos-minPos
)
;
blockBoundingBox[index]
=
blockSize
;
blockCenter[index]
=
0.5f*
(
maxPos+minPos
)
;
blockCenter[index]
=
0.5f*
(
maxPos+minPos
)
;
sortedBlocks[index]
=
(
real2
)
(
blockSize.x+blockSize.y+blockSize.z,
index
)
;
index
+=
get_global_size
(
0
)
;
index
+=
get_global_size
(
0
)
;
base
=
index*TILE_SIZE
;
base
=
index*TILE_SIZE
;
}
}
if
(
get_global_id
(
0
)
==
0
)
if
(
get_global_id
(
0
)
==
0
)
rebuildNeighborList[0]
=
0
;
}
/**
*
Sort
the
data
about
bounding
boxes
so
it
can
be
accessed
more
efficiently
in
the
next
kernel.
*/
__kernel
void
sortBoxData
(
__global
const
real2*
restrict
sortedBlock,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockBoundingBox,
__global
real4*
restrict
sortedBlockCenter,
__global
real4*
restrict
sortedBlockBoundingBox,
__global
const
real4*
restrict
posq,
__global
const
real4*
restrict
oldPositions,
__global
unsigned
int*
restrict
interactionCount,
__global
int*
restrict
rebuildNeighborList
)
{
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_BLOCKS; i += get_global_size(0)) {
int
index
=
(
int
)
sortedBlock[i].y
;
sortedBlockCenter[i]
=
blockCenter[index]
;
sortedBlockBoundingBox[i]
=
blockBoundingBox[index]
;
}
//
Also
check
whether
any
atom
has
moved
enough
so
that
we
really
need
to
rebuild
the
neighbor
list.
bool
rebuild
=
false
;
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
real4
delta
=
oldPositions[i]-posq[i]
;
if
(
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
>
0.25f*PADDING*PADDING
)
rebuild
=
true
;
}
if
(
rebuild
)
{
rebuildNeighborList[0]
=
1
;
interactionCount[0]
=
0
;
interactionCount[0]
=
0
;
}
}
}
/**
/**
*
This
is
called
by
findBlocksWithInteractions
()
.
It
compacts
the
list
of
blocks
and
writes
them
*
This
is
called
by
findBlocksWithInteractions
()
.
It
compacts
the
list
of
blocks
and
writes
them
*
to
global
memory.
*
to
global
memory.
*/
*/
void
storeInteractionData
(
ushort2*
buffer,
int
numValid,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
void
storeInteractionData
(
unsigned
short
x,
unsigned
short*
buffer,
int*
atoms,
int*
numAtoms,
int
numValid,
__global
unsigned
int*
interactionCount,
__global
unsigned
int*
interactionFlags,
real
cutoffSquared,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
ushort2*
interactingTiles,
__global
unsigned
int*
interactingAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
real4*
posq,
__global
real4*
blockCenter,
__global
real4*
blockBoundingBox,
unsigned
int
maxTiles
)
{
__global
real4*
posq,
real4
blockCenterX,
real4
blockSizeX,
unsigned
int
maxTiles,
bool
finish
)
{
//
Filter
the
list
of
tiles
by
comparing
the
distance
from
each
atom
to
the
other
bounding
box.
real4
posBuffer[TILE_SIZE]
;
const
bool
singlePeriodicCopy
=
(
0.5f*periodicBoxSize.x-blockSizeX.x
>=
PADDED_CUTOFF
&&
unsigned
int
flagsBuffer[2*BUFFER_SIZE]
;
0.5f*periodicBoxSize.y-blockSizeX.y
>=
PADDED_CUTOFF
&&
real4
atomPositions[TILE_SIZE]
;
0.5f*periodicBoxSize.z-blockSizeX.z
>=
PADDED_CUTOFF
)
;
int
lasty
=
-1
;
for
(
int
i
=
0
; i < TILE_SIZE; i++) {
real4
centery,
boxSizey
;
real4
pos
=
posq[x*TILE_SIZE+i]
;
for
(
int
tile
=
0
; tile < numValid; ) {
#
ifdef
USE_PERIODIC
int
x
=
buffer[tile].x
;
if
(
singlePeriodicCopy
)
{
int
y
=
buffer[tile].y
;
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
if
(
x
==
y
)
{
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
tile++
;
continue
;
pos.xyz
-=
floor
((
pos.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
}
//
Load
the
atom
positions
and
bounding
boxes.
real4
centerx
=
blockCenter[x]
;
real4
boxSizex
=
blockBoundingBox[x]
;
if
(
y
!=
lasty
)
{
for
(
int
atom
=
0
; atom < TILE_SIZE; atom++)
atomPositions[atom]
=
posq[y*TILE_SIZE+atom]
;
centery
=
blockCenter[y]
;
boxSizey
=
blockBoundingBox[y]
;
lasty
=
y
;
}
}
#
endif
posBuffer[i]
=
pos
;
}
//
Find
the
distance
of
each
atom
from
the
bounding
box
.
//
Loop
over
the
tiles
and
find
specific
interactions
in
them
.
unsigned
int
flags1
=
0
,
flags2
=
0
;
for
(
int
tile
=
0
; tile < numValid; tile++) {
for
(
int
atom
=
0
; atom < TILE_SIZE; atom++) {
for
(
int
indexInTile
=
0
; indexInTile < TILE_SIZE; indexInTile++) {
real4
delta
=
atomPositions[atom]-centerx
;
//
Check
each
atom
in
block
Y
for
interactions.
int
atom
=
buffer[tile]*TILE_SIZE+indexInTile
;
real4
pos
=
posq[atom]
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
if
(
singlePeriodicCopy
)
pos.xyz
-=
floor
((
pos.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
#
endif
delta
=
max
((
real4
)
0
,
fabs
(
delta
)
-boxSizex
)
;
bool
interacts
=
false
;
if
(
dot
(
delta.xyz,
delta.xyz
)
<
cutoffSquared
)
flags1
+=
1
<<
atom
;
delta
=
posq[x*TILE_SIZE+atom]-centery
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
if
(
!singlePeriodicCopy
)
{
for
(
int
j
=
0
; j < TILE_SIZE && !interacts; j++) {
real4
delta
=
pos-posBuffer[j]
;
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
interacts
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
)
;
}
}
else
{
#
endif
#
endif
delta
=
max
((
real4
)
0
,
fabs
(
delta
)
-boxSizey
)
;
for
(
int
j
=
0
; j < TILE_SIZE && !interacts; j++) {
if
(
dot
(
delta.xyz,
delta.xyz
)
<
cutoffSquared
)
real4
delta
=
pos-posBuffer[j]
;
flags2
+=
1
<<
atom
;
interacts
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
)
;
}
}
if
(
flags1
==
0
|
| flags2 == 0) {
#
ifdef
USE_PERIODIC
// This tile contains no interactions.
}
#
endif
numValid--;
if
(
interacts
)
buffer[tile] = buffer[numValid];
atoms[
(
*numAtoms
)
++]
=
atom
;
}
if
(
*numAtoms
==
BUFFER_SIZE
)
{
else {
//
The
atoms
buffer
is
full,
so
store
it
to
global
memory.
flagsBuffer[2*tile] = flags1;
flagsBuffer[2*tile+1] = flags2;
int
tilesToStore
=
BUFFER_SIZE/TILE_SIZE
;
tile++;
int
baseIndex
=
atom_add
(
interactionCount,
tilesToStore
)
;
if
(
baseIndex+tilesToStore
<=
maxTiles
)
{
for
(
int
i
=
0
; i < tilesToStore; i++) {
interactingTiles[baseIndex+i]
=
(
ushort2
)
(
x,
singlePeriodicCopy
)
;
for
(
int
j
=
0
; j < TILE_SIZE; j++)
interactingAtoms[
(
baseIndex+i
)
*TILE_SIZE+j]
=
atoms[i*TILE_SIZE+j]
;
}
}
*numAtoms
=
0
;
}
}
}
}
}
// Store it to global memory.
if
(
*numAtoms
>
0
&&
finish
)
{
//
There
are
some
leftover
atoms,
so
save
them
now.
int baseIndex = atom_add(interactionCount, numValid);
if (baseIndex+numValid <= maxTiles)
int
tilesToStore
=
(
*numAtoms+TILE_SIZE-1
)
/TILE_SIZE
;
for (int i = 0; i < numValid; i++) {
int
baseIndex
=
atom_add
(
interactionCount,
tilesToStore
)
;
interactingTiles[baseIndex+i] = buffer[i];
if
(
baseIndex+tilesToStore
<=
maxTiles
)
{
interactionFlags[2*(baseIndex+i)] = flagsBuffer[2*i];
for
(
int
i
=
0
; i < tilesToStore; i++) {
interactionFlags[2*(baseIndex+i)+1] = flagsBuffer[2*i+1];
interactingTiles[baseIndex+i]
=
(
ushort2
)
(
x,
singlePeriodicCopy
)
;
for
(
int
j
=
0
; j < TILE_SIZE; j++) {
int
index
=
i*TILE_SIZE+j
;
interactingAtoms[
(
baseIndex+i
)
*TILE_SIZE+j]
=
(
index
<
*numAtoms
?
atoms[index]
:
NUM_ATOMS
)
;
}
}
}
}
}
}
}
/**
/**
*
Compare
the
bounding
boxes
for
each
pair
of
blocks.
If
they
are
sufficiently
far
apart,
*
Compare
the
bounding
boxes
for
each
pair
of
blocks.
If
they
are
sufficiently
far
apart,
*
mark
them
as
non-interacting.
*
mark
them
as
non-interacting.
*/
*/
__kernel void findBlocksWithInteractions(
real cutoffSquared,
real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
__kernel
void
findBlocksWithInteractions
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount,
__global
ushort2*
restrict
interactingTiles,
__global
const
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount,
__global
ushort2*
restrict
interactingTiles,
__global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
__global
unsigned
int*
restrict
interactingAtoms,
__global
const
real4*
restrict
posq,
unsigned
int
maxTiles,
unsigned
int
startBlockIndex,
unsigned int endTileIndex) {
unsigned
int
numBlocks,
__global
real2*
restrict
sortedBlocks,
__global
const
real4*
restrict
sortedBlockCenter,
__global
const
real4*
restrict
sortedBlockBoundingBox,
ushort2 buffer[BUFFER_SIZE];
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__global
real4*
restrict
oldPositions,
int valuesInBuffer = 0;
__global
const
int*
restrict
rebuildNeighborList
)
{
const int numTiles = endTileIndex-startTileIndex;
if
(
rebuildNeighborList[0]
==
0
)
unsigned int start = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
return
; // The neighbor list doesn't need to be rebuilt.
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
unsigned
short
buffer[BUFFER_SIZE]
;
for (int index = start; index < end; index++) {
int
atoms[BUFFER_SIZE]
;
// Identify the pair of blocks to compare.
int
exclusionsForX[MAX_EXCLUSIONS]
;
int
valuesInBuffer
;
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
int
numAtoms
;
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
//
Loop
over
blocks
sorted
by
size.
y
+=
(
x
<
y
?
-1
:
1
)
;
x
=
(
index-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
for
(
int
i
=
startBlockIndex+get_group_id
(
0
)
; i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
}
valuesInBuffer
=
0
;
numAtoms
=
0
;
//
Find
the
distance
between
the
bounding
boxes
of
the
two
cells.
real2
sortedKey
=
sortedBlocks[i]
;
unsigned
short
x
=
(
unsigned
short
)
sortedKey.y
;
real4
blockCenterX
=
blockCenter[x]
;
real4
blockSizeX
=
blockBoundingBox[x]
;
real4
delta
=
blockCenter[x]-blockCenter[y]
;
//
Load
exclusion
data
for
block
x.
const
int
exclusionStart
=
exclusionRowIndices[x]
;
const
int
exclusionEnd
=
exclusionRowIndices[x+1]
;
const
int
numExclusions
=
exclusionEnd-exclusionStart
;
for
(
int
j
=
0
; j < numExclusions; j++)
exclusionsForX[j]
=
exclusionIndices[exclusionStart+j]
;
//
Compare
it
to
other
blocks
after
this
one
in
sorted
order.
for
(
int
j
=
i+1
; j < NUM_BLOCKS; j++) {
real2
sortedKey2
=
sortedBlocks[j]
;
unsigned
short
y
=
(
unsigned
short
)
sortedKey2.y
;
bool
hasExclusions
=
false
;
for
(
int
k
=
0
; k < numExclusions; k++)
hasExclusions
|
=
(
exclusionsForX[k]
==
y
)
;
if
(
hasExclusions
)
continue
;
real4
blockCenterY
=
sortedBlockCenter[j]
;
real4
blockSizeY
=
sortedBlockBoundingBox[j]
;
real4
delta
=
blockCenterX-blockCenterY
;
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
#
endif
real4
boxSizea
=
blockBoundingBox[x]
;
delta.x
=
max
((
real
)
0
,
fabs
(
delta.x
)
-blockSizeX.x-blockSizeY.x
)
;
real4
boxSizeb
=
blockBoundingBox[y]
;
delta.y
=
max
((
real
)
0
,
fabs
(
delta.y
)
-blockSizeX.y-blockSizeY.y
)
;
delta.x
=
max
((
real
)
0
,
fabs
(
delta.x
)
-boxSizea.x-boxSizeb.x
)
;
delta.z
=
max
((
real
)
0
,
fabs
(
delta.z
)
-blockSizeX.z-blockSizeY.z
)
;
delta.y
=
max
((
real
)
0
,
fabs
(
delta.y
)
-boxSizea.y-boxSizeb.y
)
;
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
)
{
delta.z
=
max
((
real
)
0
,
fabs
(
delta.z
)
-boxSizea.z-boxSizeb.z
)
;
//
Add
this
tile
to
the
buffer.
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
cutoffSquared
)
{
//
Add
this
tile
to
the
buffer.
buffer[valuesInBuffer++]
=
(
ushort2
)
(
x,
y
)
;
buffer[valuesInBuffer++]
=
y
;
if
(
valuesInBuffer
==
BUFFER_SIZE
)
{
if
(
valuesInBuffer
==
BUFFER_SIZE
)
{
storeInteractionData
(
buffer,
valuesInBuffer,
interactionCount,
interactingTiles,
interactionFlags,
cutoffSquared,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenter,
blockBoundingBox,
maxTiles
)
;
storeInteractionData
(
x,
buffer,
atoms,
&numAtoms,
valuesInBuffer,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenterX,
blockSizeX,
maxTiles,
false
)
;
valuesInBuffer
=
0
;
valuesInBuffer
=
0
;
}
}
}
}
}
storeInteractionData
(
x,
buffer,
atoms,
&numAtoms,
valuesInBuffer,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenterX,
blockSizeX,
maxTiles,
true
)
;
}
}
storeInteractionData
(
buffer,
valuesInBuffer,
interactionCount,
interactingTiles,
interactionFlags,
cutoffSquared,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenter,
blockBoundingBox,
maxTiles
)
;
//
Record
the
positions
the
neighbor
list
is
based
on.
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0))
oldPositions[i]
=
posq[i]
;
}
}
platforms/opencl/src/kernels/gbsaObc.cl
0 → 100644
View file @
93c467b2
// The fixed-point accumulation path in this file uses 64-bit atom_add(), which
// requires the cl_khr_int64_base_atomics extension. It is only enabled when
// the build defines SUPPORTS_64_BIT_ATOMICS.
#if defined(SUPPORTS_64_BIT_ATOMICS)
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif

// Number of TILE_SIZE-wide warps contained in one force work group.
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
/**
 * Per-atom scratch record kept in local memory by computeBornSum.
 * Each work item stages one "atom 2" of the current tile here so that the
 * other work items of the same warp can read it.
 */
typedef struct {
    real x;             // position, copied from posq.x
    real y;             // position, copied from posq.y
    real z;             // position, copied from posq.z
    real q;             // copied from posq.w
    float radius;       // copied from global_params.x
    float scaledRadius; // copied from global_params.y
    real bornSum;       // Born sum contribution accumulated for this atom within a tile
} AtomData1;
/**
 * Compute the Born sum.
 *
 * The work is split between warps (groups of TILE_SIZE consecutive work items).
 * Each warp processes a contiguous range of tiles in two passes:
 *
 *   1. Tiles listed in exclusionTiles (between FIRST_EXCLUSION_TILE and
 *      LAST_EXCLUSION_TILE).  Atom indices are derived from the tile's (x, y)
 *      block indices.
 *   2. All remaining tiles.  With USE_CUTOFF these come from the neighbor list
 *      (tiles / interactingAtoms), unless the list overflowed
 *      (interactionCount[0] > maxTiles), in which case every one of the
 *      NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles is enumerated and the ones that appear
 *      in exclusionTiles are skipped because pass 1 already handled them.
 *
 * @param global_bornSum      output accumulator.  With SUPPORTS_64_BIT_ATOMICS it is a
 *                            fixed-point buffer (values scaled by 0x100000000) updated with
 *                            atom_add; otherwise it holds one PADDED_NUM_ATOMS-sized slice
 *                            per warp, indexed by warp*PADDED_NUM_ATOMS + atom.
 * @param posq                per-atom data; xyz is the position, w is copied to the q field
 * @param global_params       per-atom parameters; x is used as the radius, y as the scaled radius
 * @param tiles               (USE_CUTOFF) neighbor list tiles; x is the block index of "atom 1",
 *                            y is used as the singlePeriodicCopy flag
 * @param interactionCount    (USE_CUTOFF) element 0 is the number of tiles in the neighbor list
 * @param periodicBoxSize     (USE_CUTOFF) periodic box dimensions
 * @param invPeriodicBoxSize  (USE_CUTOFF) reciprocal of the periodic box dimensions
 * @param maxTiles            (USE_CUTOFF) capacity of the neighbor list
 * @param blockCenter         (USE_CUTOFF) center of each atom block, used to wrap positions
 *                            when a tile is flagged singlePeriodicCopy
 * @param interactingAtoms    (USE_CUTOFF) for neighbor list tile p, entries
 *                            [p*TILE_SIZE, (p+1)*TILE_SIZE) give the indices of the "atom 2" set
 * @param numTiles            (no cutoff) total number of tiles to enumerate
 * @param exclusionTiles      (x, y) block indices of the tiles that contain exclusions
 */
__kernel void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict global_bornSum,
#else
        __global real* restrict global_bornSum,
#endif
        __global const real4* restrict posq, __global const float2* restrict global_params,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms,
#else
        unsigned int numTiles,
#endif
        __global const ushort2* exclusionTiles) {
    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    const unsigned int warp = get_global_id(0)/TILE_SIZE;
    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index of this work item within its warp
    const unsigned int tbx = get_local_id(0) - tgx;           // local index of the first work item of this warp
    __local AtomData1 localData[FORCE_WORK_GROUP_SIZE];

    // First loop: process tiles that contain exclusions.

    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real bornSum = 0.0f;
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        float2 params1 = global_params[atom1];
        if (x == y) {
            // This tile is on the diagonal.

            localData[get_local_id(0)].x = posq1.x;
            localData[get_local_id(0)].y = posq1.y;
            localData[get_local_id(0)].z = posq1.z;
            localData[get_local_id(0)].q = posq1.w;
            localData[get_local_id(0)].radius = params1.x;
            localData[get_local_id(0)].scaledRadius = params1.y;
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0);
#ifdef USE_PERIODIC
                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
                    real rScaledRadiusJ = r+params2.y;
                    // j != tgx skips the interaction of an atom with itself.
                    if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
                        real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                        real u_ij = RECIP(rScaledRadiusJ);
                        real l_ij2 = l_ij*l_ij;
                        real u_ij2 = u_ij*u_ij;
                        real ratio = LOG(u_ij * RECIP(l_ij));
                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                         (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                    }
                }
                SYNC_WARPS;
            }
        }
        else {
            // This is an off-diagonal tile.

            unsigned int j = y*TILE_SIZE + tgx;
            real4 tempPosq = posq[j];
            localData[get_local_id(0)].x = tempPosq.x;
            localData[get_local_id(0)].y = tempPosq.y;
            localData[get_local_id(0)].z = tempPosq.z;
            localData[get_local_id(0)].q = tempPosq.w;
            float2 tempParams = global_params[j];
            localData[get_local_id(0)].radius = tempParams.x;
            localData[get_local_id(0)].scaledRadius = tempParams.y;
            localData[get_local_id(0)].bornSum = 0.0f;
            SYNC_WARPS;

            // Compute the full set of interactions in this tile.  Each work item starts at a
            // different local atom (tj = tgx) and advances cyclically, so that at any step no
            // two work items of the warp update the same localData entry.

            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
#ifdef USE_PERIODIC
                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                    real rScaledRadiusJ = r+params2.y;
                    if (params1.x < rScaledRadiusJ) {
                        // Contribution of atom 2 to the Born sum of atom 1.
                        real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                        real u_ij = RECIP(rScaledRadiusJ);
                        real l_ij2 = l_ij*l_ij;
                        real u_ij2 = u_ij*u_ij;
                        real ratio = LOG(u_ij * RECIP(l_ij));
                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                         (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                    }
                    real rScaledRadiusI = r+params1.y;
                    if (params2.x < rScaledRadiusI) {
                        // Contribution of atom 1 to the Born sum of atom 2.
                        real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                        real u_ij = RECIP(rScaledRadiusI);
                        real l_ij2 = l_ij*l_ij;
                        real u_ij2 = u_ij*u_ij;
                        real ratio = LOG(u_ij * RECIP(l_ij));
                        real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                         (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                        term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                        localData[tbx+tj].bornSum += term;
                    }
                }
                tj = (tj + 1) & (TILE_SIZE - 1);
                SYNC_WARPS;
            }
        }

        // Write results.

#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
        atom_add(&global_bornSum[offset], (long) (bornSum*0x100000000));
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
            atom_add(&global_bornSum[offset], (long) (localData[get_local_id(0)].bornSum*0x100000000));
        }
#else
        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        global_bornSum[offset1] += bornSum;
        if (x != y)
            global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

    // The partition of tiles between warps is computed in 64-bit arithmetic: warp*tileCount
    // can exceed the range of a 32-bit integer for large systems, which would otherwise
    // produce wrong [pos, end) ranges.
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    const long tilesToProcess = (numTiles > maxTiles ? NUM_BLOCKS*((long) NUM_BLOCKS+1)/2 : (long) numTiles);
#else
    const long tilesToProcess = (long) numTiles;
#endif
    int pos = (int) (warp*tilesToProcess/totalWarps);
    int end = (int) ((warp+1)*tilesToProcess/totalWarps);
    int skipBase = 0;
    int currentSkipIndex = tbx;
    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
    __local int skipTiles[FORCE_WORK_GROUP_SIZE];
    skipTiles[get_local_id(0)] = -1;

    while (pos < end) {
        real bornSum = 0;
        bool includeTile = true;

        // Extract the coordinates of this tile.

        unsigned int x, y;
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            // The neighbor list is valid: y is not needed because the atom 2 indices are
            // read from interactingAtoms.
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            singlePeriodicCopy = tileIndices.y;
        }
        else
#endif
        {
            // Convert the linear tile index into (x, y) block indices.
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }

            // Skip over tiles that have exclusions, since they were already processed.

            SYNC_WARPS;
            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
                SYNC_WARPS;
                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                    ushort2 tile = exclusionTiles[skipBase+tgx];
                    skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
                else
                    skipTiles[get_local_id(0)] = end;
                skipBase += TILE_SIZE;
                currentSkipIndex = tbx;
                SYNC_WARPS;
            }
            while (skipTiles[currentSkipIndex] < pos)
                currentSkipIndex++;
            includeTile = (skipTiles[currentSkipIndex] != pos);
        }
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

            // Load atom data for this tile.

            real4 posq1 = posq[atom1];
            float2 params1 = global_params[atom1];
#ifdef USE_CUTOFF
            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
            atomIndices[get_local_id(0)] = j;
            if (j < PADDED_NUM_ATOMS) {
                real4 tempPosq = posq[j];
                localData[get_local_id(0)].x = tempPosq.x;
                localData[get_local_id(0)].y = tempPosq.y;
                localData[get_local_id(0)].z = tempPosq.z;
                localData[get_local_id(0)].q = tempPosq.w;
                float2 tempParams = global_params[j];
                localData[get_local_id(0)].radius = tempParams.x;
                localData[get_local_id(0)].scaledRadius = tempParams.y;
                localData[get_local_id(0)].bornSum = 0.0f;
            }
            SYNC_WARPS;
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
                localData[get_local_id(0)].x -= floor((localData[get_local_id(0)].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                localData[get_local_id(0)].y -= floor((localData[get_local_id(0)].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                localData[get_local_id(0)].z -= floor((localData[get_local_id(0)].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                        real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                            real u_ij = RECIP(rScaledRadiusJ);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                        }
                        real rScaledRadiusI = r+params1.y;
                        if (params2.x < rScaledRadiusI) {
                            real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                            real u_ij = RECIP(rScaledRadiusI);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                            localData[tbx+tj].bornSum += term;
                        }
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
#ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    int atom2 = atomIndices[tbx+tj];
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                        real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                            real u_ij = RECIP(rScaledRadiusJ);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                        }
                        real rScaledRadiusI = r+params1.y;
                        if (params2.x < rScaledRadiusI) {
                            real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                            real u_ij = RECIP(rScaledRadiusI);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                            localData[tbx+tj].bornSum += term;
                        }
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }

            // Write results.

#ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[get_local_id(0)];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
            atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
            if (atom2 < PADDED_NUM_ATOMS)
                atom_add(&global_bornSum[atom2], (long) (localData[get_local_id(0)].bornSum*0x100000000));
#else
            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
            global_bornSum[offset1] += bornSum;
            if (atom2 < PADDED_NUM_ATOMS)
                global_bornSum[offset2] += localData[get_local_id(0)].bornSum;
#endif
        }
        pos++;
    }
}
/**
 * Per-atom scratch record kept in local memory by computeGBSAForce1.
 * Holds the staged data of one "atom 2" of the current tile together with
 * the force contributions accumulated for it while the tile is processed.
 */
typedef struct {
    real x;          // position, copied from posq.x
    real y;          // position, copied from posq.y
    real z;          // position, copied from posq.z
    real q;          // copied from posq.w
    real fx;         // accumulated force, x component
    real fy;         // accumulated force, y component
    real fz;         // accumulated force, z component
    real fw;         // accumulated Born force term (written to global_bornForce)
    real bornRadius; // copied from global_bornRadii
} AtomData2;
/**
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms,
#else
unsigned int numTiles,
#endif
__global const ushort2* exclusionTiles) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
real energy = 0.0f;
__local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real4 force = 0.0f;
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
localData[get_local_id(0)].bornRadius = bornRadius1;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+j].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
}
#endif
SYNC_WARPS;
}
}
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
localData[get_local_id(0)].q = tempPosq.w;
localData[get_local_id(0)].bornRadius = global_bornRadii[j];
localData[get_local_id(0)].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+tj].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
}
#endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
SYNC_WARPS;
}
}
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&global_bornForce[offset], (long) (force.w*0x100000000));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
atom_add(&global_bornForce[offset], (long) (localData[get_local_id(0)].fw*0x100000000));
}
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
global_bornForce[offset1] += force.w;
if (x != y) {
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
global_bornForce[offset2] += localData[get_local_id(0)].fw;
}
#endif
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
int pos = warp*numTiles/totalWarps;
int end = (warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE];
__local int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[get_local_id(0)] = -1;
while (pos < end) {
real4 force = 0;
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y
+=
(
x
<
y
?
-1
:
1
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
//
Skip
over
tiles
that
have
exclusions,
since
they
were
already
processed.
SYNC_WARPS
;
while
(
skipTiles[tbx+TILE_SIZE-1]
<
pos
)
{
SYNC_WARPS
;
if
(
skipBase+tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles[skipBase+tgx]
;
skipTiles[get_local_id
(
0
)
]
=
tile.x
+
tile.y*NUM_BLOCKS
-
tile.y*
(
tile.y+1
)
/2
;
}
else
skipTiles[get_local_id
(
0
)
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
SYNC_WARPS
;
}
while
(
skipTiles[currentSkipIndex]
<
pos
)
currentSkipIndex++
;
includeTile
=
(
skipTiles[currentSkipIndex]
!=
pos
)
;
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x*TILE_SIZE
+
tgx
;
//
Load
atom
data
for
this
tile.
real4
posq1
=
posq[atom1]
;
real
bornRadius1
=
global_bornRadii[atom1]
;
#
ifdef
USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms[pos*TILE_SIZE+tgx]
:
y*TILE_SIZE
+
tgx
)
;
#
else
unsigned
int
j
=
y*TILE_SIZE
+
tgx
;
#
endif
atomIndices[get_local_id
(
0
)
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq[j]
;
localData[get_local_id
(
0
)
].x
=
tempPosq.x
;
localData[get_local_id
(
0
)
].y
=
tempPosq.y
;
localData[get_local_id
(
0
)
].z
=
tempPosq.z
;
localData[get_local_id
(
0
)
].q
=
tempPosq.w
;
localData[get_local_id
(
0
)
].bornRadius
=
global_bornRadii[j]
;
localData[get_local_id
(
0
)
].fx
=
0.0f
;
localData[get_local_id
(
0
)
].fy
=
0.0f
;
localData[get_local_id
(
0
)
].fz
=
0.0f
;
localData[get_local_id
(
0
)
].fw
=
0.0f
;
}
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
real4
blockCenterX
=
blockCenter[x]
;
posq1.xyz
-=
floor
((
posq1.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
localData[get_local_id
(
0
)
].x
-=
floor
((
localData[get_local_id
(
0
)
].x-blockCenterX.x
)
*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
localData[get_local_id
(
0
)
].y
-=
floor
((
localData[get_local_id
(
0
)
].y-blockCenterX.y
)
*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
localData[get_local_id
(
0
)
].z
-=
floor
((
localData[get_local_id
(
0
)
].z-blockCenterX.z
)
*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
SYNC_WARPS
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
; j < TILE_SIZE; j++) {
int
atom2
=
atomIndices[tbx+tj]
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real4
posq2
=
(
real4
)
(
localData[tbx+tj].x,
localData[tbx+tj].y,
localData[tbx+tj].z,
localData[tbx+tj].q
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
real
bornRadius2
=
localData[tbx+tj].bornRadius
;
real
alpha2_ij
=
bornRadius1*bornRadius2
;
real
D_ij
=
r2*RECIP
(
4.0f*alpha2_ij
)
;
real
expTerm
=
EXP
(
-D_ij
)
;
real
denominator2
=
r2
+
alpha2_ij*expTerm
;
real
denominator
=
SQRT
(
denominator2
)
;
real
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
*RECIP
(
denominator
)
;
real
Gpol
=
tempEnergy*RECIP
(
denominator2
)
;
real
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
real
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[tbx+tj].fx
+=
delta.x
;
localData[tbx+tj].fy
+=
delta.y
;
localData[tbx+tj].fz
+=
delta.z
;
localData[tbx+tj].fw
+=
dGpol_dalpha2_ij*bornRadius1
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
SYNC_WARPS
;
}
}
else
#
endif
{
//
We
need
to
apply
periodic
boundary
conditions
separately
for
each
interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
; j < TILE_SIZE; j++) {
int
atom2
=
atomIndices[tbx+tj]
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real4
posq2
=
(
real4
)
(
localData[tbx+tj].x,
localData[tbx+tj].y,
localData[tbx+tj].z,
localData[tbx+tj].q
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
real
bornRadius2
=
localData[tbx+tj].bornRadius
;
real
alpha2_ij
=
bornRadius1*bornRadius2
;
real
D_ij
=
r2*RECIP
(
4.0f*alpha2_ij
)
;
real
expTerm
=
EXP
(
-D_ij
)
;
real
denominator2
=
r2
+
alpha2_ij*expTerm
;
real
denominator
=
SQRT
(
denominator2
)
;
real
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
*RECIP
(
denominator
)
;
real
Gpol
=
tempEnergy*RECIP
(
denominator2
)
;
real
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
real
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[tbx+tj].fx
+=
delta.x
;
localData[tbx+tj].fy
+=
delta.y
;
localData[tbx+tj].fz
+=
delta.z
;
localData[tbx+tj].fw
+=
dGpol_dalpha2_ij*bornRadius1
;
#
ifdef
USE_CUTOFF
}
#
endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
SYNC_WARPS
;
}
}
//
Write
results.
#
ifdef
USE_CUTOFF
unsigned
int
atom2
=
atomIndices[get_local_id
(
0
)
]
;
#
else
unsigned
int
atom2
=
y*TILE_SIZE
+
tgx
;
#
endif
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom1],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
atom_add
(
&global_bornForce[atom1],
(
long
)
(
force.w*0x100000000
))
;
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atom_add
(
&forceBuffers[atom2],
(
long
)
(
localData[get_local_id
(
0
)
].fx*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+PADDED_NUM_ATOMS],
(
long
)
(
localData[get_local_id
(
0
)
].fy*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+2*PADDED_NUM_ATOMS],
(
long
)
(
localData[get_local_id
(
0
)
].fz*0x100000000
))
;
atom_add
(
&global_bornForce[atom2],
(
long
)
(
localData[get_local_id
(
0
)
].fw*0x100000000
))
;
}
#
else
unsigned
int
offset1
=
atom1
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
atom2
+
warp*PADDED_NUM_ATOMS
;
forceBuffers[offset1].xyz
+=
force.xyz
;
global_bornForce[offset1]
+=
force.w
;
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
forceBuffers[offset2]
+=
(
real4
)
(
localData[get_local_id
(
0
)
].fx,
localData[get_local_id
(
0
)
].fy,
localData[get_local_id
(
0
)
].fz,
0.0f
)
;
global_bornForce[offset2]
+=
localData[get_local_id
(
0
)
].fw
;
}
#
endif
}
pos++
;
}
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
platforms/opencl/src/kernels/gbsaObc_cpu.cl
View file @
93c467b2
#
define
TILE_SIZE
32
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
typedef
struct
{
typedef
struct
{
real
x,
y,
z
;
real
x,
y,
z
;
...
@@ -10,58 +12,42 @@ typedef struct {
...
@@ -10,58 +12,42 @@ typedef struct {
/**
/**
*
Compute
the
Born
sum.
*
Compute
the
Born
sum.
*/
*/
__kernel
void
computeBornSum
(
__kernel
void
computeBornSum
(
__global
real*
restrict
global_bornSum,
__global
const
real4*
restrict
posq,
__global
const
float2*
restrict
global_params,
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
ifdef
USE_CUTOFF
__global
long*
restrict
global_bornSum,
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
)
{
#
else
#
else
unsigned
int
numTiles
)
{
__global
real*
restrict
global_bornSum,
#
endif
#
endif
__global
const
real4*
restrict
posq,
__global
const
float2*
restrict
global_params,
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
int*
restrict
interactingAtoms,
unsigned
int
pos
=
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
#
else
#
else
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
numTiles,
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
__global
const
ushort2*
exclusionTiles
)
{
__local
AtomData1
localData[TILE_SIZE]
;
__local
AtomData1
localData[TILE_SIZE]
;
while
(
pos
<
end
)
{
//
First
loop:
process
tiles
that
contain
exclusions.
//
Extract
the
coordinates
of
this
tile
unsigned
int
x,
y
;
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+get_group_id
(
0
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
#
ifdef
USE_CUTOFF
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
get_group_id
(
0
)
+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
if
(
numTiles
<=
maxTiles
)
{
for
(
int
pos
=
firstExclusionTile
; pos < lastExclusionTile; pos++) {
ushort2
tileIndices
=
tiles[pos]
;
const
ushort2
tileIndices
=
exclusionTiles[pos]
;
x
=
tileIndices.x
;
const
unsigned
int
x
=
tileIndices.x
;
y
=
tileIndices.y
;
const
unsigned
int
y
=
tileIndices.y
;
}
else
//
Load
the
data
for
this
tile.
#
endif
{
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
unsigned
int
j
=
y*TILE_SIZE
+
localAtomIndex
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
real4
tempPosq
=
posq[j]
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
localData[localAtomIndex].x
=
tempPosq.x
;
y += (x < y ? -1 : 1);
localData[localAtomIndex].y
=
tempPosq.y
;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
localData[localAtomIndex].z
=
tempPosq.z
;
}
localData[localAtomIndex].q
=
tempPosq.w
;
}
float2
tempParams
=
global_params[j]
;
localData[localAtomIndex].radius
=
tempParams.x
;
// Load the data for this tile if we don't already have it cached.
localData[localAtomIndex].scaledRadius
=
tempParams.y
;
if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[localAtomIndex].radius = tempParams.x;
localData[localAtomIndex].scaledRadius = tempParams.y;
}
}
}
if
(
x
==
y
)
{
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
//
This
tile
is
on
the
diagonal.
...
@@ -93,31 +79,31 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
...
@@ -93,31 +79,31 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
real
l_ij2
=
l_ij*l_ij
;
real
l_ij2
=
l_ij*l_ij
;
real
u_ij2
=
u_ij*u_ij
;
real
u_ij2
=
u_ij*u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
))
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
))
;
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
bornSum
+=
l_ij
-
u_ij
+
(
0.50f*invR*ratio
)
+
0.25f*
(
r*
(
u_ij2-l_ij2
)
+
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
(
params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
))
;
if (params1.x < params2.y-r)
bornSum
+=
(
params1.x
<
params2.y-r
?
2.0f*
(
RECIP
(
params1.x
)
-l_ij
)
:
0
)
;
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
}
}
}
}
}
//
Write
results.
//
Write
results.
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&global_bornSum[atom1],
(
long
)
(
bornSum*0x100000000
))
;
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_bornSum[offset]
+=
bornSum
;
global_bornSum[offset]
+=
bornSum
;
#
endif
}
}
}
}
else
{
else
{
//
This
is
an
off-diagonal
tile.
//
This
is
an
off-diagonal
tile.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++)
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++)
localData[tgx].bornSum = 0.0f;
localData[tgx].bornSum
=
0
;
// Compute the full set of interactions in this tile.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real bornSum = 0
.0f
;
real
bornSum
=
0
;
real4
posq1
=
posq[atom1]
;
real4
posq1
=
posq[atom1]
;
float2
params1
=
global_params[atom1]
;
float2
params1
=
global_params[atom1]
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
...
@@ -126,7 +112,7 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
...
@@ -126,7 +112,7 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
#
ifdef
USE_PERIODIC
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
#
endif
real r2 = d
ot(
delta.x
yz,
delta.
xyz)
;
real
r2
=
d
elta.x*
delta.x
+
delta.
y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
#
else
...
@@ -134,8 +120,6 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
...
@@ -134,8 +120,6 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
#
endif
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
real
r
=
RECIP
(
invR
)
;
float2
params2
=
(
float2
)
(
localData[j].radius,
localData[j].scaledRadius
)
;
float2
params2
=
(
float2
)
(
localData[j].radius,
localData[j].scaledRadius
)
;
real
rScaledRadiusJ
=
r+params2.y
;
real
rScaledRadiusJ
=
r+params2.y
;
if
(
params1.x
<
rScaledRadiusJ
)
{
if
(
params1.x
<
rScaledRadiusJ
)
{
...
@@ -144,10 +128,9 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
...
@@ -144,10 +128,9 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
real
l_ij2
=
l_ij*l_ij
;
real
l_ij2
=
l_ij*l_ij
;
real
u_ij2
=
u_ij*u_ij
;
real
u_ij2
=
u_ij*u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
))
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
))
;
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
bornSum
+=
l_ij
-
u_ij
+
(
0.50f*invR*ratio
)
+
0.25f*
(
r*
(
u_ij2-l_ij2
)
+
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
(
params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
))
;
if (params1.x < params2.y-r)
bornSum
+=
(
params1.x
<
params2.y-r
?
2.0f*
(
RECIP
(
params1.x
)
-l_ij
)
:
0
)
;
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
}
real
rScaledRadiusI
=
r+params1.y
;
real
rScaledRadiusI
=
r+params1.y
;
if
(
params2.x
<
rScaledRadiusI
)
{
if
(
params2.x
<
rScaledRadiusI
)
{
...
@@ -156,10 +139,9 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
...
@@ -156,10 +139,9 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
real
l_ij2
=
l_ij*l_ij
;
real
l_ij2
=
l_ij*l_ij
;
real
u_ij2
=
u_ij*u_ij
;
real
u_ij2
=
u_ij*u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
))
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
))
;
real term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
real
term
=
l_ij
-
u_ij
+
(
0.50f*invR*ratio
)
+
0.25f*
(
r*
(
u_ij2-l_ij2
)
+
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
(
params1.y*params1.y*invR
)
*
(
l_ij2-u_ij2
))
;
if (params2.x < params1.y-r)
term
+=
(
params2.x
<
params1.y-r
?
2.0f*
(
RECIP
(
params2.x
)
-l_ij
)
:
0
)
;
term += 2.0f*(RECIP(params2.x)-l_ij);
localData[j].bornSum
+=
term
;
localData[j].bornSum
+=
term
;
}
}
}
}
...
@@ -167,91 +149,299 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
...
@@ -167,91 +149,299 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
//
Write
results
for
atom1.
//
Write
results
for
atom1.
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&global_bornSum[atom1],
(
long
)
(
bornSum*0x100000000
))
;
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_bornSum[offset]
+=
bornSum
;
global_bornSum[offset]
+=
bornSum
;
#
endif
}
}
// Write results
//
Write
results
.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
#
ifdef
SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
;
atom_add
(
&global_bornSum[offset],
(
long
)
(
localData[tgx].bornSum*0x100000000
))
;
#
else
unsigned
int
offset
=
y*TILE_SIZE+tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
unsigned
int
offset
=
y*TILE_SIZE+tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_bornSum[offset]
+=
localData[tgx].bornSum
;
global_bornSum[offset]
+=
localData[tgx].bornSum
;
#
endif
}
}
}
}
lasty = y;
pos++;
}
}
}
typedef struct {
//
Second
loop:
tiles
without
exclusions,
either
from
the
neighbor
list
(
with
cutoff
)
or
just
enumerating
all
real x, y, z;
//
of
them
(
no
cutoff
)
.
real q;
real fx, fy, fz, fw;
real bornRadius;
} AtomData2;
/**
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
unsigned int numTiles) {
#endif
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
int
pos
=
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
unsigned
int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
int
end
=
(
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
#
else
#
else
unsigned
int pos = get_group_id(0)*numTiles/get_num_groups(0);
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
#
endif
real energy = 0.0f
;
int
nextToSkip
=
-1
;
unsigned int lasty = 0xFFFFFFFF
;
int
currentSkipIndex
=
0
;
__local
AtomData2 localData
[TILE_SIZE];
__local
int
atomIndices
[TILE_SIZE]
;
while
(
pos
<
end
)
{
while
(
pos
<
end
)
{
// Extract the coordinates of this tile
bool
includeTile
=
true
;
//
Extract
the
coordinates
of
this
tile.
unsigned
int
x,
y
;
unsigned
int
x,
y
;
bool
singlePeriodicCopy
=
false
;
#
ifdef
USE_CUTOFF
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
x
=
tileIndices.x
;
y = tileIndices.y;
singlePeriodicCop
y
=
tileIndices.y
;
}
}
else
else
#
endif
#
endif
{
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-
sqrt
((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-
SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
}
//
Load
the
data
for
this
tile
if
we
don
't
already
have
it
cached.
// Skip over tiles that have exclusions, since they were already processed.
while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
nextToSkip = end;
}
includeTile = (nextToSkip != pos);
}
if (includeTile) {
// Load the data for this tile.
if
(
lasty
!=
y
)
{
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned
int
j
=
y*TILE_SIZE
+
localAtomIndex
;
#ifdef USE_CUTOFF
real4
tempPosq
=
posq[j]
;
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
localData[localAtomIndex].x
=
tempPosq.x
;
#else
localData[localAtomIndex].y
=
tempPosq.y
;
unsigned int j = y*TILE_SIZE+localAtomIndex;
localData[localAtomIndex].z
=
tempPosq.z
;
#endif
localData[localAtomIndex].q
=
tempPosq.w
;
atomIndices[localAtomIndex] = j;
localData[localAtomIndex].bornRadius
=
global_bornRadii[j]
;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[localAtomIndex].radius = tempParams.x;
localData[localAtomIndex].scaledRadius = tempParams.y;
localData[localAtomIndex].bornSum = 0.0f;
}
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real bornSum = 0;
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
localData[j].bornSum += term;
}
}
}
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real bornSum = 0;
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j];
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
localData[j].bornSum += term;
}
}
}
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
}
// Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[tgx];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom2], (long) (localData[tgx].bornSum*0x100000000));
#else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum;
#endif
}
}
}
}
}
pos++;
}
}
/**
 * Per-atom scratch record held in the __local tile cache of computeGBSAForce1.
 * Field order is unchanged from the grouped declaration form.
 */
typedef struct {
    /* Position, copied from the x/y/z components of the atom's posq entry. */
    real x;
    real y;
    real z;
    /* Copied from the w component of posq (presumably the charge; it is
       multiplied into the pair energy together with PREFACTOR). */
    real q;
    /* Accumulators, zeroed before an off-diagonal tile is processed.
       fx/fy/fz are later added to forceBuffers and fw to global_bornForce. */
    real fx;
    real fy;
    real fz;
    real fw;
    /* Copied from global_bornRadii for this atom. */
    real bornRadius;
} AtomData2;
/**
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms,
#else
unsigned int numTiles,
#endif
__global const ushort2* exclusionTiles) {
real energy = 0.0f;
__local AtomData2 localData[TILE_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
// Load the data for this tile.
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
localData[localAtomIndex].bornRadius = global_bornRadii[j];
}
if (x == y) {
if (x == y) {
// This tile is on the diagonal.
// This tile is on the diagonal.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
unsigned int atom1 = x*TILE_SIZE+tgx;
real4
force
=
0
.0f
;
real4 force = 0;
real4 posq1 = posq[atom1];
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
...
@@ -260,7 +450,7 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
...
@@ -260,7 +450,7 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
#endif
real
r2
=
d
ot
(
delta.x
yz,
delta.
xyz
)
;
real r2 = d
elta.x*
delta.x
+
delta.
y*delta.y + delta.z*delta.z
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
#else
...
@@ -277,35 +467,40 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
...
@@ -277,35 +467,40 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += 0.5f*tempEnergy;
energy += 0.5f*tempEnergy;
force.xyz
-=
delta.xyz*dEdR
;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
}
}
}
}
// Write results.
// Write results.
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
global_bornForce[offset] += force.w;
global_bornForce[offset] += force.w;
#endif
}
}
}
}
else {
else {
// This is an off-diagonal tile.
// This is an off-diagonal tile.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].fx
=
0
.0f
;
localData[tgx].fx = 0;
localData[tgx].fy
=
0
.0f
;
localData[tgx].fy = 0;
localData[tgx].fz
=
0
.0f
;
localData[tgx].fz = 0;
localData[tgx].fw
=
0
.0f
;
localData[tgx].fw = 0;
}
}
//
Compute
the
full
set
of
interactions
in
this
tile.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
unsigned int atom1 = x*TILE_SIZE+tgx;
real4
force
=
0
.0f
;
real4 force = 0;
real4 posq1 = posq[atom1];
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) {
...
@@ -314,7 +509,7 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
...
@@ -314,7 +509,7 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
#ifdef USE_PERIODIC
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
#endif
real
r2
=
d
ot
(
delta.x
yz,
delta.
xyz
)
;
real r2 = d
elta.x*
delta.x
+
delta.
y*delta.y + delta.z*delta.z
;
#ifdef USE_CUTOFF
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
#else
...
@@ -331,8 +526,8 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
...
@@ -331,8 +526,8 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
energy += tempEnergy;
delta.xyz *= dEdR;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
force.xyz -= delta.xyz;
...
@@ -343,16 +538,30 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
...
@@ -343,16 +538,30 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
}
}
}
}
//
Write
results
for
atom1.
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
global_bornForce[offset] += force.w;
global_bornForce[offset] += force.w;
#endif
}
}
//
Write
results
// Write results
.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[tgx].fx*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
atom_add(&global_bornForce[offset], (long) (localData[tgx].fw*0x100000000));
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.x += localData[tgx].fx;
...
@@ -360,9 +569,231 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
...
@@ -360,9 +569,231 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
f.z += localData[tgx].fz;
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
forceBuffers[offset] = f;
global_bornForce[offset] += localData[tgx].fw;
global_bornForce[offset] += localData[tgx].fw;
#endif
}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
int pos = get_group_id(0)*numTiles/get_num_groups(0);
int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
int nextToSkip = -1;
int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE];
while (pos < end) {
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y
+=
(
x
<
y
?
-1
:
1
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
//
Skip
over
tiles
that
have
exclusions,
since
they
were
already
processed.
while
(
nextToSkip
<
pos
)
{
if
(
currentSkipIndex
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles[currentSkipIndex++]
;
nextToSkip
=
tile.x
+
tile.y*NUM_BLOCKS
-
tile.y*
(
tile.y+1
)
/2
;
}
else
nextToSkip
=
end
;
}
includeTile
=
(
nextToSkip
!=
pos
)
;
}
if
(
includeTile
)
{
//
Load
the
data
for
this
tile.
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#
ifdef
USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms[pos*TILE_SIZE+localAtomIndex]
:
y*TILE_SIZE+localAtomIndex
)
;
#
else
unsigned
int
j
=
y*TILE_SIZE+localAtomIndex
;
#
endif
atomIndices[localAtomIndex]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq[j]
;
localData[localAtomIndex].x
=
tempPosq.x
;
localData[localAtomIndex].y
=
tempPosq.y
;
localData[localAtomIndex].z
=
tempPosq.z
;
localData[localAtomIndex].q
=
tempPosq.w
;
localData[localAtomIndex].bornRadius
=
global_bornRadii[j]
;
localData[localAtomIndex].fx
=
0.0f
;
localData[localAtomIndex].fy
=
0.0f
;
localData[localAtomIndex].fz
=
0.0f
;
localData[localAtomIndex].fw
=
0.0f
;
}
}
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
real4
blockCenterX
=
blockCenter[x]
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
localData[tgx].x
-=
floor
((
localData[tgx].x-blockCenterX.x
)
*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
localData[tgx].y
-=
floor
((
localData[tgx].y-blockCenterX.y
)
*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
localData[tgx].z
-=
floor
((
localData[tgx].z-blockCenterX.z
)
*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
}
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real4
force
=
0
;
real4
posq1
=
posq[atom1]
;
posq1.xyz
-=
floor
((
posq1.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
float
bornRadius1
=
global_bornRadii[atom1]
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
real4
posq2
=
(
real4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
int
atom2
=
atomIndices[j]
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
real
bornRadius2
=
localData[j].bornRadius
;
real
alpha2_ij
=
bornRadius1*bornRadius2
;
real
D_ij
=
r2*RECIP
(
4.0f*alpha2_ij
)
;
real
expTerm
=
EXP
(
-D_ij
)
;
real
denominator2
=
r2
+
alpha2_ij*expTerm
;
real
denominator
=
SQRT
(
denominator2
)
;
real
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
*RECIP
(
denominator
)
;
real
Gpol
=
tempEnergy*RECIP
(
denominator2
)
;
real
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
real
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[j].fx
+=
delta.x
;
localData[j].fy
+=
delta.y
;
localData[j].fz
+=
delta.z
;
localData[j].fw
+=
dGpol_dalpha2_ij*bornRadius1
;
}
}
//
Write
results
for
atom1.
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom1],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
atom_add
(
&global_bornForce[atom1],
(
long
)
(
force.w*0x100000000
))
;
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz
;
global_bornForce[offset]
+=
force.w
;
#
endif
}
}
else
#
endif
{
//
We
need
to
apply
periodic
boundary
conditions
separately
for
each
interaction.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real4
force
=
0
;
real4
posq1
=
posq[atom1]
;
float
bornRadius1
=
global_bornRadii[atom1]
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
real4
posq2
=
(
real4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
int
atom2
=
atomIndices[j]
;
#
ifdef
USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
real
bornRadius2
=
localData[j].bornRadius
;
real
alpha2_ij
=
bornRadius1*bornRadius2
;
real
D_ij
=
r2*RECIP
(
4.0f*alpha2_ij
)
;
real
expTerm
=
EXP
(
-D_ij
)
;
real
denominator2
=
r2
+
alpha2_ij*expTerm
;
real
denominator
=
SQRT
(
denominator2
)
;
real
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
*RECIP
(
denominator
)
;
real
Gpol
=
tempEnergy*RECIP
(
denominator2
)
;
real
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
real
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[j].fx
+=
delta.x
;
localData[j].fy
+=
delta.y
;
localData[j].fz
+=
delta.z
;
localData[j].fw
+=
dGpol_dalpha2_ij*bornRadius1
;
}
}
//
Write
results
for
atom1.
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom1],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
atom_add
(
&global_bornForce[atom1],
(
long
)
(
force.w*0x100000000
))
;
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz
;
global_bornForce[offset]
+=
force.w
;
#
endif
}
}
//
Write
results.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
#
ifdef
USE_CUTOFF
unsigned
int
atom2
=
atomIndices[tgx]
;
#
else
unsigned
int
atom2
=
y*TILE_SIZE
+
tgx
;
#
endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom2],
(
long
)
(
localData[tgx].fx*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+PADDED_NUM_ATOMS],
(
long
)
(
localData[tgx].fy*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+2*PADDED_NUM_ATOMS],
(
long
)
(
localData[tgx].fz*0x100000000
))
;
atom_add
(
&global_bornForce[atom2],
(
long
)
(
localData[tgx].fw*0x100000000
))
;
#
else
unsigned
int
offset
=
atom2
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
real4
f
=
forceBuffers[offset]
;
f.x
+=
localData[tgx].fx
;
f.y
+=
localData[tgx].fy
;
f.z
+=
localData[tgx].fz
;
forceBuffers[offset]
=
f
;
global_bornForce[offset]
+=
localData[tgx].fw
;
#
endif
}
}
}
}
}
lasty
=
y
;
pos++
;
pos++
;
}
}
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
...
...
platforms/opencl/src/kernels/gbsaObc_default.cl
deleted
100644 → 0
View file @
f6d4557d
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#
define
TILE_SIZE
32
/**
 * Per-atom record that computeBornSum stages in local memory for the atoms
 * of one tile: the position plus the two per-atom parameters.
 */
typedef struct {
    real x;              // position, copied from posq[].x
    real y;              // position, copied from posq[].y
    real z;              // position, copied from posq[].z
    float radius;        // copied from global_params[].x
    float scaledRadius;  // copied from global_params[].y
} AtomData1;
/**
 * Compute the Born sum.
 *
 * Every work-group processes a contiguous slice [pos, end) of the tile list.
 * A tile pairs atom block x with atom block y (TILE_SIZE atoms each). Work-items
 * whose local id is below TILE_SIZE start at atom 0 of block y, the remaining
 * work-items start at atom TILE_SIZE/2, and each of them walks TILE_SIZE/2
 * partners, so that together they cover the whole tile.
 * NOTE(review): this presumes FORCE_WORK_GROUP_SIZE == 2*TILE_SIZE - confirm.
 *
 * @param global_bornSum      per-atom output sums. With SUPPORTS_64_BIT_ATOMICS the values are
 *                            accumulated through atom_add as fixed point scaled by 0x100000000;
 *                            otherwise they are added into a buffer slice selected by work-group
 *                            (or by block when USE_OUTPUT_BUFFER_PER_BLOCK is defined)
 * @param posq                atom positions (only x, y, z are read here)
 * @param global_params       per-atom parameters: x = radius, y = scaled radius
 * @param tiles               (USE_CUTOFF) list of (x, y) block indices of the tiles to process
 * @param interactionCount    (USE_CUTOFF) element 0 holds the number of entries in tiles
 * @param periodicBoxSize     (USE_CUTOFF) box dimensions, used when USE_PERIODIC is defined
 * @param invPeriodicBoxSize  (USE_CUTOFF) reciprocal box dimensions, used when USE_PERIODIC is defined
 * @param maxTiles            (USE_CUTOFF) if the interaction count exceeds this, the tile list is
 *                            ignored and all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are enumerated
 * @param numTiles            (no cutoff) total number of tiles to enumerate
 */
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict global_bornSum,
#else
        __global real* restrict global_bornSum,
#endif
        __global const real4* restrict posq,
        __global const float2* restrict global_params,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles,
        __global const unsigned int* restrict interactionCount,
        real4 periodicBoxSize,
        real4 invPeriodicBoxSize,
        unsigned int maxTiles) {
    const unsigned int numTiles = interactionCount[0];
    // Number of tiles shared out between the work-groups.
    const unsigned int tileCount = (numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles);
#else
        unsigned int numTiles) {
    const unsigned int tileCount = numTiles;
#endif
    unsigned int pos = get_group_id(0)*tileCount/get_num_groups(0);
    const unsigned int end = (get_group_id(0)+1)*tileCount/get_num_groups(0);
    unsigned int lasty = 0xFFFFFFFF;
    __local AtomData1 localData[TILE_SIZE];
    __local real localBornSum[FORCE_WORK_GROUP_SIZE];
    __local real localTemp[TILE_SIZE];

    // Quantities that depend only on the work-item, not on the tile.
    const unsigned int localId = get_local_id(0);
    const bool lowerHalf = (localId < TILE_SIZE);
    const unsigned int halfBase = (lowerHalf ? 0 : TILE_SIZE/2);
    const unsigned int tgx = localId & (TILE_SIZE-1);
    const unsigned int sumOffset = localId & ~(TILE_SIZE-1);

    while (pos < end) {
        // Work out which pair of atom blocks this tile covers.
        unsigned int x, y;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            const ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            y = tileIndices.y;
        }
        else
#endif
        {
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) {
                // Roundoff error occasionally puts y one row off; step back and redo x.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }
        }
        const unsigned int atom1 = x*TILE_SIZE + tgx;
        real bornSum = 0.0f;
        const real4 posq1 = posq[atom1];
        const float2 params1 = global_params[atom1];
        if (x == y) {
            // Diagonal tile: both sides of every pair come from the same block, so the
            // data already loaded for atom1 is what gets staged in local memory.
            if (lowerHalf) {
                localData[localId].x = posq1.x;
                localData[localId].y = posq1.y;
                localData[localId].z = posq1.z;
                localData[localId].radius = params1.x;
                localData[localId].scaledRadius = params1.y;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
                const unsigned int k = halfBase+j;
                real4 delta = (real4) (localData[k].x-posq1.x, localData[k].y-posq1.y, localData[k].z-posq1.z, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                const real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                const real invR = RSQRT(r2);
                const real r = RECIP(invR);
                const float2 params2 = (float2) (localData[k].radius, localData[k].scaledRadius);
                const real rScaledRadiusJ = r+params2.y;
                // Skip padding atoms, the atom paired with itself, and pairs that fail the radius test.
                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+k < NUM_ATOMS && k != tgx && params1.x < rScaledRadiusJ);
#ifdef USE_CUTOFF
                includeInteraction = (includeInteraction && r2 < CUTOFF_SQUARED);
#endif
                const real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                const real u_ij = RECIP(rScaledRadiusJ);
                const real l_ij2 = l_ij*l_ij;
                const real u_ij2 = u_ij*u_ij;
                const real ratio = LOG(u_ij * RECIP(l_ij));
                // The terms are always evaluated and then masked, keeping the loop branch free.
                bornSum += (includeInteraction ? l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
                                                 (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2) : (real) 0);
                bornSum += (includeInteraction && params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : (real) 0);
            }

            // The upper half hands its partial sum to the lower half, which writes the total.
            if (!lowerHalf)
                localTemp[tgx] = bornSum;
            barrier(CLK_LOCAL_MEM_FENCE);
            if (lowerHalf) {
                const real total = bornSum + localTemp[tgx];
#if defined(SUPPORTS_64_BIT_ATOMICS)
                atom_add(&global_bornSum[atom1], (long) (total*0x100000000));
#elif defined(USE_OUTPUT_BUFFER_PER_BLOCK)
                global_bornSum[atom1 + x*PADDED_NUM_ATOMS] += total;
#else
                global_bornSum[atom1 + get_group_id(0)*PADDED_NUM_ATOMS] += total;
#endif
            }
            // No barrier needed here: localTemp is not touched again before the next barrier.
        }
        else {
            // Off-diagonal tile: stage block y in local memory unless the previous tile left it there.
            if (lasty != y && lowerHalf) {
                const unsigned int j = y*TILE_SIZE + tgx;
                const real4 tempPosq = posq[j];
                localData[localId].x = tempPosq.x;
                localData[localId].y = tempPosq.y;
                localData[localId].z = tempPosq.z;
                const float2 tempParams = global_params[j];
                localData[localId].radius = tempParams.x;
                localData[localId].scaledRadius = tempParams.y;
            }
            localBornSum[localId] = 0.0f;
            barrier(CLK_LOCAL_MEM_FENCE);

            // Walk the partners in a rotating order; each step ends in a barrier so the
            // accumulation into localBornSum is separated step by step.
            unsigned int tj = (tgx+halfBase) & (TILE_SIZE-1);
            for (unsigned int step = 0; step < TILE_SIZE/2; step++) {
                real4 delta = (real4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                const real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
#ifdef USE_CUTOFF
                includeInteraction = (includeInteraction && r2 < CUTOFF_SQUARED);
#endif
                const real invR = RSQRT(r2);
                const real r = RECIP(invR);
                const float2 params2 = (float2) (localData[tj].radius, localData[tj].scaledRadius);

                // Term added to atom1's sum (kept in a register).
                const real rScaledRadiusJ = r+params2.y;
                const real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                const real u_ij = RECIP(rScaledRadiusJ);
                const real l_ij2 = l_ij*l_ij;
                const real u_ij2 = u_ij*u_ij;
                const real ratio_ij = LOG(u_ij * RECIP(l_ij));
                const unsigned int includeTerm = (includeInteraction && params1.x < rScaledRadiusJ);
                bornSum += (includeTerm ? l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio_ij) +
                                          (0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2) : (real) 0);
                bornSum += (includeTerm && params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : (real) 0);

                // Mirror term added to the partner atom's sum (kept in local memory).
                const real rScaledRadiusI = r+params1.y;
                const real l_ji = RECIP(max((real) params2.x, fabs(r-params1.y)));
                const real u_ji = RECIP(rScaledRadiusI);
                const real l_ji2 = l_ji*l_ji;
                const real u_ji2 = u_ji*u_ji;
                const real ratio_ji = LOG(u_ji * RECIP(l_ji));
                real term = l_ji - u_ji + 0.25f*r*(u_ji2-l_ji2) + (0.50f*invR*ratio_ji) +
                            (0.25f*params1.y*params1.y*invR)*(l_ji2-u_ji2);
                term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ji) : (real) 0);
                localBornSum[tj+sumOffset] += (includeInteraction && params2.x < rScaledRadiusI ? term : (real) 0);
                barrier(CLK_LOCAL_MEM_FENCE);
                tj = (tj+1) & (TILE_SIZE-1);
            }

            // Combine the two halves and write the results for block x and block y.
            if (!lowerHalf)
                localTemp[tgx] = bornSum;
            barrier(CLK_LOCAL_MEM_FENCE);
            if (lowerHalf) {
                const real total1 = bornSum + localTemp[tgx];
                const real total2 = localBornSum[localId] + localBornSum[localId+TILE_SIZE];
#ifdef SUPPORTS_64_BIT_ATOMICS
                atom_add(&global_bornSum[atom1], (long) (total1*0x100000000));
                atom_add(&global_bornSum[y*TILE_SIZE + tgx], (long) (total2*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
                // Issue both loads ahead of both stores to minimize store-load waits.
                real sum1 = global_bornSum[offset1];
                real sum2 = global_bornSum[offset2];
                sum1 += total1;
                sum2 += total2;
                global_bornSum[offset1] = sum1;
                global_bornSum[offset2] = sum2;
#endif
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        lasty = y;
        pos++;
    }
}
/**
 * Four-component accumulator used for the local-memory force array in
 * computeGBSAForce1 (x, y, z = force on the partner atom, w = its Born force).
 */
typedef struct {
    real x;
    real y;
    real z;
    real w;
    // Never read or written by the kernel in this file.
    // NOTE(review): presumably only here to change the element stride - confirm.
    real padding;
} PaddedUnalignedFloat4;
/**
 * Per-atom record that computeGBSAForce1 stages in local memory for the atoms
 * of one tile, plus scratch space used when the two halves of the work-group
 * combine their results.
 */
typedef struct {
    real x;           // position, copied from posq[].x
    real y;           // position, copied from posq[].y
    real z;           // position, copied from posq[].z
    real q;           // copied from posq[].w
    real bornRadius;  // copied from global_bornRadii[]
    real temp_x;      // force.x handed over by the work-item with local id >= TILE_SIZE
    real temp_y;      // force.y handed over likewise
    real temp_z;      // force.z handed over likewise
    real temp_w;      // force.w (Born force) handed over likewise
} AtomData2;
/**
 * First part of computing the GBSA interaction.
 *
 * Every work-group processes a contiguous slice [pos, end) of the tile list. A tile pairs
 * atom block x with atom block y (TILE_SIZE atoms each). Work-items whose local id is below
 * TILE_SIZE start at atom 0 of block y, the others start at atom TILE_SIZE/2, and each walks
 * TILE_SIZE/2 partners. For every pair the kernel accumulates the force on both atoms, the
 * derivative term stored in global_bornForce, and the pair energy.
 * NOTE(review): indexing localForce[get_local_id(0)+TILE_SIZE] presumes
 * FORCE_WORK_GROUP_SIZE == 2*TILE_SIZE - confirm.
 *
 * @param forceBuffers        force output. With SUPPORTS_64_BIT_ATOMICS: three planes of
 *                            PADDED_NUM_ATOMS longs (x, y, z) updated through atom_add as fixed
 *                            point scaled by 0x100000000. Otherwise: real4 entries in a buffer
 *                            slice selected by work-group (or by block when
 *                            USE_OUTPUT_BUFFER_PER_BLOCK is defined)
 * @param global_bornForce    per-atom accumulation of dGpol_dalpha2_ij times the partner's Born
 *                            radius, stored with the same scheme as forceBuffers
 * @param energyBuffer        per-work-item energy accumulator, indexed by get_global_id(0)
 * @param posq                atom positions (x, y, z) and charge (w)
 * @param global_bornRadii    per-atom Born radii
 * @param tiles               (USE_CUTOFF) list of (x, y) block indices of the tiles to process
 * @param interactionCount    (USE_CUTOFF) element 0 holds the number of entries in tiles
 * @param periodicBoxSize     (USE_CUTOFF) box dimensions, used when USE_PERIODIC is defined
 * @param invPeriodicBoxSize  (USE_CUTOFF) reciprocal box dimensions, used when USE_PERIODIC is defined
 * @param maxTiles            (USE_CUTOFF) if the interaction count exceeds this, the tile list is
 *                            ignored and all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are enumerated
 * @param numTiles            (no cutoff) total number of tiles to enumerate
 */
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif
        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
        unsigned int numTiles) {
#endif
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
    real energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
    __local AtomData2 localData[TILE_SIZE];
    __local PaddedUnalignedFloat4 localForce[FORCE_WORK_GROUP_SIZE];
    while (pos < end) {
        // Extract the coordinates of this tile
        unsigned int x, y;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            y = tileIndices.y;
        }
        else
#endif
        {
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }
        }
        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
        unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 force = 0.0f;
        real4 posq1 = posq[atom1];
        real bornRadius1 = global_bornRadii[atom1];
        if (x == y) {
            // This tile is on the diagonal.
            if (get_local_id(0) < TILE_SIZE) {
                localData[get_local_id(0)].x = posq1.x;
                localData[get_local_id(0)].y = posq1.y;
                localData[get_local_id(0)].z = posq1.z;
                localData[get_local_id(0)].q = posq1.w;
                localData[get_local_id(0)].bornRadius = bornRadius1;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS);
                real4 posq2 = (real4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q);
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                real bornRadius2 = localData[baseLocalAtom+j].bornRadius;
                real alpha2_ij = bornRadius1*bornRadius2;
                real D_ij = r2*RECIP(4.0f*alpha2_ij);
                real expTerm = EXP(-D_ij);
                real denominator2 = r2 + alpha2_ij*expTerm;
                real denominator = SQRT(denominator2);
                real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                real Gpol = tempEnergy*RECIP(denominator2);
                real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
                // Mask out pairs beyond the cutoff without branching.
                dEdR = (r2 > CUTOFF_SQUARED ? (real) 0 : dEdR);
                tempEnergy = (r2 > CUTOFF_SQUARED ? (real) 0 : tempEnergy);
                dGpol_dalpha2_ij = (r2 > CUTOFF_SQUARED ? (real) 0 : dGpol_dalpha2_ij);
#endif
                force.w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius2 : (real) 0);
                // On the diagonal every pair is visited once from each of its atoms, hence the 0.5.
                energy += (includeInteraction ? 0.5f*tempEnergy : (real) 0);
                delta.xyz *= (includeInteraction ? dEdR : (real) 0);
                force.xyz -= delta.xyz;
            }

            // Sum the forces and write results: the upper half of the work-group hands its
            // partial result to the lower half through the temp_* fields.
            if (get_local_id(0) >= TILE_SIZE) {
                localData[tgx].temp_x = force.x;
                localData[tgx].temp_y = force.y;
                localData[tgx].temp_z = force.z;
                localData[tgx].temp_w = force.w;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
                const unsigned int offset = x*TILE_SIZE + tgx;
                atom_add(&forceBuffers[offset], (long) ((force.x + localData[tgx].temp_x)*0x100000000));
                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0x100000000));
                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0x100000000));
                atom_add(&global_bornForce[offset], (long) ((force.w + localData[tgx].temp_w)*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
                const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
                // Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
                real4 sum = forceBuffers[offset];
                real global_sum = global_bornForce[offset];
                sum.x += force.x + localData[tgx].temp_x;
                sum.y += force.y + localData[tgx].temp_y;
                sum.z += force.z + localData[tgx].temp_z;
                global_sum += force.w + localData[tgx].temp_w;
                forceBuffers[offset] = sum;
                global_bornForce[offset] = global_sum;
#endif
            }
            // barrier not required here as localData[*]/temp_* is not accessed before encountering another barrier.
        }
        else {
            // This is an off-diagonal tile. Block y is reloaded only if the previous tile used a different y.
            if (lasty != y && get_local_id(0) < TILE_SIZE) {
                unsigned int j = y*TILE_SIZE + tgx;
                real4 tempPosq = posq[j];
                localData[get_local_id(0)].x = tempPosq.x;
                localData[get_local_id(0)].y = tempPosq.y;
                localData[get_local_id(0)].z = tempPosq.z;
                localData[get_local_id(0)].q = tempPosq.w;
                localData[get_local_id(0)].bornRadius = global_bornRadii[j];
            }
            localForce[get_local_id(0)].x = 0.0f;
            localForce[get_local_id(0)].y = 0.0f;
            localForce[get_local_id(0)].z = 0.0f;
            localForce[get_local_id(0)].w = 0.0f;
            barrier(CLK_LOCAL_MEM_FENCE);

            // Compute the full set of interactions in this tile.
            unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
                real4 posq2 = (real4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                real bornRadius2 = localData[tj].bornRadius;
                real alpha2_ij = bornRadius1*bornRadius2;
                real D_ij = r2*RECIP(4.0f*alpha2_ij);
                real expTerm = EXP(-D_ij);
                real denominator2 = r2 + alpha2_ij*expTerm;
                real denominator = SQRT(denominator2);
                real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                real Gpol = tempEnergy*RECIP(denominator2);
                real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
                // Mask out pairs beyond the cutoff without branching.
                dEdR = (r2 > CUTOFF_SQUARED ? (real) 0 : dEdR);
                tempEnergy = (r2 > CUTOFF_SQUARED ? (real) 0 : tempEnergy);
                dGpol_dalpha2_ij = (r2 > CUTOFF_SQUARED ? (real) 0 : dGpol_dalpha2_ij);
#endif
                force.w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius2 : (real) 0);
                energy += (includeInteraction ? tempEnergy : (real) 0);
                delta.xyz *= (includeInteraction ? dEdR : (real) 0);
                force.xyz -= delta.xyz;
                // Equal and opposite contribution for the partner atom, accumulated in local memory.
                localForce[tj+localForceOffset].x += delta.x;
                localForce[tj+localForceOffset].y += delta.y;
                localForce[tj+localForceOffset].z += delta.z;
                localForce[tj+localForceOffset].w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius1 : (real) 0);
                barrier(CLK_LOCAL_MEM_FENCE);
                tj = (tj+1) & (TILE_SIZE-1);
            }

            // Sum the forces and write results.
            if (get_local_id(0) >= TILE_SIZE) {
                localData[tgx].temp_x = force.x;
                localData[tgx].temp_y = force.y;
                localData[tgx].temp_z = force.z;
                localData[tgx].temp_w = force.w;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
                // Deliberately no barrier inside this branch: only part of the work-group enters
                // it, and OpenCL requires a barrier to be reached by all work-items or by none.
                // The barrier just above already makes temp_* and localForce visible.
#ifdef SUPPORTS_64_BIT_ATOMICS
                const unsigned int offset1 = x*TILE_SIZE + tgx;
                const unsigned int offset2 = y*TILE_SIZE + tgx;
                atom_add(&forceBuffers[offset1], (long) ((force.x + localData[tgx].temp_x)*0x100000000));
                atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0x100000000));
                atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0x100000000));
                atom_add(&global_bornForce[offset1], (long) ((force.w + localData[tgx].temp_w)*0x100000000));
                atom_add(&forceBuffers[offset2], (long) ((localForce[get_local_id(0)].x + localForce[get_local_id(0)+TILE_SIZE].x)*0x100000000));
                atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localForce[get_local_id(0)].y + localForce[get_local_id(0)+TILE_SIZE].y)*0x100000000));
                atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localForce[get_local_id(0)].z + localForce[get_local_id(0)+TILE_SIZE].z)*0x100000000));
                atom_add(&global_bornForce[offset2], (long) ((localForce[get_local_id(0)].w + localForce[get_local_id(0)+TILE_SIZE].w)*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
                // Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
                real4 sum1 = forceBuffers[offset1];
                real4 sum2 = forceBuffers[offset2];
                real global_sum1 = global_bornForce[offset1];
                real global_sum2 = global_bornForce[offset2];
                sum1.x += force.x + localData[tgx].temp_x;
                sum1.y += force.y + localData[tgx].temp_y;
                sum1.z += force.z + localData[tgx].temp_z;
                global_sum1 += force.w + localData[tgx].temp_w;
                sum2.x += localForce[get_local_id(0)].x + localForce[get_local_id(0)+TILE_SIZE].x;
                sum2.y += localForce[get_local_id(0)].y + localForce[get_local_id(0)+TILE_SIZE].y;
                sum2.z += localForce[get_local_id(0)].z + localForce[get_local_id(0)+TILE_SIZE].z;
                global_sum2 += localForce[get_local_id(0)].w + localForce[get_local_id(0)+TILE_SIZE].w;
                forceBuffers[offset1] = sum1;
                forceBuffers[offset2] = sum2;
                global_bornForce[offset1] = global_sum1;
                global_bornForce[offset2] = global_sum2;
#endif
            }
            // Keep the next iteration from overwriting localData/localForce while they are still being read.
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        lasty = y;
        pos++;
    }
    energyBuffer[get_global_id(0)] += energy;
}
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment