Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
93c467b2
Commit
93c467b2
authored
Mar 22, 2013
by
Peter Eastman
Browse files
Merged 5.1Optimizations branch back to trunk
parent
f6d4557d
Changes
86
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3182 additions
and
2450 deletions
+3182
-2450
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+161
-0
platforms/opencl/src/OpenCLSort.h
platforms/opencl/src/OpenCLSort.h
+64
-136
platforms/opencl/src/kernels/andersenThermostat.cl
platforms/opencl/src/kernels/andersenThermostat.cl
+3
-3
platforms/opencl/src/kernels/brownian.cl
platforms/opencl/src/kernels/brownian.cl
+3
-3
platforms/opencl/src/kernels/ccma.cl
platforms/opencl/src/kernels/ccma.cl
+6
-1
platforms/opencl/src/kernels/coulombLennardJones.cl
platforms/opencl/src/kernels/coulombLennardJones.cl
+3
-3
platforms/opencl/src/kernels/customGBEnergyN2.cl
platforms/opencl/src/kernels/customGBEnergyN2.cl
+376
-0
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
+300
-157
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
+0
-258
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
+0
-282
platforms/opencl/src/kernels/customGBValueN2.cl
platforms/opencl/src/kernels/customGBValueN2.cl
+324
-0
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
+240
-141
platforms/opencl/src/kernels/customGBValueN2_default.cl
platforms/opencl/src/kernels/customGBValueN2_default.cl
+0
-237
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
+0
-304
platforms/opencl/src/kernels/fft.cl
platforms/opencl/src/kernels/fft.cl
+2
-2
platforms/opencl/src/kernels/findInteractingBlocks.cl
platforms/opencl/src/kernels/findInteractingBlocks.cl
+224
-205
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
+171
-106
platforms/opencl/src/kernels/gbsaObc.cl
platforms/opencl/src/kernels/gbsaObc.cl
+748
-0
platforms/opencl/src/kernels/gbsaObc_cpu.cl
platforms/opencl/src/kernels/gbsaObc_cpu.cl
+557
-126
platforms/opencl/src/kernels/gbsaObc_default.cl
platforms/opencl/src/kernels/gbsaObc_default.cl
+0
-486
No files found.
platforms/opencl/src/OpenCLSort.cpp
0 → 100644
View file @
93c467b2
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "OpenCLSort.h"
#include "OpenCLKernelSources.h"
#include <map>
using
namespace
OpenMM
;
using
namespace
std
;
/**
 * Create an OpenCLSort object for sorting arrays of a fixed length.
 *
 * The constructor compiles the sort program (substituting the strings supplied by
 * the trait into the kernel source), chooses work group sizes from the device's
 * limits, and, unless the whole array fits in local memory, allocates the
 * workspace arrays used by the bucket sort.
 *
 * @param context the context in which to perform calculations
 * @param trait   a heap-allocated SortTrait describing the data to sort.  It is deleted
 *                by this object's destructor.  If this constructor throws, the destructor
 *                does not run and the trait is NOT deleted here.
 * @param length  the length of the arrays this object will be used to sort
 */
OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
        dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
    // Create kernels.  The trait's strings are spliced into the kernel source.

    std::map<std::string, std::string> replacements;
    replacements["DATA_TYPE"] = trait->getDataType();
    replacements["KEY_TYPE"] = trait->getKeyType();
    replacements["SORT_KEY"] = trait->getSortKey();
    replacements["MIN_KEY"] = trait->getMinKey();
    replacements["MAX_KEY"] = trait->getMaxKey();
    replacements["MAX_VALUE"] = trait->getMaxValue();
    replacements["VALUE_IS_INT2"] = (trait->getDataType() == std::string("int2") ? "1" : "0");
    cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::sort, replacements));
    shortListKernel = cl::Kernel(program, "sortShortList");
    computeRangeKernel = cl::Kernel(program, "computeRange");
    assignElementsKernel = cl::Kernel(program, "assignElementsToBuckets");
    computeBucketPositionsKernel = cl::Kernel(program, "computeBucketPositions");
    copyToBucketsKernel = cl::Kernel(program, "copyDataToBuckets");
    sortBucketsKernel = cl::Kernel(program, "sortBuckets");

    // Work out the work group sizes for various kernels.

    unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
    int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
    // Only half of local memory (measured in data elements) is budgeted for the sort buffer.
    unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
    isShortList = (length <= maxLocalBuffer);
    // Largest power of two that does not exceed maxGroupSize.
    for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2)
        ;
    positionsKernelSize = rangeKernelSize;
    sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2);
    if (rangeKernelSize > length)
        rangeKernelSize = length;
    if (sortKernelSize > maxLocalBuffer)
        sortKernelSize = maxLocalBuffer;
    unsigned int targetBucketSize = sortKernelSize/2;
    // On devices with a very small work group size or local memory budget,
    // sortKernelSize can be 0 or 1, which used to make the division below divide
    // by zero.  Clamp so the host code stays well defined.
    // NOTE(review): whether the bucket kernels work with such tiny sizes cannot
    // be determined from this file.
    if (targetBucketSize < 1)
        targetBucketSize = 1;
    unsigned int numBuckets = length/targetBucketSize;
    if (numBuckets < 1)
        numBuckets = 1;
    if (positionsKernelSize > numBuckets)
        positionsKernelSize = numBuckets;

    // Create workspace arrays.  They are only needed for the bucket sort path.

    if (!isShortList) {
        try {
            dataRange = new OpenCLArray(context, 2, trait->getKeySize(), "sortDataRange");
            bucketOffset = OpenCLArray::create<cl_uint>(context, numBuckets, "bucketOffset");
            bucketOfElement = OpenCLArray::create<cl_uint>(context, length, "bucketOfElement");
            offsetInBucket = OpenCLArray::create<cl_uint>(context, length, "offsetInBucket");
            buckets = new OpenCLArray(context, length, trait->getDataSize(), "buckets");
        }
        catch (...) {
            // The destructor is not invoked for a partially constructed object, so
            // release whatever was allocated before the failure.  Pointers that were
            // never assigned are still NULL, and deleting NULL is a no-op.
            delete dataRange;
            delete bucketOffset;
            delete bucketOfElement;
            delete offsetInBucket;
            delete buckets;
            throw;
        }
    }
}
/**
 * Release the trait and every workspace array owned by this object.
 */
OpenCLSort::~OpenCLSort() {
    // Applying delete to a null pointer does nothing, so the workspace arrays
    // (which stay NULL when the short list path is used) need no explicit checks.
    delete trait;
    delete dataRange;
    delete bucketOfElement;
    delete offsetInBucket;
    delete bucketOffset;
    delete buckets;
}
/**
 * Sort an array in place on the device.
 *
 * @param data the array to sort.  Its length and element size must match the length
 *             and trait data size this object was constructed with.
 * @throws OpenMMException if the array's length or element size does not match
 */
void OpenCLSort::sort(OpenCLArray& data) {
    const bool matchesExpectedLayout = (data.getSize() == dataLength && data.getElementSize() == trait->getDataSize());
    if (!matchesExpectedLayout)
        throw OpenMMException("OpenCLSort called with different data size");
    if (data.getSize() == 0)
        return;

    if (isShortList) {
        // The whole array fits in local memory, so a single kernel launch with
        // one work group performs the entire sort.

        shortListKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
        shortListKernel.setArg<cl_uint>(1, dataLength);
        shortListKernel.setArg(2, dataLength*trait->getDataSize(), NULL);
        context.executeKernel(shortListKernel, sortKernelSize, sortKernelSize);
        return;
    }

    // Step 1: find the range of key values present in the data.

    computeRangeKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
    computeRangeKernel.setArg<cl_uint>(1, data.getSize());
    computeRangeKernel.setArg<cl::Buffer>(2, dataRange->getDeviceBuffer());
    computeRangeKernel.setArg(3, rangeKernelSize*trait->getKeySize(), NULL);
    context.executeKernel(computeRangeKernel, rangeKernelSize, rangeKernelSize);

    // Step 2: decide which bucket each element belongs to.  The per-bucket
    // counters are zeroed first.

    unsigned int bucketCount = bucketOffset->getSize();
    context.clearBuffer(*bucketOffset);
    assignElementsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
    assignElementsKernel.setArg<cl_int>(1, data.getSize());
    assignElementsKernel.setArg<cl_int>(2, bucketCount);
    assignElementsKernel.setArg<cl::Buffer>(3, dataRange->getDeviceBuffer());
    assignElementsKernel.setArg<cl::Buffer>(4, bucketOffset->getDeviceBuffer());
    assignElementsKernel.setArg<cl::Buffer>(5, bucketOfElement->getDeviceBuffer());
    assignElementsKernel.setArg<cl::Buffer>(6, offsetInBucket->getDeviceBuffer());
    context.executeKernel(assignElementsKernel, data.getSize());

    // Step 3: work out where each bucket starts.

    computeBucketPositionsKernel.setArg<cl_int>(0, bucketCount);
    computeBucketPositionsKernel.setArg<cl::Buffer>(1, bucketOffset->getDeviceBuffer());
    computeBucketPositionsKernel.setArg(2, positionsKernelSize*sizeof(cl_int), NULL);
    context.executeKernel(computeBucketPositionsKernel, positionsKernelSize, positionsKernelSize);

    // Step 4: scatter the elements into their buckets.

    copyToBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
    copyToBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
    copyToBucketsKernel.setArg<cl_int>(2, data.getSize());
    copyToBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
    copyToBucketsKernel.setArg<cl::Buffer>(4, bucketOfElement->getDeviceBuffer());
    copyToBucketsKernel.setArg<cl::Buffer>(5, offsetInBucket->getDeviceBuffer());
    context.executeKernel(copyToBucketsKernel, data.getSize());

    // Step 5: sort within each bucket.  The global size is the data length
    // rounded up to a whole number of work groups.

    sortBucketsKernel.setArg<cl::Buffer>(0, data.getDeviceBuffer());
    sortBucketsKernel.setArg<cl::Buffer>(1, buckets->getDeviceBuffer());
    sortBucketsKernel.setArg<cl_int>(2, bucketCount);
    sortBucketsKernel.setArg<cl::Buffer>(3, bucketOffset->getDeviceBuffer());
    sortBucketsKernel.setArg(4, sortKernelSize*trait->getDataSize(), NULL);
    context.executeKernel(sortBucketsKernel,
            ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize,
            sortKernelSize);
}
platforms/opencl/src/OpenCLSort.h
View file @
93c467b2
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors.
*
* Portions copyright (c) 2010
-2013
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -28,9 +28,7 @@
* -------------------------------------------------------------------------- */
#include "OpenCLArray.h"
#include "OpenCLKernelSources.h"
#include "windowsExportOpenCL.h"
#include <map>
namespace
OpenMM
{
...
...
@@ -38,26 +36,19 @@ namespace OpenMM {
* This class sorts arrays of values. It supports any type of values, not just scalars,
* so long as an appropriate sorting key can be defined by which to sort them.
*
* The
class is templatiz
ed by a "trait" class that defines the type of data to
* The
sorting behavior is specifi
ed by a "trait" class that defines the type of data to
* sort and the key for sorting it. Here is an example of a trait class for
* sorting floats:
*
* struct FloatTrait {
* // The name of the data and key types being sorted.
* // Both the host type and OpenCL type is required.
* // For primitive types they will be the same.
* typedef cl_float DataType;
* typedef cl_float KeyType;
* static const char* clDataType() {return "float";}
* static const char* clKeyType() {return "float";}
* // The minimum value a key can take.
* static const char* clMinKey() {return "-MAXFLOAT";}
* // The maximum value a key can take.
* static const char* clMaxKey() {return "MAXFLOAT";}
* // A value whose key is guaranteed to equal clMaxKey().
* static const char* clMaxValue() {return "MAXFLOAT";}
* // The OpenCL code to select the key from the data value.
* static const char* clSortKey() {return "value";}
* class FloatTrait : public OpenCLSort::SortTrait {
* int getDataSize() const {return 4;}
* int getKeySize() const {return 4;}
* const char* getDataType() const {return "float";}
* const char* getKeyType() const {return "float";}
* const char* getMinKey() const {return "-MAXFLOAT";}
* const char* getMaxKey() const {return "MAXFLOAT";}
* const char* getMaxValue() const {return "MAXFLOAT";}
* const char* getSortKey() const {return "value";}
* };
*
* The algorithm used is a bucket sort, followed by a bitonic sort within each bucket
...
...
@@ -74,139 +65,76 @@ namespace OpenMM {
* elements).
*/
template
<
class
TRAIT
>
class
OpenCLSort
{
class
OPENMM_EXPORT_OPENCL
OpenCLSort
{
public:
class
SortTrait
;
/**
* Create an OpenCLSort object for sorting data of a particular type.
*
* @param context the context in which to perform calculations
* @param trait a SortTrait defining the type of data to sort. It should have been allocated
* on the heap with the "new" operator. This object takes over ownership of it,
* and deletes it when the OpenCLSort is deleted.
* @param length the length of the arrays this object will be used to sort
*/
OpenCLSort
(
OpenCLContext
&
context
,
unsigned
int
length
)
:
context
(
context
),
dataRange
(
NULL
),
bucketOfElement
(
NULL
),
offsetInBucket
(
NULL
),
bucketOffset
(
NULL
),
buckets
(
NULL
)
{
// Create kernels.
std
::
map
<
std
::
string
,
std
::
string
>
replacements
;
replacements
[
"DATA_TYPE"
]
=
TRAIT
::
clDataType
();
replacements
[
"KEY_TYPE"
]
=
TRAIT
::
clKeyType
();
replacements
[
"SORT_KEY"
]
=
TRAIT
::
clSortKey
();
replacements
[
"MIN_KEY"
]
=
TRAIT
::
clMinKey
();
replacements
[
"MAX_KEY"
]
=
TRAIT
::
clMaxKey
();
replacements
[
"MAX_VALUE"
]
=
TRAIT
::
clMaxValue
();
replacements
[
"VALUE_IS_INT2"
]
=
(
TRAIT
::
clDataType
()
==
std
::
string
(
"int2"
)
?
"1"
:
"0"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
assignElementsKernel
=
cl
::
Kernel
(
program
,
"assignElementsToBuckets"
);
computeBucketPositionsKernel
=
cl
::
Kernel
(
program
,
"computeBucketPositions"
);
copyToBucketsKernel
=
cl
::
Kernel
(
program
,
"copyDataToBuckets"
);
sortBucketsKernel
=
cl
::
Kernel
(
program
,
"sortBuckets"
);
// Work out the work group sizes for various kernels.
unsigned
int
maxGroupSize
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxGroupSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
rangeKernelSize
;
sortKernelSize
=
rangeKernelSize
/
2
;
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
()
/
sizeof
(
typename
TRAIT
::
DataType
))
/
2
);
if
(
sortKernelSize
>
maxLocalBuffer
)
sortKernelSize
=
maxLocalBuffer
;
unsigned
int
targetBucketSize
=
sortKernelSize
/
2
;
unsigned
int
numBuckets
=
length
/
targetBucketSize
;
if
(
numBuckets
<
1
)
numBuckets
=
1
;
if
(
positionsKernelSize
>
numBuckets
)
positionsKernelSize
=
numBuckets
;
// Create workspace arrays.
dataRange
=
OpenCLArray
::
create
<
typename
TRAIT
::
KeyType
>
(
context
,
2
,
"sortDataRange"
);
bucketOffset
=
OpenCLArray
::
create
<
cl_uint
>
(
context
,
numBuckets
,
"bucketOffset"
);
bucketOfElement
=
OpenCLArray
::
create
<
cl_uint
>
(
context
,
length
,
"bucketOfElement"
);
offsetInBucket
=
OpenCLArray
::
create
<
cl_uint
>
(
context
,
length
,
"offsetInBucket"
);
buckets
=
OpenCLArray
::
create
<
typename
TRAIT
::
DataType
>
(
context
,
length
,
"buckets"
);
}
~
OpenCLSort
()
{
if
(
dataRange
!=
NULL
)
delete
dataRange
;
if
(
bucketOfElement
!=
NULL
)
delete
bucketOfElement
;
if
(
offsetInBucket
!=
NULL
)
delete
offsetInBucket
;
if
(
bucketOffset
!=
NULL
)
delete
bucketOffset
;
if
(
buckets
!=
NULL
)
delete
buckets
;
}
OpenCLSort
(
OpenCLContext
&
context
,
SortTrait
*
trait
,
unsigned
int
length
);
~
OpenCLSort
();
/**
* Sort an array.
*/
void
sort
(
OpenCLArray
&
data
)
{
if
(
data
.
getSize
()
!=
bucketOfElement
->
getSize
())
throw
OpenMMException
(
"OpenCLSort called with different data size"
);
if
(
data
.
getSize
()
==
0
)
return
;
// Compute the range of data values.
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
computeRangeKernel
.
setArg
<
cl_uint
>
(
1
,
data
.
getSize
());
computeRangeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
dataRange
->
getDeviceBuffer
());
computeRangeKernel
.
setArg
(
3
,
rangeKernelSize
*
sizeof
(
typename
TRAIT
::
KeyType
),
NULL
);
context
.
executeKernel
(
computeRangeKernel
,
rangeKernelSize
,
rangeKernelSize
);
// Assign array elements to buckets.
unsigned
int
numBuckets
=
bucketOffset
->
getSize
();
context
.
clearBuffer
(
*
bucketOffset
);
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl_int
>
(
1
,
data
.
getSize
());
assignElementsKernel
.
setArg
<
cl_int
>
(
2
,
numBuckets
);
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
dataRange
->
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
bucketOffset
->
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
bucketOfElement
->
getDeviceBuffer
());
assignElementsKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
offsetInBucket
->
getDeviceBuffer
());
context
.
executeKernel
(
assignElementsKernel
,
data
.
getSize
());
// Compute the position of each bucket.
computeBucketPositionsKernel
.
setArg
<
cl_int
>
(
0
,
numBuckets
);
computeBucketPositionsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
bucketOffset
->
getDeviceBuffer
());
computeBucketPositionsKernel
.
setArg
(
2
,
positionsKernelSize
*
sizeof
(
cl_int
),
NULL
);
context
.
executeKernel
(
computeBucketPositionsKernel
,
positionsKernelSize
,
positionsKernelSize
);
// Copy the data into the buckets.
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
->
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl_int
>
(
2
,
data
.
getSize
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
bucketOffset
->
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
bucketOfElement
->
getDeviceBuffer
());
copyToBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
offsetInBucket
->
getDeviceBuffer
());
context
.
executeKernel
(
copyToBucketsKernel
,
data
.
getSize
());
// Sort each bucket.
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
data
.
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
buckets
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
<
cl_int
>
(
2
,
numBuckets
);
sortBucketsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
bucketOffset
->
getDeviceBuffer
());
sortBucketsKernel
.
setArg
(
4
,
sortKernelSize
*
sizeof
(
typename
TRAIT
::
DataType
),
NULL
);
context
.
executeKernel
(
sortBucketsKernel
,
((
data
.
getSize
()
+
sortKernelSize
-
1
)
/
sortKernelSize
)
*
sortKernelSize
,
sortKernelSize
);
}
void
sort
(
OpenCLArray
&
data
);
private:
OpenCLContext
&
context
;
SortTrait
*
trait
;
OpenCLArray
*
dataRange
;
OpenCLArray
*
bucketOfElement
;
OpenCLArray
*
offsetInBucket
;
OpenCLArray
*
bucketOffset
;
OpenCLArray
*
buckets
;
cl
::
Kernel
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
cl
::
Kernel
shortListKernel
,
computeRangeKernel
,
assignElementsKernel
,
computeBucketPositionsKernel
,
copyToBucketsKernel
,
sortBucketsKernel
;
unsigned
int
dataLength
,
rangeKernelSize
,
positionsKernelSize
,
sortKernelSize
;
bool
isShortList
;
};
/**
* A subclass of SortTrait defines the type of value to sort, and the key for sorting them.
* The strings it returns are substituted into the OpenCL sort kernel source when an
* OpenCLSort is constructed, and the sizes are used to size device and local buffers.
*/
class
OpenCLSort
::
SortTrait
{
public:
virtual
~
SortTrait
()
{
}
/**
* Get the size of each data value in bytes.
*/
virtual
int
getDataSize
()
const
=
0
;
/**
* Get the size of each key value in bytes.
*/
virtual
int
getKeySize
()
const
=
0
;
/**
* Get the data type of the values to sort, as the name of the type in
* kernel code (for example "float").
*/
virtual
const
char
*
getDataType
()
const
=
0
;
/**
* Get the data type of the sorting key, as the name of the type in
* kernel code (for example "float").
*/
virtual
const
char
*
getKeyType
()
const
=
0
;
/**
* Get the minimum value a key can take, written as kernel source text.
*/
virtual
const
char
*
getMinKey
()
const
=
0
;
/**
* Get the maximum value a key can take, written as kernel source text.
*/
virtual
const
char
*
getMaxKey
()
const
=
0
;
/**
* Get a value whose key is guaranteed to equal getMaxKey().
*/
virtual
const
char
*
getMaxValue
()
const
=
0
;
/**
* Get the OpenCL code to select the key from the data value.
*/
virtual
const
char
*
getSortKey
()
const
=
0
;
};
}
// namespace OpenMM
...
...
platforms/opencl/src/kernels/andersenThermostat.cl
View file @
93c467b2
...
...
@@ -4,14 +4,14 @@
__kernel
void
applyAndersenThermostat
(
float
collisionFrequency,
float
kT,
__global
mixed4*
velm,
__global
const
mixed2*
restrict
stepSize,
__global
const
float4*
restrict
random,
unsigned
int
randomIndex,
__global
const
int*
restrict
atomGroups
)
{
float
collisionProbability
=
1.0f-
exp
(
-collisionFrequency*stepSize[0].y
)
;
float
randomRange
=
erf
(
collisionProbability/
sqrt
(
2.0f
))
;
float
collisionProbability
=
1.0f-
EXP
(
-collisionFrequency*stepSize[0].y
)
;
float
randomRange
=
erf
(
collisionProbability/
SQRT
(
2.0f
))
;
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
mixed4
velocity
=
velm[index]
;
float4
selectRand
=
random[randomIndex+atomGroups[index]]
;
float4
velRand
=
random[randomIndex+index]
;
real
scale
=
(
selectRand.w
>
-randomRange
&&
selectRand.w
<
randomRange
?
0
:
1
)
;
real
add
=
(
1-scale
)
*
sqrt
(
kT*velocity.w
)
;
real
add
=
(
1-scale
)
*
SQRT
(
kT*velocity.w
)
;
velocity.x
=
scale*velocity.x
+
add*velRand.x
;
velocity.y
=
scale*velocity.y
+
add*velRand.y
;
velocity.z
=
scale*velocity.z
+
add*velRand.z
;
...
...
platforms/opencl/src/kernels/brownian.cl
View file @
93c467b2
...
...
@@ -8,9 +8,9 @@ __kernel void integrateBrownianPart1(mixed tauDeltaT, mixed noiseAmplitude, __gl
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_ATOMS; index += get_global_size(0)) {
mixed
invMass
=
velm[index].w
;
if
(
invMass
!=
0
)
{
posDelta[index]
=
(
mixed4
)
(
tauDeltaT*invMass*force[index].x
+
noiseAmplitude*
sqrt
(
invMass
)
*random[randomIndex].x,
tauDeltaT*invMass*force[index].y
+
noiseAmplitude*
sqrt
(
invMass
)
*random[randomIndex].y,
tauDeltaT*invMass*force[index].z
+
noiseAmplitude*
sqrt
(
invMass
)
*random[randomIndex].z,
0
)
;
posDelta[index]
=
(
mixed4
)
(
tauDeltaT*invMass*force[index].x
+
noiseAmplitude*
SQRT
(
invMass
)
*random[randomIndex].x,
tauDeltaT*invMass*force[index].y
+
noiseAmplitude*
SQRT
(
invMass
)
*random[randomIndex].y,
tauDeltaT*invMass*force[index].z
+
noiseAmplitude*
SQRT
(
invMass
)
*random[randomIndex].z,
0
)
;
}
randomIndex
+=
get_global_size
(
0
)
;
}
...
...
platforms/opencl/src/kernels/ccma.cl
View file @
93c467b2
...
...
@@ -10,7 +10,8 @@ mixed4 loadPos(__global const real4* restrict posq, __global const real4* restri
/**
*
Compute
the
direction
each
constraint
is
pointing
in.
This
is
called
once
at
the
beginning
of
constraint
evaluation.
*/
__kernel
void
computeConstraintDirections
(
__global
const
int2*
restrict
constraintAtoms,
__global
mixed4*
restrict
constraintDistance,
__global
const
real4*
restrict
atomPositions,
__global
const
real4*
restrict
posCorrection
)
{
__kernel
void
computeConstraintDirections
(
__global
const
int2*
restrict
constraintAtoms,
__global
mixed4*
restrict
constraintDistance,
__global
const
real4*
restrict
atomPositions,
__global
const
real4*
restrict
posCorrection,
__global
int*
restrict
converged
)
{
for
(
int
index
=
get_global_id
(
0
)
; index < NUM_CONSTRAINTS; index += get_global_size(0)) {
//
Compute
the
direction
for
this
constraint.
...
...
@@ -23,6 +24,10 @@ __kernel void computeConstraintDirections(__global const int2* restrict constrai
dir.z
=
oldPos1.z-oldPos2.z
;
constraintDistance[index]
=
dir
;
}
if
(
get_global_id
(
0
)
==
0
)
{
converged[0]
=
1
;
converged[1]
=
0
;
}
}
/**
...
...
platforms/opencl/src/kernels/coulombLennardJones.cl
View file @
93c467b2
#
if
USE_EWALD
bool
needCorrection
=
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
bool
needCorrection
=
hasExclusions
&&
isExcluded
&&
atom1
!=
atom2
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
;
if
(
!isExcluded
|
| needCorrection) {
real tempForce = 0;
if (r2 < CUTOFF_SQUARED |
|
needCorrection
)
{
const
real
alphaR
=
EWALD_ALPHA*r
;
const
real
expAlphaRSqr
=
EXP
(
-alphaR*alphaR
)
;
...
...
@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
t
*=
t
;
t
*=
t
;
const
real
erfcAlphaR
=
RECIP
(
t*t
)
;
real
tempForce
=
0
;
if
(
needCorrection
)
{
//
Subtract
off
the
part
of
this
interaction
that
was
included
in
the
reciprocal
space
contribution.
...
...
@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
tempEnergy
+=
prefactor*erfcAlphaR
;
#
endif
}
dEdR
+=
tempForce*invR*invR
;
}
dEdR
+=
tempForce*invR*invR
;
}
#
else
{
...
...
platforms/opencl/src/kernels/customGBEnergyN2.cl
0 → 100644
View file @
93c467b2
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
define
STORE_DERIVATIVE_1
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
deriv##INDEX##_1*0x100000000
))
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
local_deriv##INDEX[get_local_id
(
0
)
]*0x100000000
))
;
#
else
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset]
+=
deriv##INDEX##_1
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset]
+=
local_deriv##INDEX[get_local_id
(
0
)
]
;
#
endif
/**
*
Compute
a
force
based
on
pair
interactions.
*/
__kernel
void
computeN2Energy
(
#
ifdef
SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
forceBuffers,
#
else
__global
real4*
restrict
forceBuffers,
#
endif
__global
real*
restrict
energyBuffer,
__local
real4*
restrict
local_force,
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
ushort2*
exclusionTiles,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
int*
restrict
interactingAtoms
#
else
unsigned
int
numTiles
#
endif
PARAMETER_ARGUMENTS
)
{
const
unsigned
int
totalWarps
=
get_global_size
(
0
)
/TILE_SIZE
;
const
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
const
unsigned
int
tgx
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
const
unsigned
int
tbx
=
get_local_id
(
0
)
-
tgx
;
real
energy
=
0
;
//
First
loop:
process
tiles
that
contain
exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+warp*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/totalWarps
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
warp+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/totalWarps
;
for
(
int
pos
=
firstExclusionTile
; pos < lastExclusionTile; pos++) {
const
ushort2
tileIndices
=
exclusionTiles[pos]
;
const
unsigned
int
x
=
tileIndices.x
;
const
unsigned
int
y
=
tileIndices.y
;
real4
force
=
0
;
DECLARE_ATOM1_DERIVATIVES
unsigned
int
atom1
=
x*TILE_SIZE
+
tgx
;
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[pos*TILE_SIZE+tgx]
;
#
endif
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
local_posq[localAtomIndex]
=
posq1
;
LOAD_LOCAL_PARAMETERS_FROM_1
SYNC_WARPS
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
int
atom2
=
tbx+j
;
real4
posq2
=
local_posq[atom2]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
COMPUTE_INTERACTION
dEdR
/=
-r
;
}
energy
+=
0.5f*tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
#
ifdef
USE_CUTOFF
}
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
SYNC_WARPS
;
}
}
else
{
//
This
is
an
off-diagonal
tile.
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
unsigned
int
j
=
y*TILE_SIZE
+
tgx
;
local_posq[localAtomIndex]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex]
=
0
;
CLEAR_LOCAL_DERIVATIVES
SYNC_WARPS
;
#
ifdef
USE_EXCLUSIONS
excl
=
(
excl
>>
tgx
)
| (excl << (TILE_SIZE - tgx));
#endif
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = local_posq[atom2];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
real dEdR = 0;
real tempEnergy = 0;
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
atom2 = tbx+tj;
local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
SYNC_WARPS;
}
}
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
STORE_DERIVATIVES_1
if (x != y) {
offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
STORE_DERIVATIVES_2
}
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset = offset1;
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
if (x != y) {
offset = offset2;
forceBuffers[offset2] += (real4) (local_force[get_local_id(0)].x, local_force[get_local_id(0)].y, local_force[get_local_id(0)].z, 0.0f);
STORE_DERIVATIVES_2
}
#endif
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
int pos = warp*numTiles/totalWarps;
int end = (warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE];
__local int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[get_local_id(0)] = -1;
while (pos < end) {
const bool isExcluded = false;
real4 force = 0;
DECLARE_ATOM1_DERIVATIVES
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y
+=
(
x
<
y
?
-1
:
1
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
//
Skip
over
tiles
that
have
exclusions,
since
they
were
already
processed.
SYNC_WARPS
;
while
(
skipTiles[tbx+TILE_SIZE-1]
<
pos
)
{
SYNC_WARPS
;
if
(
skipBase+tgx
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles[skipBase+tgx]
;
skipTiles[get_local_id
(
0
)
]
=
tile.x
+
tile.y*NUM_BLOCKS
-
tile.y*
(
tile.y+1
)
/2
;
}
else
skipTiles[get_local_id
(
0
)
]
=
end
;
skipBase
+=
TILE_SIZE
;
currentSkipIndex
=
tbx
;
SYNC_WARPS
;
}
while
(
skipTiles[currentSkipIndex]
<
pos
)
currentSkipIndex++
;
includeTile
=
(
skipTiles[currentSkipIndex]
!=
pos
)
;
}
if
(
includeTile
)
{
unsigned
int
atom1
=
x*TILE_SIZE
+
tgx
;
//
Load
atom
data
for
this
tile.
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
const
unsigned
int
localAtomIndex
=
get_local_id
(
0
)
;
#
ifdef
USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms[pos*TILE_SIZE+tgx]
:
y*TILE_SIZE
+
tgx
)
;
#
else
unsigned
int
j
=
y*TILE_SIZE
+
tgx
;
#
endif
atomIndices[get_local_id
(
0
)
]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
local_posq[localAtomIndex]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex]
=
0
;
CLEAR_LOCAL_DERIVATIVES
}
SYNC_WARPS
;
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
real4
blockCenterX
=
blockCenter[x]
;
posq1.xyz
-=
floor
((
posq1.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
local_posq[get_local_id
(
0
)
].x
-=
floor
((
local_posq[get_local_id
(
0
)
].x-blockCenterX.x
)
*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
local_posq[get_local_id
(
0
)
].y
-=
floor
((
local_posq[get_local_id
(
0
)
].y-blockCenterX.y
)
*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
local_posq[get_local_id
(
0
)
].z
-=
floor
((
local_posq[get_local_id
(
0
)
].z-blockCenterX.z
)
*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
SYNC_WARPS
;
unsigned
int
tj
=
tgx
;
for
(
j
=
0
; j < TILE_SIZE; j++) {
int
atom2
=
tbx+tj
;
real4
posq2
=
local_posq[atom2]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices[tbx+tj]
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-r
;
}
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
atom2
=
tbx+tj
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
SYNC_WARPS
;
}
}
else
#
endif
{
//
We
need
to
apply
periodic
boundary
conditions
separately
for
each
interaction.
unsigned
int
tj
=
tgx
;
for
(
j
=
0
; j < TILE_SIZE; j++) {
int
atom2
=
tbx+tj
;
real4
posq2
=
local_posq[atom2]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices[tbx+tj]
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-r
;
}
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
atom2
=
tbx+tj
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
#
ifdef
USE_CUTOFF
}
#
endif
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
)
;
SYNC_WARPS
;
}
}
//
Write
results.
#
ifdef
USE_CUTOFF
unsigned
int
atom2
=
atomIndices[get_local_id
(
0
)
]
;
#
else
unsigned
int
atom2
=
y*TILE_SIZE
+
tgx
;
#
endif
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom1],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
unsigned
int
offset
=
atom1
;
STORE_DERIVATIVES_1
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
atom_add
(
&forceBuffers[atom2],
(
long
)
(
local_force[get_local_id
(
0
)
].x*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+PADDED_NUM_ATOMS],
(
long
)
(
local_force[get_local_id
(
0
)
].y*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+2*PADDED_NUM_ATOMS],
(
long
)
(
local_force[get_local_id
(
0
)
].z*0x100000000
))
;
offset
=
atom2
;
STORE_DERIVATIVES_2
}
#
else
unsigned
int
offset1
=
atom1
+
warp*PADDED_NUM_ATOMS
;
unsigned
int
offset2
=
atom2
+
warp*PADDED_NUM_ATOMS
;
forceBuffers[offset1].xyz
+=
force.xyz
;
unsigned
int
offset
=
offset1
;
STORE_DERIVATIVES_1
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
forceBuffers[offset2]
+=
(
real4
)
(
local_force[get_local_id
(
0
)
].x,
local_force[get_local_id
(
0
)
].y,
local_force[get_local_id
(
0
)
].z,
0.0f
)
;
offset
=
offset2
;
STORE_DERIVATIVES_2
}
#
endif
}
pos++
;
}
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
}
platforms/opencl/src/kernels/customGBEnergyN2_cpu.cl
View file @
93c467b2
#
define
TILE_SIZE
32
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset1]
+=
deriv##INDEX##_1
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset2]
+=
local_deriv##INDEX[tgx]
;
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
define
STORE_DERIVATIVE_1
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
deriv##INDEX##_1*0x100000000
))
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
atom_add
(
&derivBuffers[offset+
(
INDEX-1
)
*PADDED_NUM_ATOMS],
(
long
)
(
local_deriv##INDEX[tgx]*0x100000000
))
;
#
else
#
define
STORE_DERIVATIVE_1
(
INDEX
)
derivBuffers##INDEX[offset]
+=
deriv##INDEX##_1
;
#
define
STORE_DERIVATIVE_2
(
INDEX
)
derivBuffers##INDEX[offset]
+=
local_deriv##INDEX[tgx]
;
#
endif
/**
*
Compute
a
force
based
on
pair
interactions.
*/
__kernel
void
computeN2Energy
(
__global
real4*
restrict
forceBuffers,
__global
real*
restrict
energyBuffer,
__local
real4*
restrict
local_force,
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__local
real4*
restrict
tempBuffer,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
__kernel
void
computeN2Energy
(
#
ifdef
SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
forceBuffers,
#
else
unsigned
int
numTiles
__global
real4*
restrict
forceBuffers,
#
endif
PARAMETER_ARGUMENTS
)
{
__global
real*
restrict
energyBuffer,
__local
real4*
restrict
local_force,
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
ushort2*
exclusionTiles,
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
int*
restrict
interactingAtoms
#
else
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
numTiles
#
endif
PARAMETER_ARGUMENTS
)
{
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x,
y
;
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
//
First
loop:
process
tiles
that
contain
exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+get_group_id
(
0
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
get_group_id
(
0
)
+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
for
(
int
pos
=
firstExclusionTile
; pos < lastExclusionTile; pos++) {
const
ushort2
tileIndices
=
exclusionTiles[pos]
;
const
unsigned
int
x
=
tileIndices.x
;
const
unsigned
int
y
=
tileIndices.y
;
// Lo
cate the exclusion
data for this tile.
//
Lo
ad
the
data
for
this
tile.
#ifdef USE_EXCLUSIONS
unsigned int exclusionStart = exclusionRowIndices[x];
unsigned int exclusionEnd = exclusionRowIndices[x+1];
int exclusionIndex = -1;
for (int i = exclusionStart; i < exclusionEnd; i++)
if (exclusionIndices[i] == y) {
exclusionIndex = i*TILE_SIZE;
break;
}
bool hasExclusions = (exclusionIndex > -1);
#else
bool hasExclusions = false;
#endif
// Load the data for this tile if we don't already have it cached.
if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned
int
j
=
y*TILE_SIZE
+
localAtomIndex
;
local_posq[localAtomIndex]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
#
ifdef
USE_EXCLUSIONS
unsigned int excl = exclusions[
exclusionIndex
+tgx];
unsigned
int
excl
=
exclusions[
pos*TILE_SIZE
+tgx]
;
#
endif
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real4
force
=
0
;
...
...
@@ -84,9 +56,6 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
...
...
@@ -96,20 +65,23 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
)
{
COMPUTE_INTERACTION
dEdR
/=
-r
;
}
energy
+=
0.5f*tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
#
ifdef
USE_CUTOFF
}
#
endif
...
...
@@ -118,11 +90,19 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
#
endif
}
// Write results
//
Write
results
.
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
#
ifdef
SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
atom1
;
atom_add
(
&forceBuffers[offset],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[offset+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[offset+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
STORE_DERIVATIVES_1
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
STORE_DERIVATIVES_1
#
endif
}
}
else
{
...
...
@@ -132,60 +112,212 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
local_force[localAtomIndex]
=
0
;
CLEAR_LOCAL_DERIVATIVES
}
#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
if (!hasExclusions && (flags1 != 0xFFFFFFFF |
|
flags2
!=
0xFFFFFFFF
)
)
{
//
Compute
only
a
subset
of
the
interactions
in
this
tile.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
if
((
flags2&
(
1<<tgx
))
!=
0
)
{
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real
value
=
0
;
DECLARE_ATOM1_DERIVATIVES
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
if
((
flags&
(
1<<j
))
!=
0
)
{
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[pos*TILE_SIZE+tgx]
;
#
endif
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real4
force
=
0
;
DECLARE_ATOM1_DERIVATIVES
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_INTERACTION
dEdR
/=
-r
;
}
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
atom2
=
j
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
}
}
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
|
| atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
atom2 = j;
local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = atom1;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
STORE_DERIVATIVES_1
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
STORE_DERIVATIVES_1
#endif
}
// Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE+tgx;
atom_add(&forceBuffers[offset], (long) (local_force[tgx].x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[tgx].y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[tgx].z*0x100000000));
STORE_DERIVATIVES_2
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += local_force[tgx].xyz;
STORE_DERIVATIVES_2
#endif
}
}
}
//
Write
results
for
atom1.
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
value
;
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
int pos = get_group_id(0)*numTiles/get_num_groups(0);
int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
int nextToSkip = -1;
int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE];
while (pos < end) {
const bool isExcluded = false;
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y
+=
(
x
<
y
?
-1
:
1
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
//
Skip
over
tiles
that
have
exclusions,
since
they
were
already
processed.
while
(
nextToSkip
<
pos
)
{
if
(
currentSkipIndex
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles[currentSkipIndex++]
;
nextToSkip
=
tile.x
+
tile.y*NUM_BLOCKS
-
tile.y*
(
tile.y+1
)
/2
;
}
else
nextToSkip
=
end
;
}
includeTile
=
(
nextToSkip
!=
pos
)
;
}
if
(
includeTile
)
{
//
Load
the
data
for
this
tile.
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#
ifdef
USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms[pos*TILE_SIZE+localAtomIndex]
:
y*TILE_SIZE+localAtomIndex
)
;
#
else
unsigned
int
j
=
y*TILE_SIZE+localAtomIndex
;
#
endif
atomIndices[localAtomIndex]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
local_posq[localAtomIndex]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_force[localAtomIndex]
=
0
;
CLEAR_LOCAL_DERIVATIVES
}
}
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
real4
blockCenterX
=
blockCenter[x]
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++)
local_posq[tgx].xyz
-=
floor
((
local_posq[tgx].xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real4
force
=
0
;
DECLARE_ATOM1_DERIVATIVES
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
if
(
atom1
<
NUM_ATOMS
&&
atomIndices[j]
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices[j]
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
COMPUTE_INTERACTION
dEdR
/=
-r
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
atom2
=
j
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
}
}
//
Write
results
for
atom1.
#
ifdef
SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
atom1
;
atom_add
(
&forceBuffers[offset],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[offset+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[offset+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
STORE_DERIVATIVES_1
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
STORE_DERIVATIVES_1
#
endif
}
}
else
#
endif
{
//
Compute
the
full
set
of
interactions
in
this
tile
.
//
We
need
to
apply
periodic
boundary
conditions
separately
for
each
interaction
.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
...
...
@@ -193,13 +325,7 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
DECLARE_ATOM1_DERIVATIVES
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionIndex+tgx]
:
0xFFFFFFFF
)
;
#
endif
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
...
...
@@ -207,50 +333,67 @@ __kernel void computeN2Energy(__global real4* restrict forceBuffers, __global re
#
endif
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
if
(
atom1
<
NUM_ATOMS
&&
atomIndices[j]
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
atomIndices[j]
<
NUM_ATOMS
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices[j]
;
real
dEdR
=
0
;
real
tempEnergy
=
0
;
COMPUTE_INTERACTION
dEdR
/=
-r
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
atom2
=
j
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
}
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
atom2
=
j
;
local_force[atom2].xyz
+=
delta.xyz
;
RECORD_DERIVATIVE_2
#
ifdef
USE_CUTOFF
}
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
}
//
Write
results
for
atom1.
unsigned
int
offset1
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset1].xyz
+=
force.xyz
;
#
ifdef
SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
atom1
;
atom_add
(
&forceBuffers[offset],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[offset+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[offset+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
STORE_DERIVATIVES_1
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
force.xyz
;
STORE_DERIVATIVES_1
#
endif
}
}
//
Write
results
//
Write
results
.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
offset2
=
y*TILE_SIZE+tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset2].xyz
+=
local_force[tgx].xyz
;
STORE_DERIVATIVES_2
#
ifdef
USE_CUTOFF
unsigned
int
atom2
=
atomIndices[tgx]
;
#
else
unsigned
int
atom2
=
y*TILE_SIZE
+
tgx
;
#
endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom2],
(
long
)
(
local_force[tgx].x*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+PADDED_NUM_ATOMS],
(
long
)
(
local_force[tgx].y*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+2*PADDED_NUM_ATOMS],
(
long
)
(
local_force[tgx].z*0x100000000
))
;
unsigned
int
offset
=
atom2
;
STORE_DERIVATIVES_2
#
else
unsigned
int
offset
=
atom2
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
+=
local_force[tgx].xyz
;
STORE_DERIVATIVES_2
#
endif
}
}
}
lasty
=
y
;
pos++
;
}
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
...
...
platforms/opencl/src/kernels/customGBEnergyN2_default.cl
deleted
100644 → 0
View file @
f6d4557d
// Number of atoms along each side of an interaction tile.
#define TILE_SIZE 32

// STORE_DERIVATIVE_1 / STORE_DERIVATIVE_2 add the accumulated derivative number INDEX
// to the output buffers. They expand inside the kernel and rely on the local variables
// offset1 / offset2 being in scope at the point of use.
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
// Atomic path: values are scaled by 0x100000000 (2^32), converted to long, and added
// atomically into a single derivBuffers array, one PADDED_NUM_ATOMS-sized slab per INDEX.
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset1+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset2+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
// NOTE(review): unlike the non-atomic variants below, these do not add the
// [get_local_id(0)+TILE_SIZE] partner entry - confirm that this is intended.
#else
// Non-atomic path: one derivBuffers<INDEX> array per derivative; each store also folds in
// the entry at [get_local_id(0)+TILE_SIZE] of the corresponding temp/local buffer.
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1+tempDerivBuffer##INDEX[get_local_id(0)+TILE_SIZE];
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[get_local_id(0)]+local_deriv##INDEX[get_local_id(0)+TILE_SIZE];
#endif
/**
 * Compute a force based on pair interactions.
 *
 * Each work group walks a contiguous range [pos, end) of tiles. A tile pairs the
 * TILE_SIZE atoms of block x ("atom1" side) with the TILE_SIZE atoms of block y
 * ("atom2" side). Work items with local id < TILE_SIZE handle the first half of the
 * atom2 range and the remaining work items handle the second half (see baseLocalAtom);
 * the two partial results are combined through tempForceBuffer / local_force before
 * being written out.
 * NOTE(review): the indexing below (get_local_id(0)+TILE_SIZE) presumes that
 * WORK_GROUP_SIZE is 2*TILE_SIZE - confirm against the host code.
 *
 * Forces are accumulated into forceBuffers (as 2^32-scaled 64-bit integers via
 * atom_add when SUPPORTS_64_BIT_ATOMICS is defined, otherwise as real4 values in a
 * per-block or per-work-group slab), and the energy accumulated by this work item is
 * added to energyBuffer[get_global_id(0)].
 *
 * The upper-case identifiers that are not defined in this file (PARAMETER_ARGUMENTS,
 * COMPUTE_INTERACTION, LOAD_*, DECLARE_*, RECORD_DERIVATIVE_2, STORE_DERIVATIVES_*,
 * SET_TEMP_BUFFERS, CLEAR_LOCAL_DERIVATIVES, NUM_ATOMS, NUM_BLOCKS, ...) are expected
 * to be supplied when the kernel source is compiled.
 */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers,
#else
        __global real4* restrict forceBuffers,
#endif
        __global real* restrict energyBuffer, __local real4* restrict local_force,
        __global const real4* restrict posq, __local real4* restrict local_posq,
        __global const unsigned int* restrict exclusions, __global const unsigned int* restrict exclusionIndices,
        __global const unsigned int* restrict exclusionRowIndices, __local real4* restrict tempForceBuffer,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
    // Split the list of tiles evenly between the work groups.
#ifdef USE_CUTOFF
    // If the neighbor list overflowed (numTiles > maxTiles), fall back to enumerating
    // all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles.
    unsigned int numTiles = interactionCount[0];
    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
    real energy = 0;
    unsigned int lasty = 0xFFFFFFFF; // y block whose data was loaded into local_posq last; no block yet
    __local unsigned int exclusionRange[2];
    __local int exclusionIndex[1];
    DECLARE_TEMP_BUFFERS

    while (pos < end) {
        // Extract the coordinates of this tile
        unsigned int x, y;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            y = tileIndices.y;
        }
        else
#endif
        {
            // Convert the linear tile index into (x, y) coordinates of the triangular tile matrix.
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }
        }
        // First atom2 slot handled by this work item: 0 for the first TILE_SIZE work items,
        // TILE_SIZE/2 for the rest.
        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
        unsigned int forceBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 force = 0;
        DECLARE_ATOM1_DERIVATIVES
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS

        // Locate the exclusion data for this tile.

#ifdef USE_EXCLUSIONS
        if (get_local_id(0) < 2)
            exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
        if (tgx == 0)
            exclusionIndex[0] = -1;
        barrier(CLK_LOCAL_MEM_FENCE);
        // Search row x of the exclusion index list for column y; -1 is kept if it is absent.
        for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE)
            if (exclusionIndices[i] == y)
                exclusionIndex[0] = i*TILE_SIZE;
        barrier(CLK_LOCAL_MEM_FENCE);
        bool hasExclusions = (exclusionIndex[0] > -1);
#endif
        if (x == y) {
            // This tile is on the diagonal.

            const unsigned int localAtomIndex = get_local_id(0);
            local_posq[localAtomIndex] = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
            barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
            unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
#endif
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
                bool isExcluded = !(excl & 0x1);
#endif
                int atom2 = baseLocalAtom+j;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+baseLocalAtom+j;
                    real dEdR = 0;
                    real tempEnergy = 0;
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }
                    // Each pair in a diagonal tile is visited from both sides, hence the factor 0.5.
                    energy += 0.5f*tempEnergy;
                    delta.xyz *= dEdR;
                    force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
                }
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
            }

            // Sum the forces and write results.

            if (get_local_id(0) >= TILE_SIZE) {
                tempForceBuffer[get_local_id(0)] = force;
                SET_TEMP_BUFFERS
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
                const unsigned int offset1 = x*TILE_SIZE + tgx;
                atom_add(&forceBuffers[offset1], (long) ((force.x + tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0x100000000));
                atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0x100000000));
                atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                const unsigned int offset1 = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
                forceBuffers[offset1].xyz += force.xyz + tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
#endif
                STORE_DERIVATIVES_1
            }
        }
        else {
            // This is an off-diagonal tile.

            const unsigned int localAtomIndex = get_local_id(0);
            // Reload the atom2 data only when the y block differs from the previous tile's.
            if (lasty != y && get_local_id(0) < TILE_SIZE) {
                unsigned int j = y*TILE_SIZE + tgx;
                local_posq[localAtomIndex] = posq[j];
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            }
            local_force[localAtomIndex] = 0;
            CLEAR_LOCAL_DERIVATIVES
            barrier(CLK_LOCAL_MEM_FENCE);

            // Compute the full set of interactions in this tile.

#ifdef USE_EXCLUSIONS
            unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
            // Keep the 16 flags belonging to this work item's half of the tile, duplicate them
            // into both halves of the word, then rotate by tgx so that bit 0 matches the
            // starting value of tj below.
            excl = (excl >> baseLocalAtom) & 0xFFFF;
            excl += excl << 16;
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
            // Each work item starts at a different atom2 slot and advances cyclically.
            unsigned int tj = tgx%(TILE_SIZE/2);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
                bool isExcluded = !(excl & 0x1);
#endif
                int atom2 = baseLocalAtom+tj;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+baseLocalAtom+tj;
                    real dEdR = 0;
                    real tempEnergy = 0;
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                        COMPUTE_INTERACTION
                        dEdR /= -r;
                    }
                    energy += tempEnergy;
                    delta.xyz *= dEdR;
                    force.xyz -= delta.xyz;
                    // atom2 now indexes the local accumulation slot for this interaction.
                    atom2 = baseLocalAtom+tj+forceBufferOffset;
                    local_force[baseLocalAtom+tj+forceBufferOffset].xyz += delta.xyz;
                    RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                }
#endif
                barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
                tj = (tj+1)%(TILE_SIZE/2);
            }

            // Sum the forces and write results.

            if (get_local_id(0) >= TILE_SIZE) {
                tempForceBuffer[get_local_id(0)] = force;
                SET_TEMP_BUFFERS
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
                const unsigned int offset1 = x*TILE_SIZE + tgx;
                const unsigned int offset2 = y*TILE_SIZE + tgx;
                atom_add(&forceBuffers[offset1], (long) ((force.x+tempForceBuffer[get_local_id(0)+TILE_SIZE].x)*0x100000000));
                atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y+tempForceBuffer[get_local_id(0)+TILE_SIZE].y)*0x100000000));
                atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z+tempForceBuffer[get_local_id(0)+TILE_SIZE].z)*0x100000000));
                atom_add(&forceBuffers[offset2], (long) ((local_force[get_local_id(0)].x+local_force[get_local_id(0)+TILE_SIZE].x)*0x100000000));
                atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].y+local_force[get_local_id(0)+TILE_SIZE].y)*0x100000000));
                atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((local_force[get_local_id(0)].z+local_force[get_local_id(0)+TILE_SIZE].z)*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
                forceBuffers[offset1].xyz += force.xyz+tempForceBuffer[get_local_id(0)+TILE_SIZE].xyz;
                forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TILE_SIZE].xyz;
#endif
                STORE_DERIVATIVES_1
                STORE_DERIVATIVES_2
            }
        }
        lasty = y;
        pos++;
    }
    energyBuffer[get_global_id(0)] += energy;
}
platforms/opencl/src/kernels/customGBEnergyN2_nvidia.cl
deleted
100644 → 0
View file @
f6d4557d
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

// STORE_DERIVATIVE_1(INDEX) adds the per-work-item derivative deriv<INDEX>_1 to
// the derivative buffer; STORE_DERIVATIVE_2(INDEX) adds this work-item's entry
// of the local array local_deriv<INDEX>.  Both expect a variable named `offset`
// to be in scope where they are expanded.
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
// With 64 bit atomics every derivative goes into the single buffer derivBuffers,
// one PADDED_NUM_ATOMS sized section per INDEX.  The value is multiplied by
// 0x100000000 (2^32) and cast to long before being added with atom_add.
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0x100000000));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0x100000000));
#else
// Without 64 bit atomics each INDEX has its own buffer derivBuffers<INDEX>, which
// is updated with a plain (non-atomic) addition.
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
#endif
// Number of atoms along each side of a tile, and the number of consecutive
// work-items the kernel treats as one "warp".
#define TILE_SIZE 32
/**
 * Compute a force based on pair interactions.
 *
 * Work is divided between "warps" of TILE_SIZE consecutive work-items.  Each warp
 * processes the tiles in its own range [pos, end); a tile (x, y) pairs the
 * TILE_SIZE atoms of block x with the TILE_SIZE atoms of block y.  Every work-item
 * owns one atom of block x ("atom1") and accumulates its force in a private
 * variable, while the forces on the atoms of block y are accumulated in local_force.
 *
 * Results are added to forceBuffers.  With SUPPORTS_64_BIT_ATOMICS the buffer holds
 * longs (x, y and z components in three consecutive sections of PADDED_NUM_ATOMS
 * entries) and is updated with atom_add of the value multiplied by 2^32.  Otherwise
 * it holds real4 values, each work-group writes to its own section of
 * PADDED_NUM_ATOMS entries, and the warps of a work-group take turns writing so that
 * no two of them update the same block at the same time.
 *
 * The upper-case names that are not defined in this file (PARAMETER_ARGUMENTS,
 * LOAD_ATOM1_PARAMETERS, COMPUTE_INTERACTION, STORE_DERIVATIVES_1, NUM_BLOCKS,
 * WARPS_PER_GROUP, ...) must be supplied by whoever builds this kernel source.
 */
__kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers,
#else
        __global real4* restrict forceBuffers,
#endif
        __global real* restrict energyBuffer,
        __local real4* restrict local_force,
        __global const real4* restrict posq,
        __local real4* restrict local_posq,
        __global const unsigned int* restrict exclusions,
        __global const unsigned int* restrict exclusionIndices,
        __global const unsigned int* restrict exclusionRowIndices,
        __local real4* restrict tempBuffer,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles,
        __global const unsigned int* restrict interactionCount,
        real4 periodicBoxSize,
        real4 invPeriodicBoxSize,
        unsigned int maxTiles,
        __global const unsigned int* restrict interactionFlags
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF
    // If the neighbor list overflowed (numTiles > maxTiles), fall back to
    // enumerating all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles.
    unsigned int numTiles = interactionCount[0];
    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
    unsigned int pos = warp*numTiles/totalWarps;
    unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
    real energy = 0;
    unsigned int lasty = 0xFFFFFFFF;
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];
    // reservedBlocks reuses the storage of exclusionRange; it is only used in the
    // write phase of the non-atomic code path below.
    __local int2* reservedBlocks = (__local int2*) exclusionRange;

    // This is a do/while loop, so the body (including the write phase) runs at
    // least once even for a warp whose tile range is empty.
    // NOTE(review): presumably required so that every warp of the work-group
    // reaches the barrier in the write phase -- confirm.
    do {
        // Extract the coordinates of this tile
        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
        const unsigned int tbx = get_local_id(0) - tgx;
        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
        real4 force = 0;
        DECLARE_ATOM1_DERIVATIVES
        if (pos < end) {
#ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                ushort2 tileIndices = tiles[pos];
                x = tileIndices.x;
                y = tileIndices.y;
            }
            else
#endif
            {
                // Convert the linear tile index into (x, y) block coordinates.
                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                    y += (x < y ? -1 : 1);
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
            unsigned int atom1 = x*TILE_SIZE + tgx;
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS

            // Locate the exclusion data for this tile.

#ifdef USE_EXCLUSIONS
            // The first two work-items of the warp load the start and end of row x;
            // the warp then searches that row for column y, TILE_SIZE entries at a time.
            if (tgx < 2)
                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
            if (tgx == 0)
                exclusionIndex[localGroupIndex] = -1;
            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
                if (exclusionIndices[i] == y)
                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
            bool hasExclusions = false;
#endif
            if (pos >= end)
                ; // This warp is done.
            else if (x == y) {
                // This tile is on the diagonal.

                const unsigned int localAtomIndex = get_local_id(0);
                local_posq[localAtomIndex] = posq1;
                LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
                    // Bit 0 of excl refers to atom j; a cleared bit marks the pair as excluded.
                    bool isExcluded = !(excl & 0x1);
#endif
                    int atom2 = tbx+j;
                    real4 posq2 = local_posq[atom2];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+j;
                        real dEdR = 0;
                        real tempEnergy = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
                            COMPUTE_INTERACTION
                            dEdR /= -r;
                        }
                        // Each pair of a diagonal tile is visited from both of its
                        // atoms, so only half of the energy is added per visit.
                        energy += 0.5f*tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
                    }
#endif
#ifdef USE_EXCLUSIONS
                    excl >>= 1;
#endif
                }
            }
            else {
                // This is an off-diagonal tile.

                const unsigned int localAtomIndex = get_local_id(0);
                if (lasty != y) {
                    // The data for block y is reloaded only when y differs from
                    // the previous tile's y.
                    unsigned int j = y*TILE_SIZE + tgx;
                    local_posq[localAtomIndex] = posq[j];
                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                }
                local_force[localAtomIndex] = 0;
                CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
                if (!hasExclusions && flags == 0) {
                    // No interactions in this tile.
                }
                else
#endif
                {
                    // Compute the full set of interactions in this tile.

#ifdef USE_EXCLUSIONS
                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
                    // Rotate the flags right by tgx bits so that bit 0 corresponds to
                    // the first atom this work-item visits (tj starts at tgx).
                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
                    // Each work-item starts at a different atom of block y and walks
                    // around the tile, so that at any step the work-items of a warp
                    // address different local_force entries.
                    unsigned int tj = tgx;
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
                        bool isExcluded = !(excl & 0x1);
#endif
                        int atom2 = tbx+tj;
                        real4 posq2 = local_posq[atom2];
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                        if (r2 < CUTOFF_SQUARED) {
#endif
                            real invR = RSQRT(r2);
                            real r = RECIP(invR);
                            LOAD_ATOM2_PARAMETERS
                            atom2 = y*TILE_SIZE+tj;
                            real dEdR = 0;
                            real tempEnergy = 0;
                            if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                                COMPUTE_INTERACTION
                                dEdR /= -r;
                            }
                            energy += tempEnergy;
                            delta.xyz *= dEdR;
                            force.xyz -= delta.xyz;
                            atom2 = tbx+tj;
                            local_force[atom2].xyz += delta.xyz;
                            RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
                        }
#endif
#ifdef USE_EXCLUSIONS
                        excl >>= 1;
#endif
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
            }
        }
        lasty = y;

        // Write results.  We need to coordinate between warps to make sure no two of them
        // ever try to write to the same piece of memory at the same time.

#ifdef SUPPORTS_64_BIT_ATOMICS
        if (pos < end) {
            const unsigned int offset = x*TILE_SIZE + tgx;
            atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
            STORE_DERIVATIVES_1
        }
        if (pos < end && x != y) {
            const unsigned int offset = y*TILE_SIZE + tgx;
            atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0x100000000));
            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0x100000000));
            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0x100000000));
            STORE_DERIVATIVES_2
        }
#else
        // Each warp publishes the blocks it wants to write (-1 means "nothing").
        int writeX = (pos < end ? x : -1);
        int writeY = (pos < end && x != y ? y : -1);
        if (tgx == 0)
            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
        bool done = false;
        int doneIndex = 0;
        int checkIndex = 0;
        while (true) {
            // See if any warp still needs to write its data.

            bool allDone = true;
            barrier(CLK_LOCAL_MEM_FENCE);
            while (doneIndex < WARPS_PER_GROUP && allDone) {
                if (reservedBlocks[doneIndex].x != -1)
                    allDone = false;
                else
                    doneIndex++;
            }
            if (allDone)
                break;
            if (!done) {
                // See whether this warp can write its data.  This requires that no previous warp
                // is trying to write to the same block of the buffer.

                bool canWrite = (writeX != -1);
                while (checkIndex < localGroupIndex && canWrite) {
                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
                        canWrite = false;
                    else
                        checkIndex++;
                }
                if (canWrite) {
                    // Write the data to global memory, then mark this warp as done.

                    if (writeX > -1) {
                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                        forceBuffers[offset].xyz += force.xyz;
                        STORE_DERIVATIVES_1
                    }
                    if (writeY > -1) {
                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                        forceBuffers[offset].xyz += local_force[get_local_id(0)].xyz;
                        STORE_DERIVATIVES_2
                    }
                    done = true;
                    if (tgx == 0)
                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
                }
            }
        }
#endif
        pos++;
    } while (pos < end);
    energyBuffer[get_global_id(0)] += energy;
}
platforms/opencl/src/kernels/customGBValueN2.cl
0 → 100644
View file @
93c467b2
// atom_add on 64 bit integers is provided by the cl_khr_int64_base_atomics
// extension, which is only requested when the build defines SUPPORTS_64_BIT_ATOMICS.
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif

/**
 * Compute a value based on pair interactions.
 *
 * Work is divided between "warps" of TILE_SIZE consecutive work-items.  A tile
 * (x, y) pairs the TILE_SIZE atoms of block x with the TILE_SIZE atoms of block y.
 * Every work-item owns one atom of block x ("atom1") and accumulates its result in
 * a private variable, while the results for the second set of atoms are accumulated
 * in local_value.
 *
 * The kernel makes two passes:
 *  1. the tiles listed in exclusionTiles, using the exclusion flags in exclusions;
 *  2. all remaining tiles, taken either from the neighbor list (tiles and
 *     interactingAtoms, when USE_CUTOFF is defined and the list did not overflow)
 *     or by enumerating every tile and skipping those handled in the first pass.
 *
 * Results are added to global_value.  With SUPPORTS_64_BIT_ATOMICS the buffer holds
 * longs and is updated with atom_add of the value multiplied by 2^32.  Otherwise it
 * holds reals and each warp adds to its own section of PADDED_NUM_ATOMS entries.
 *
 * The upper-case names that are not defined in this file (TILE_SIZE,
 * PARAMETER_ARGUMENTS, LOAD_ATOM1_PARAMETERS, COMPUTE_VALUE, SYNC_WARPS,
 * FIRST_EXCLUSION_TILE, NUM_BLOCKS, ...) must be supplied by whoever builds this
 * kernel source.
 */
__kernel void computeN2Value(
        __global const real4* restrict posq,
        __local real4* restrict local_posq,
        __global const unsigned int* restrict exclusions,
        __global const ushort2* exclusionTiles,
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict global_value,
#else
        __global real* restrict global_value,
#endif
        __local real* restrict local_value,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles,
        __global const unsigned int* restrict interactionCount,
        real4 periodicBoxSize,
        real4 invPeriodicBoxSize,
        unsigned int maxTiles,
        __global const real4* restrict blockCenter,
        __global const int* restrict interactingAtoms
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    const unsigned int warp = get_global_id(0)/TILE_SIZE;
    const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index of this work-item within its warp
    const unsigned int tbx = get_local_id(0) - tgx;           // local index of the first work-item of the warp

    // First loop: process tiles that contain exclusions.

    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real value = 0;
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
        // One word of flags per atom of block x; a cleared bit marks an excluded pair.
        unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
        if (x == y) {
            // This tile is on the diagonal.

            const unsigned int localAtomIndex = get_local_id(0);
            local_posq[localAtomIndex] = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+j;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+j;
                    real tempValue1 = 0;
                    real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
                    if (!isExcluded && atom1 != atom2) {
#else
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
                        COMPUTE_VALUE
                    }
                    value += tempValue1;
#ifdef USE_CUTOFF
                }
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
                SYNC_WARPS;
            }
        }
        else {
            // This is an off-diagonal tile.

            const unsigned int localAtomIndex = get_local_id(0);
            unsigned int j = y*TILE_SIZE + tgx;
            local_posq[localAtomIndex] = posq[j];
            LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            local_value[localAtomIndex] = 0;
            SYNC_WARPS;
#ifdef USE_EXCLUSIONS
            // Rotate the flags right by tgx bits so that bit 0 corresponds to the
            // first atom this work-item visits (tj starts at tgx).
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
            // Each work-item starts at a different atom of block y and walks around
            // the tile, so that at any step the work-items of a warp address
            // different local_value entries.
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                int atom2 = tbx+tj;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+tj;
                    real tempValue1 = 0;
                    real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
                    bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
                    if (!isExcluded) {
#else
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                        COMPUTE_VALUE
                    }
                    value += tempValue1;
                    local_value[tbx+tj] += tempValue2;
#ifdef USE_CUTOFF
                }
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
                tj = (tj + 1) & (TILE_SIZE - 1);
                SYNC_WARPS;
            }
        }

        // Write results.

#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
        atom_add(&global_value[offset], (long) (value*0x100000000));
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
            atom_add(&global_value[offset], (long) (local_value[get_local_id(0)]*0x100000000));
        }
#else
        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        global_value[offset1] += value;
        if (x != y)
            global_value[offset2] += local_value[get_local_id(0)];
#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

#ifdef USE_CUTOFF
    // If the neighbor list overflowed (numTiles > maxTiles), fall back to
    // enumerating all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles.
    unsigned int numTiles = interactionCount[0];
    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
    int pos = warp*numTiles/totalWarps;
    int end = (warp+1)*numTiles/totalWarps;
#endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
    __local int skipTiles[FORCE_WORK_GROUP_SIZE];
    skipTiles[get_local_id(0)] = -1;

    while (pos < end) {
        real value = 0;
        bool includeTile = true;

        // Extract the coordinates of this tile.

        unsigned int x, y;
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            // Neighbor list entry: .x is the block index; .y is used as a flag that
            // selects the single-periodic-copy path below.  y itself is not needed
            // on this path because the second set of atoms comes from interactingAtoms.
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            singlePeriodicCopy = tileIndices.y;
        }
        else
#endif
        {
            // Convert the linear tile index into (x, y) block coordinates.
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }

            // Skip over tiles that have exclusions, since they were already processed.

            // Each warp keeps TILE_SIZE linear tile indices from exclusionTiles in
            // its section of skipTiles and loads the next batch once pos has moved
            // past the last one.  Entries beyond the end of the list are set to `end`.
            SYNC_WARPS;
            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
                SYNC_WARPS;
                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                    ushort2 tile = exclusionTiles[skipBase+tgx];
                    skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
                else
                    skipTiles[get_local_id(0)] = end;
                skipBase += TILE_SIZE;
                currentSkipIndex = tbx;
                SYNC_WARPS;
            }
            while (skipTiles[currentSkipIndex] < pos)
                currentSkipIndex++;
            includeTile = (skipTiles[currentSkipIndex] != pos);
        }
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

            // Load atom data for this tile.

            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS
            const unsigned int localAtomIndex = get_local_id(0);
#ifdef USE_CUTOFF
            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
            // Remember the global index of the atom stored in each local slot.
            atomIndices[get_local_id(0)] = j;
            if (j < PADDED_NUM_ATOMS) {
                local_posq[localAtomIndex] = posq[j];
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                local_value[localAtomIndex] = 0;
            }
            SYNC_WARPS;
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
                local_posq[get_local_id(0)].x -= floor((local_posq[get_local_id(0)].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                local_posq[get_local_id(0)].y -= floor((local_posq[get_local_id(0)].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                local_posq[get_local_id(0)].z -= floor((local_posq[get_local_id(0)].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
                    real4 posq2 = local_posq[atom2];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    if (r2 < CUTOFF_SQUARED) {
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real tempValue1 = 0;
                        real tempValue2 = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_VALUE
                        }
                        value += tempValue1;
                        local_value[tbx+tj] += tempValue2;
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = tbx+tj;
                    real4 posq2 = local_posq[atom2];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        LOAD_ATOM2_PARAMETERS
                        atom2 = atomIndices[tbx+tj];
                        real tempValue1 = 0;
                        real tempValue2 = 0;
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                            COMPUTE_VALUE
                        }
                        value += tempValue1;
                        local_value[tbx+tj] += tempValue2;
#ifdef USE_CUTOFF
                    }
#endif
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }

            // Write results.

#ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[get_local_id(0)];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
            atom_add(&global_value[atom1], (long) (value*0x100000000));
            if (atom2 < PADDED_NUM_ATOMS)
                atom_add(&global_value[atom2], (long) (local_value[get_local_id(0)]*0x100000000));
#else
            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
            global_value[offset1] += value;
            if (atom2 < PADDED_NUM_ATOMS)
                global_value[offset2] += local_value[get_local_id(0)];
#endif
        }
        pos++;
    }
}
platforms/opencl/src/kernels/customGBValueN2_cpu.cl
View file @
93c467b2
#
define
TILE_SIZE
32
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
/**
*
Compute
a
value
based
on
pair
interactions.
*/
__kernel
void
computeN2Value
(
__global
const
real4*
restrict
posq,
__local
real4*
restrict
local_posq,
__global
const
unsigned
int*
restrict
exclusions,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__global
real*
restrict
global_value,
__local
real*
restrict
local_value,
__local
real*
restrict
tempBuffer,
__global
const
ushort2*
exclusionTiles,
#
ifdef
SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
global_value,
#
else
__global
real*
restrict
global_value,
#
endif
__local
real*
restrict
local_value,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interacti
onFlag
s
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
int*
restrict
interacti
ngAtom
s
#
else
unsigned
int
numTiles
#
endif
PARAMETER_ARGUMENTS
)
{
//
First
loop:
process
tiles
that
contain
exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+get_group_id
(
0
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
get_group_id
(
0
)
+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
for
(
int
pos
=
firstExclusionTile
; pos < lastExclusionTile; pos++) {
const
ushort2
tileIndices
=
exclusionTiles[pos]
;
const
unsigned
int
x
=
tileIndices.x
;
const
unsigned
int
y
=
tileIndices.y
;
//
Load
the
data
for
this
tile.
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned
int
j
=
y*TILE_SIZE
+
localAtomIndex
;
local_posq[localAtomIndex]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
exclusions[pos*TILE_SIZE+tgx]
;
#
endif
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real
value
=
0
;
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
(
atom1
>=
NUM_ATOMS
|
| atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded && atom1 != atom2) {
#else
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
unsigned
int
lasty
=
0xFFFFFFFF
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x,
y
;
COMPUTE_VALUE
}
value += tempValue1;
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
}
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
unsigned int exclusionStart = exclusionRowIndices[x];
unsigned int exclusionEnd = exclusionRowIndices[x+1];
int exclusionIndex = -1;
for (int i = exclusionStart; i < exclusionEnd; i++)
if (exclusionIndices[i] == y) {
exclusionIndex = i*TILE_SIZE;
break;
}
bool hasExclusions = (exclusionIndex > -1);
#else
bool hasExclusions = false;
excl >>= 1;
#endif
}
// Load the data for this tile if we don't already have it cached
.
// Write results
.
if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_value[atom1], (long) (value*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
#endif
}
}
if (x == y)
{
// This
tile
is
o
n
the
diagonal.
else
{
// This is
a
n
off-
diagonal
tile
.
for (int tgx = 0; tgx < TILE_SIZE; tgx++)
local_value[tgx] = 0;
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[
exclusionIndex
+tgx];
unsigned int excl = exclusions[
pos*TILE_SIZE
+tgx];
#endif
unsigned int atom1 = x*TILE_SIZE+tgx;
real value = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
real4 posq2 = local_posq[j];
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
...
...
@@ -92,21 +114,23 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
real tempValue1 = 0;
real tempValue2 = 0;
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
COMPUTE_VALUE
}
value += tempValue1;
local_value[j] += tempValue2;
#ifdef USE_CUTOFF
}
#endif
...
...
@@ -115,78 +139,148 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#endif
}
// Write results
// Write results
for atom1.
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_value[atom1], (long) (value*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
#endif
}
// Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE+tgx;
atom_add(&global_value[offset], (long) (local_value[tgx]*0x100000000));
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += local_value[tgx];
#endif
}
}
else {
// This is an off-diagonal tile.
}
for (int tgx = 0; tgx < TILE_SIZE; tgx++)
local_value[tgx] = 0;
#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
if (!hasExclusions && (flags1 != 0xFFFFFFFF |
|
flags2
!=
0xFFFFFFFF
)
)
{
//
Compute
only
a
subset
of
the
interactions
in
this
tile.
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
if
((
flags2&
(
1<<tgx
))
!=
0
)
{
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real
value
=
0
;
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
if
((
flags&
(
1<<j
))
!=
0
)
{
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
int pos = get_group_id(0)*numTiles/get_num_groups(0);
int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
int nextToSkip = -1;
int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE];
while (pos < end) {
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y
+=
(
x
<
y
?
-1
:
1
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
//
Skip
over
tiles
that
have
exclusions,
since
they
were
already
processed.
while
(
nextToSkip
<
pos
)
{
if
(
currentSkipIndex
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles[currentSkipIndex++]
;
nextToSkip
=
tile.x
+
tile.y*NUM_BLOCKS
-
tile.y*
(
tile.y+1
)
/2
;
}
else
nextToSkip
=
end
;
}
includeTile
=
(
nextToSkip
!=
pos
)
;
}
if
(
includeTile
)
{
//
Load
the
data
for
this
tile.
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#
ifdef
USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms[pos*TILE_SIZE+localAtomIndex]
:
y*TILE_SIZE+localAtomIndex
)
;
#
else
unsigned
int
j
=
y*TILE_SIZE+localAtomIndex
;
#
endif
atomIndices[localAtomIndex]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
local_posq[localAtomIndex]
=
posq[j]
;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
local_value[localAtomIndex]
=
0
;
}
}
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
if
(
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
COMPUTE_VALUE
}
value
+=
tempValue1
;
local_value[j]
+=
tempValue2
;
}
}
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
real4
blockCenterX
=
blockCenter[x]
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++)
local_posq[tgx].xyz
-=
floor
((
local_posq[tgx].xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real
value
=
0
;
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
if
(
atom1
<
NUM_ATOMS
&&
atomIndices[j]
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices[j]
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
COMPUTE_VALUE
value
+=
tempValue1
;
local_value[j]
+=
tempValue2
;
}
}
//
Write
results
for
atom1.
//
Write
results
for
atom1.
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
value
;
}
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&global_value[atom1],
(
long
)
(
value*0x100000000
))
;
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
value
;
#
endif
}
}
else
#
endif
{
//
Compute
the
full
set
of
interactions
in
this
tile
.
//
We
need
to
apply
periodic
boundary
conditions
separately
for
each
interaction
.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real
value
=
0
;
real4
posq1
=
posq[atom1]
;
LOAD_ATOM1_PARAMETERS
#
ifdef
USE_EXCLUSIONS
unsigned
int
excl
=
(
hasExclusions
?
exclusions[exclusionIndex+tgx]
:
0xFFFFFFFF
)
;
#
endif
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
#
ifdef
USE_EXCLUSIONS
bool
isExcluded
=
!
(
excl
&
0x1
)
;
#
endif
real4
posq2
=
local_posq[j]
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
...
...
@@ -194,47 +288,52 @@ __kernel void computeN2Value(__global const real4* restrict posq, __local real4*
#
endif
real
r2
=
dot
(
delta.xyz,
delta.xyz
)
;
#
ifdef
USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
y*TILE_SIZE+j
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
#
ifdef
USE_EXCLUSIONS
if
(
!isExcluded
&&
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atomIndices[j]
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
atom
2
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
&&
atom
Indices[j]
<
NUM_ATOMS
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
unsigned
int
atom2
=
j
;
LOAD_ATOM2_PARAMETERS
atom2
=
atomIndices[j]
;
real
tempValue1
=
0
;
real
tempValue2
=
0
;
COMPUTE_VALUE
value
+=
tempValue1
;
local_value[j]
+=
tempValue2
;
}
value
+=
tempValue1
;
local_value[j]
+=
tempValue2
;
#
ifdef
USE_CUTOFF
}
#
endif
#
ifdef
USE_EXCLUSIONS
excl
>>=
1
;
#
endif
}
//
Write
results
for
atom1.
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&global_value[atom1],
(
long
)
(
value*0x100000000
))
;
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
value
;
#
endif
}
}
//
Write
results
//
Write
results
.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
offset
=
y*TILE_SIZE+tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
local_value[tgx]
;
#
ifdef
USE_CUTOFF
unsigned
int
atom2
=
atomIndices[tgx]
;
#
else
unsigned
int
atom2
=
y*TILE_SIZE
+
tgx
;
#
endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&global_value[atom2],
(
long
)
(
local_value[tgx]*0x100000000
))
;
#
else
unsigned
int
offset
=
atom2
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_value[offset]
+=
local_value[tgx]
;
#
endif
}
}
}
lasty
=
y
;
pos++
;
}
}
platforms/opencl/src/kernels/customGBValueN2_default.cl
deleted
100644 → 0
View file @
f6d4557d
// The atomics extensions are only requested when the build defines
// SUPPORTS_64_BIT_ATOMICS; the kernel below then accumulates into a
// __global long buffer with atom_add() instead of a __global real buffer.
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif

// Number of atoms along each side of a tile (atom index = block*TILE_SIZE + lane).
#define TILE_SIZE 32
/**
 * Compute a value based on pair interactions.
 *
 * Each work-group walks a contiguous range [pos, end) of tiles.  A tile is a
 * TILE_SIZE x TILE_SIZE set of atom pairs identified by block indices (x, y).
 * Work-items with local id < TILE_SIZE process local atoms [0, TILE_SIZE/2) of
 * block y, the remaining work-items process [TILE_SIZE/2, TILE_SIZE), and the
 * two partial sums are combined through tempBuffer / local_value before being
 * written to global_value.
 *
 * The per-pair arithmetic is supplied by the LOAD_*_PARAMETERS and
 * COMPUTE_VALUE macros, which are defined outside this file and refer to the
 * local variable names used here (atom1, atom2, posq1, posq2, r, r2, invR,
 * tempValue1, tempValue2, localAtomIndex, ...).  Do not rename these locals.
 */
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict global_value,
#else
        __global real* restrict global_value,
#endif
        __local real* restrict local_value, __local real* restrict tempBuffer,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
    // Split the tile list evenly between work-groups.  With a cutoff, the tile
    // count comes from interactionCount[0]; if it exceeds maxTiles the kernel
    // falls back to iterating over all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles.
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
    // NOTE(review): 'energy' is not referenced anywhere in the visible body;
    // it may still be used by one of the macros -- confirm before removing.
    real energy = 0;
    // Block y whose data is currently cached in local_posq (none yet).
    unsigned int lasty = 0xFFFFFFFF;
    __local unsigned int exclusionRange[2];
    __local int exclusionIndex[1];

    while (pos < end) {
        // Extract the coordinates of this tile
        unsigned int x, y;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            // The neighbor list is valid: read the tile directly from it.
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            y = tileIndices.y;
        }
        else
#endif
        {
            // Convert the linear tile index into (x, y) with x >= y.
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }
        }
        // First half of the work-group handles local atoms [0, TILE_SIZE/2),
        // the second half handles [TILE_SIZE/2, TILE_SIZE).
        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
        // Selects which half of local_value this work-item accumulates into,
        // so that work-items tgx and tgx+TILE_SIZE/2 never touch the same slot.
        unsigned int valueBufferOffset = (tgx < TILE_SIZE/2 ? 0 : TILE_SIZE);
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real value = 0;
        real4 posq1 = posq[atom1];
        LOAD_ATOM1_PARAMETERS

        // Locate the exclusion data for this tile.

#ifdef USE_EXCLUSIONS
        if (get_local_id(0) < 2)
            exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
        if (tgx == 0)
            exclusionIndex[0] = -1;
        barrier(CLK_LOCAL_MEM_FENCE);
        // Search row x of the exclusion index list for an entry matching block y.
        for (int i = exclusionRange[0]+tgx; i < exclusionRange[1]; i += TILE_SIZE)
            if (exclusionIndices[i] == y)
                exclusionIndex[0] = i*TILE_SIZE;
        barrier(CLK_LOCAL_MEM_FENCE);
        bool hasExclusions = (exclusionIndex[0] > -1);
#endif
        if (x == y) {
            // This tile is on the diagonal.

            const unsigned int localAtomIndex = get_local_id(0);
            local_posq[localAtomIndex] = posq1;
            LOAD_LOCAL_PARAMETERS_FROM_1
            barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
            // Shift so that bit 0 corresponds to local atom 'baseLocalAtom'.
            unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
#endif
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
                // A cleared bit means the pair is excluded.
                bool isExcluded = !(excl & 0x1);
#endif
                int atom2 = baseLocalAtom+j;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    // atom2 is a local index while the macro loads parameters,
                    // then becomes the global atom index for COMPUTE_VALUE.
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+baseLocalAtom+j;
                    real tempValue1 = 0;
                    real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
                    if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
                        COMPUTE_VALUE
                    }
                    // Only atom1's contribution is kept on the diagonal;
                    // tempValue2 is discarded here.
                    value += tempValue1;
#ifdef USE_CUTOFF
                }
#endif
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
            }

            // Sum the values and write results.

            // The second half of the work-group hands its partial sum to the
            // first half through tempBuffer.
            if (get_local_id(0) >= TILE_SIZE)
                tempBuffer[get_local_id(0)] = value;
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
                const unsigned int offset = x*TILE_SIZE + tgx;
                // The value is scaled by 2^32 and accumulated as a 64-bit integer.
                atom_add(&global_value[offset], (long) ((value + tempBuffer[get_local_id(0)+TILE_SIZE])*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
                const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
                global_value[offset] += value + tempBuffer[get_local_id(0)+TILE_SIZE];
#endif
            }
        }
        else {
            // This is an off-diagonal tile.

            // Block y's data is only reloaded when it differs from the
            // previous tile's block y.
            if (lasty != y && get_local_id(0) < TILE_SIZE) {
                unsigned int j = y*TILE_SIZE + tgx;
                local_posq[get_local_id(0)] = posq[j];
                const unsigned int localAtomIndex = get_local_id(0);
                LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
            }
            // NOTE(review): when USE_EXCLUSIONS is not defined there is no
            // barrier between the reads of local_value[id+TILE_SIZE] at the end
            // of the previous off-diagonal tile and this store -- confirm this
            // is safe for the work-group size / hardware this variant targets.
            local_value[get_local_id(0)] = 0;
            barrier(CLK_LOCAL_MEM_FENCE);

            // Compute the full set of interactions in this tile.

#ifdef USE_EXCLUSIONS
            unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
            // Keep the 16 flags for this half of the tile, duplicate them into
            // both halves of the word, then rotate so that bit 0 corresponds
            // to the starting value of tj.
            excl = (excl >> baseLocalAtom) & 0xFFFF;
            excl += excl << 16;
            excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
            // Each work-item starts at a different atom2 and steps cyclically,
            // so no two work-items update the same local_value slot at once.
            unsigned int tj = tgx%(TILE_SIZE/2);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
                bool isExcluded = !(excl & 0x1);
#endif
                int atom2 = baseLocalAtom+tj;
                real4 posq2 = local_posq[atom2];
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                if (r2 < CUTOFF_SQUARED) {
#endif
                    real invR = RSQRT(r2);
                    real r = RECIP(invR);
                    LOAD_ATOM2_PARAMETERS
                    atom2 = y*TILE_SIZE+baseLocalAtom+tj;
                    real tempValue1 = 0;
                    real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
                    if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                        COMPUTE_VALUE
                    }
                    value += tempValue1;
                    local_value[baseLocalAtom+tj+valueBufferOffset] += tempValue2;
#ifdef USE_CUTOFF
                }
#endif
                // Reached by every work-item (it sits outside the cutoff test).
                barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
                excl >>= 1;
#endif
                tj = (tj+1)%(TILE_SIZE/2);
            }

            // Sum the values and write results.

            if (get_local_id(0) >= TILE_SIZE)
                tempBuffer[get_local_id(0)] = value;
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
                // offset1 receives atom1's total (both work-group halves);
                // offset2 receives atom2's total (both halves of local_value).
#ifdef SUPPORTS_64_BIT_ATOMICS
                const unsigned int offset1 = x*TILE_SIZE + tgx;
                const unsigned int offset2 = y*TILE_SIZE + tgx;
                atom_add(&global_value[offset1], (long) ((value + tempBuffer[get_local_id(0)+TILE_SIZE])*0x100000000));
                atom_add(&global_value[offset2], (long) ((local_value[get_local_id(0)] + local_value[get_local_id(0)+TILE_SIZE])*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
                global_value[offset1] += value + tempBuffer[get_local_id(0)+TILE_SIZE];
                global_value[offset2] += local_value[get_local_id(0)] + local_value[get_local_id(0)+TILE_SIZE];
#endif
            }
        }
        lasty = y;
        pos++;
    }
}
platforms/opencl/src/kernels/customGBValueN2_nvidia.cl
deleted
100644 → 0
View file @
f6d4557d
// 32-bit global atomics are always requested; the 64-bit extension is only
// requested when the build defines SUPPORTS_64_BIT_ATOMICS.
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif

// Number of atoms along each side of a tile; one group of TILE_SIZE
// consecutive work-items ("warp" below) processes one tile at a time.
#define TILE_SIZE 32
/**
 * Compute a value based on pair interactions.
 *
 * The tile list is divided between "warps" (groups of TILE_SIZE consecutive
 * work-items); each warp walks its own range [pos, end) of tiles, one
 * work-item per atom of block x.  Results for block x are held in 'value' and
 * results for block y in local_value, then written to global_value either
 * with 64-bit atomics or through a reservation scheme that serializes warps
 * of the same work-group that target the same block.
 *
 * The per-pair arithmetic is supplied by the LOAD_*_PARAMETERS and
 * COMPUTE_VALUE macros, which are defined outside this file and refer to the
 * local variable names used here (atom1, atom2, posq1, posq2, r, r2, invR,
 * tempValue1, tempValue2, localAtomIndex, ...).  Do not rename these locals.
 */
__kernel void computeN2Value(__global const real4* restrict posq, __local real4* restrict local_posq, __global const unsigned int* restrict exclusions,
        __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict global_value,
#else
        __global real* restrict global_value,
#endif
        __local real* restrict local_value, __local real* restrict tempBuffer,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
        unsigned int numTiles
#endif
        PARAMETER_ARGUMENTS) {
    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    unsigned int warp = get_global_id(0)/TILE_SIZE;
    // Split the tile list evenly between warps.  With a cutoff, the tile count
    // comes from interactionCount[0]; if it exceeds maxTiles the kernel falls
    // back to iterating over all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles.
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
    unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
    unsigned int pos = warp*numTiles/totalWarps;
    unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
    // NOTE(review): 'energy' is not referenced anywhere in the visible body;
    // it may still be used by one of the macros -- confirm before removing.
    real energy = 0;
    // Block y whose data is currently cached in this warp's part of local_posq.
    unsigned int lasty = 0xFFFFFFFF;
    __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
    __local int exclusionIndex[WARPS_PER_GROUP];
    // reservedBlocks aliases the storage of exclusionRange: entry i overlays
    // exclusionRange[2*i] and exclusionRange[2*i+1].
    __local int2* reservedBlocks = (__local int2*) exclusionRange;
    // NOTE(review): pos/end differ per warp and the non-atomic write path below
    // contains a work-group barrier, so warps of one work-group may leave this
    // loop after different iteration counts -- confirm this is safe on the
    // targeted hardware.
    do {
        // Extract the coordinates of this tile
        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);   // index within the warp
        const unsigned int tbx = get_local_id(0) - tgx;             // first local id of the warp
        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
        real value = 0;
        if (pos < end) {
#ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                // The neighbor list is valid: read the tile directly from it.
                ushort2 tileIndices = tiles[pos];
                x = tileIndices.x;
                y = tileIndices.y;
            }
            else
#endif
            {
                // Convert the linear tile index into (x, y) with x >= y.
                y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                    y += (x < y ? -1 : 1);
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
            unsigned int atom1 = x*TILE_SIZE + tgx;
            real4 posq1 = posq[atom1];
            LOAD_ATOM1_PARAMETERS

            // Locate the exclusion data for this tile.

#ifdef USE_EXCLUSIONS
            // No barrier is used between these steps.
            // NOTE(review): presumably relies on the work-items of a warp
            // executing in lockstep -- confirm.
            if (tgx < 2)
                exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
            if (tgx == 0)
                exclusionIndex[localGroupIndex] = -1;
            for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
                if (exclusionIndices[i] == y)
                    exclusionIndex[localGroupIndex] = i*TILE_SIZE;
            bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
            bool hasExclusions = false;
#endif
            // This test can never be true here because the enclosing block
            // already requires pos < end.
            if (pos >= end)
                ; // This warp is done.
            else if (x == y) {
                // This tile is on the diagonal.

                const unsigned int localAtomIndex = get_local_id(0);
                local_posq[localAtomIndex] = posq1;
                LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
                unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
                    // A cleared bit means the pair is excluded.
                    bool isExcluded = !(excl & 0x1);
#endif
                    int atom2 = tbx+j;
                    real4 posq2 = local_posq[atom2];
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        // atom2 is a local index while the macro loads parameters,
                        // then becomes the global atom index for COMPUTE_VALUE.
                        LOAD_ATOM2_PARAMETERS
                        atom2 = y*TILE_SIZE+j;
                        real tempValue1 = 0;
                        real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
                        if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
                        if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
                            COMPUTE_VALUE
                        }
                        // Only atom1's contribution is kept on the diagonal;
                        // tempValue2 is discarded here.
                        value += tempValue1;
#ifdef USE_CUTOFF
                    }
#endif
#ifdef USE_EXCLUSIONS
                    excl >>= 1;
#endif
                }
            }
            else {
                // This is an off-diagonal tile.

                // Block y's data is only reloaded when it differs from the
                // previous tile's block y.
                if (lasty != y) {
                    unsigned int j = y*TILE_SIZE + tgx;
                    local_posq[get_local_id(0)] = posq[j];
                    const unsigned int localAtomIndex = get_local_id(0);
                    LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
                }
                local_value[get_local_id(0)] = 0;
#ifdef USE_CUTOFF
                // One flag bit per atom of block y; all bits set means
                // "process every atom", which is also used when the neighbor
                // list overflowed.
                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
                if (!hasExclusions && flags != 0xFFFFFFFF) {
                    if (flags == 0) {
                        // No interactions in this tile.
                    }
                    else {
                        // Compute only a subset of the interactions in this tile.

                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
                            if ((flags&(1<<j)) != 0) {
                                // Every work-item of the warp handles the same atom2 here.
                                int atom2 = tbx+j;
                                real4 posq2 = local_posq[atom2];
                                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                                real tempValue1 = 0;
                                real tempValue2 = 0;
                                if (r2 < CUTOFF_SQUARED) {
                                    real invR = RSQRT(r2);
                                    real r = RECIP(invR);
                                    LOAD_ATOM2_PARAMETERS
                                    atom2 = y*TILE_SIZE+j;
                                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                                        COMPUTE_VALUE
                                    }
                                    value += tempValue1;
                                }
                                tempBuffer[get_local_id(0)] = tempValue2;

                                // Reduce the per-work-item tempValue2 contributions for atom2:
                                // first in groups of four, then the eight group totals.
                                // NOTE(review): there is no barrier between these steps;
                                // presumably relies on lockstep execution within the warp -- confirm.

                                if (tgx % 4 == 0)
                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
                                if (tgx == 0)
                                    local_value[tbx+j] += tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
                            }
                        }
                    }
                }
                else
#endif
                {
                    // Compute the full set of interactions in this tile.

#ifdef USE_EXCLUSIONS
                    unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
                    // Rotate so that bit 0 corresponds to the starting value of tj.
                    excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
                    // Each work-item starts at a different atom2 and steps cyclically,
                    // so no two work-items of the warp use the same local_value slot
                    // in the same step.
                    unsigned int tj = tgx;
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
                        bool isExcluded = !(excl & 0x1);
#endif
                        int atom2 = tbx+tj;
                        real4 posq2 = local_posq[atom2];
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                        if (r2 < CUTOFF_SQUARED) {
#endif
                            real invR = RSQRT(r2);
                            real r = RECIP(invR);
                            LOAD_ATOM2_PARAMETERS
                            atom2 = y*TILE_SIZE+tj;
                            real tempValue1 = 0;
                            real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
                            if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else
                            if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
                                COMPUTE_VALUE
                            }
                            value += tempValue1;
                            local_value[tbx+tj] += tempValue2;
#ifdef USE_CUTOFF
                        }
#endif
#ifdef USE_EXCLUSIONS
                        excl >>= 1;
#endif
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
            }
        }

        // Write results.  We need to coordinate between warps to make sure no two of them
        // ever try to write to the same piece of memory at the same time.

#ifdef SUPPORTS_64_BIT_ATOMICS
        // The value is scaled by 2^32 and accumulated as a 64-bit integer.
        if (pos < end) {
            const unsigned int offset = x*TILE_SIZE + tgx;
            atom_add(&global_value[offset], (long) (value*0x100000000));
        }
        if (pos < end && x != y) {
            const unsigned int offset = y*TILE_SIZE + tgx;
            atom_add(&global_value[offset], (long) (local_value[get_local_id(0)]*0x100000000));
        }
#else
        // -1 marks "nothing to write" for that block.
        int writeX = (pos < end ? x : -1);
        int writeY = (pos < end && x != y ? y : -1);
        if (tgx == 0)
            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
        bool done = false;
        int doneIndex = 0;
        int checkIndex = 0;
        while (true) {
            // See if any warp still needs to write its data.

            bool allDone = true;
            barrier(CLK_LOCAL_MEM_FENCE);
            while (doneIndex < WARPS_PER_GROUP && allDone) {
                if (reservedBlocks[doneIndex].x != -1)
                    allDone = false;
                else
                    doneIndex++;
            }
            if (allDone)
                break;
            if (!done) {
                // See whether this warp can write its data.  This requires that no previous warp
                // is trying to write to the same block of the buffer.

                bool canWrite = (writeX != -1);
                while (checkIndex < localGroupIndex && canWrite) {
                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
                        canWrite = false;
                    else
                        checkIndex++;
                }
                if (canWrite) {
                    // Write the data to global memory, then mark this warp as done.

                    if (writeX > -1) {
                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                        global_value[offset] += value;
                    }
                    if (writeY > -1) {
                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                        global_value[offset] += local_value[get_local_id(0)];
                    }
                    done = true;
                    if (tgx == 0)
                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
                }
            }
        }
#endif
        lasty = y;
        pos++;
    } while (pos < end);
}
platforms/opencl/src/kernels/fft.cl
View file @
93c467b2
...
...
@@ -11,14 +11,14 @@ __kernel void execFFT(__global const real2* restrict in, __global real2* restric
for
(
int
i
=
get_local_id
(
0
)
; i < ZSIZE; i += get_local_size(0))
w[i]
=
(
real2
)
(
cos
(
-sign*i*2*M_PI/ZSIZE
)
,
sin
(
-sign*i*2*M_PI/ZSIZE
))
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
index
=
get_group_id
(
0
)
; index < XSIZE*YSIZE; index += get_num_groups(0)) {
for
(
int
index
=
get_group_id
(
0
)
*BLOCKS_PER_GROUP+get_local_id
(
0
)
/ZSIZE
; index < XSIZE*YSIZE; index += get_num_groups(0)
*BLOCKS_PER_GROUP
) {
int
x
=
index/YSIZE
;
int
y
=
index-x*YSIZE
;
#
if
LOOP_REQUIRED
for
(
int
z
=
get_local_id
(
0
)
; z < ZSIZE; z += get_local_size(0))
data0[z]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+z]
;
#
else
data0[get_local_id
(
0
)
]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
]
;
data0[get_local_id
(
0
)
]
=
in[x*
(
YSIZE*ZSIZE
)
+y*ZSIZE+get_local_id
(
0
)
%ZSIZE
]
;
#
endif
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
COMPUTE_FFT
...
...
platforms/opencl/src/kernels/findInteractingBlocks.cl
View file @
93c467b2
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

// Number of atoms in one block / along each side of a tile.
#define TILE_SIZE 32
// Stride used by the work-group-wide loops in this file.
// NOTE(review): presumably the required work-group size -- confirm against the host code.
#define GROUP_SIZE 64
#define BUFFER_GROUPS 4
// Number of entries in the local candidate buffers.  The expansion is
// parenthesized so the macro stays correct inside any expression
// (e.g. as a divisor or as an operand of %).
#define BUFFER_SIZE (BUFFER_GROUPS*GROUP_SIZE)
// Used to split a local id into a warp index (id/WARP_SIZE) and a lane (id%WARP_SIZE).
#define WARP_SIZE 32
// Sentinel stored in a buffer slot that holds no block index.
#define INVALID 0xFFFF
/**
*
Find
a
bounding
box
for
the
atoms
in
each
block.
*/
__kernel
void
findBlockBounds
(
int
numAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
posq,
__global
real4*
restrict
blockCenter,
__global
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount
)
{
__kernel
void
findBlockBounds
(
int
numAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
posq,
__global
real4*
restrict
blockCenter,
__global
real4*
restrict
blockBoundingBox,
__global
int*
restrict
rebuildNeighborList,
__global
real2*
restrict
sortedBlocks
)
{
int
index
=
get_global_id
(
0
)
;
int
base
=
index*TILE_SIZE
;
while
(
base
<
numAtoms
)
{
real4
pos
=
posq[base]
;
#
ifdef
USE_PERIODIC
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
pos.xyz
-=
floor
(
pos.xyz*invPeriodicBoxSize.xyz
)
*periodicBoxSize.xyz
;
real4
firstPoint
=
pos
;
#
endif
real4
minPos
=
pos
;
...
...
@@ -25,146 +24,229 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
for
(
int
i
=
base+1
; i < last; i++) {
pos
=
posq[i]
;
#
ifdef
USE_PERIODIC
pos.x
-=
floor
((
pos.x-firstPoint.x
)
*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
pos.y
-=
floor
((
pos.y-firstPoint.y
)
*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
pos.z
-=
floor
((
pos.z-firstPoint.z
)
*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
pos.xyz
-=
floor
((
pos.xyz-firstPoint.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
minPos
=
min
(
minPos,
pos
)
;
maxPos
=
max
(
maxPos,
pos
)
;
}
blockBoundingBox[index]
=
0.5f*
(
maxPos-minPos
)
;
real4
blockSize
=
0.5f*
(
maxPos-minPos
)
;
blockBoundingBox[index]
=
blockSize
;
blockCenter[index]
=
0.5f*
(
maxPos+minPos
)
;
sortedBlocks[index]
=
(
real2
)
(
blockSize.x+blockSize.y+blockSize.z,
index
)
;
index
+=
get_global_size
(
0
)
;
base
=
index*TILE_SIZE
;
}
if
(
get_global_id
(
0
)
==
0
)
interactionCoun
t[0]
=
0
;
rebuildNeighborLis
t[0]
=
0
;
}
/**
*
This
is
called
by
findBlocksWithInteractions
()
.
It
compacts
the
list
of
blocks
and
writes
them
*
to
global
memory.
*
Sort
the
data
about
bounding
boxes
so
it
can
be
accessed
more
efficiently
in
the
next
kernel.
*/
void
storeInteractionData
(
__local
ushort2*
buffer,
__local
int*
valid,
__local
short*
sum,
__local
ushort2*
temp,
__local
int*
baseIndex,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
real
cutoffSquared,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
posq,
__global
const
real4*
blockCenter,
__global
const
real4*
blockBoundingBox,
unsigned
int
maxTiles
)
{
//
The
buffer
is
full,
so
we
need
to
compact
it
and
write
out
results.
Start
by
doing
a
parallel
prefix
sum.
__kernel
void
sortBoxData
(
__global
const
real2*
restrict
sortedBlock,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockBoundingBox,
__global
real4*
restrict
sortedBlockCenter,
__global
real4*
restrict
sortedBlockBoundingBox,
__global
const
real4*
restrict
posq,
__global
const
real4*
restrict
oldPositions,
__global
unsigned
int*
restrict
interactionCount,
__global
int*
restrict
rebuildNeighborList
)
{
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_BLOCKS; i += get_global_size(0)) {
int
index
=
(
int
)
sortedBlock[i].y
;
sortedBlockCenter[i]
=
blockCenter[index]
;
sortedBlockBoundingBox[i]
=
blockBoundingBox[index]
;
}
//
Also
check
whether
any
atom
has
moved
enough
so
that
we
really
need
to
rebuild
the
neighbor
list.
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += GROUP_SIZE)
temp[i].x
=
(
valid[i]
?
1
:
0
)
;
bool
rebuild
=
false
;
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0)) {
real4
delta
=
oldPositions[i]-posq[i]
;
if
(
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
>
0.25f*PADDING*PADDING
)
rebuild
=
true
;
}
if
(
rebuild
)
{
rebuildNeighborList[0]
=
1
;
interactionCount[0]
=
0
;
}
}
/**
*
Perform
a
parallel
prefix
sum
over
an
array.
The
input
values
are
all
assumed
to
be
0
or
1.
*/
void
prefixSum
(
__local
short*
sum,
__local
ushort2*
temp
)
{
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
temp[i].x
=
sum[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
whichBuffer
=
0
;
for
(
int
offset
=
1
; offset < BUFFER_SIZE; offset *= 2) {
if
(
whichBuffer
==
0
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
GROUP_SIZE
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
get_local_size(0)
)
temp[i].y
=
(
i
<
offset
?
temp[i].x
:
temp[i].x+temp[i-offset].x
)
;
else
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
GROUP_SIZE
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
get_local_size(0)
)
temp[i].x
=
(
i
<
offset
?
temp[i].y
:
temp[i].y+temp[i-offset].y
)
;
whichBuffer
=
1-whichBuffer
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
whichBuffer
==
0
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
GROUP_SIZE
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
get_local_size(0)
)
sum[i]
=
temp[i].x
;
else
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
GROUP_SIZE
)
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i +=
get_local_size(0)
)
sum[i]
=
temp[i].y
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
numValid
=
sum[BUFFER_SIZE-1]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compact
the
buffer.
}
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += GROUP_SIZE)
if
(
valid[i]
)
{
temp[sum[i]-1]
=
buffer[i]
;
sum[i]
=
valid[i]
;
valid[i]
=
false
;
buffer[i]
=
(
ushort2
)
1
;
/**
*
This
is
called
by
findBlocksWithInteractions
()
.
It
compacts
the
list
of
blocks,
identifies
interactions
*
in
them,
and
writes
the
result
to
global
memory.
*/
void
storeInteractionData
(
unsigned
short
x,
__local
unsigned
short*
buffer,
__local
short*
sum,
__local
ushort2*
temp,
__local
int*
atoms,
__local
int*
numAtoms,
__local
int*
baseIndex,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
__global
unsigned
int*
interactingAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
posq,
__local
real4*
posBuffer,
real4
blockCenterX,
real4
blockSizeX,
unsigned
int
maxTiles,
bool
finish
)
{
const
bool
singlePeriodicCopy
=
(
0.5f*periodicBoxSize.x-blockSizeX.x
>=
PADDED_CUTOFF
&&
0.5f*periodicBoxSize.y-blockSizeX.y
>=
PADDED_CUTOFF
&&
0.5f*periodicBoxSize.z-blockSizeX.z
>=
PADDED_CUTOFF
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
{
real4
pos
=
posq[x*TILE_SIZE+get_local_id
(
0
)
]
;
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
pos.xyz
-=
floor
((
pos.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
}
#
endif
posBuffer[get_local_id
(
0
)
]
=
pos
;
}
//
The
buffer
is
full,
so
we
need
to
compact
it
and
write
out
results.
Start
by
doing
a
parallel
prefix
sum.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
sum[i]
=
(
buffer[i]
==
INVALID
?
0
:
1
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
prefixSum
(
sum,
temp
)
;
int
numValid
=
sum[BUFFER_SIZE-1]
;
#
ifndef
WARPS_ARE_ATOMIC
//
Filter
the
list
of
tiles
by
comparing
the
distance
from
each
atom
to
the
other
bounding
box.
//
We
only
do
this
if
we
aren
't
already
optimizing
the
computation
using
flags.
//
Compact
the
buffer.
int
index
=
get_local_id
(
0
)
&
(
TILE_SIZE-1
)
;
int
group
=
get_local_id
(
0
)
/TILE_SIZE
;
real4
center,
boxSize,
pos
;
for
(
int
tile
=
0
; tile < numValid; tile++) {
int
x
=
temp[tile].x
;
int
y
=
temp[tile].y
;
if
(
x
==
y
)
continue
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
if
(
buffer[i]
!=
INVALID
)
temp[sum[i]-1].x
=
buffer[i]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
buffer[i]
=
temp[i].x
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Lo
ad
an
atom
position
and
the
bounding
box
the
other
block
.
//
Lo
op
over
the
tiles
and
find
specific
interactions
in
them
.
#
ifdef
MAC_AMD_WORKAROUND
int
box
=
(
group
==
0
?
x
:
y
)
;
int
atom
=
(
group
==
0
?
y
:
x
)
*TILE_SIZE+index
;
__global
real*
bc
=
(
__global
real*
)
blockCenter
;
__global
real*
bb
=
(
__global
real*
)
blockBoundingBox
;
__global
real*
ps
=
(
__global
real*
)
posq
;
center
=
(
real4
)
(
bc[4*box],
bc[4*box+1],
bc[4*box+2],
0
)
;
boxSize
=
(
real4
)
(
bb[4*box],
bb[4*box+1],
bb[4*box+2],
0
)
;
pos
=
(
real4
)
(
ps[4*atom],
ps[4*atom+1],
ps[4*atom+2],
0
)
;
#
else
center
=
blockCenter[
(
group
==
0
?
x
:
y
)
]
;
boxSize
=
blockBoundingBox[
(
group
==
0
?
x
:
y
)
]
;
pos
=
posq[
(
group
==
0
?
y
:
x
)
*TILE_SIZE+index]
;
const
int
indexInWarp
=
get_local_id
(
0
)
%WARP_SIZE
;
for
(
int
base
=
0
; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
for
(
int
i
=
get_local_id
(
0
)
/WARP_SIZE
; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
//
Check
each
atom
in
block
Y
for
interactions.
real4
pos
=
posq[buffer[base+i]*TILE_SIZE+indexInWarp]
;
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
pos.xyz
-=
floor
((
pos.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
//
Find
the
distance
of
the
atom
from
the
bounding
box.
real4
delta
=
pos-center
;
bool
interacts
=
false
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
if
(
!singlePeriodicCopy
)
{
for
(
int
j
=
0
; j < TILE_SIZE; j++) {
real4
delta
=
pos-posBuffer[j]
;
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
interacts
|= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
}
else {
#endif
delta
=
max
((
real4
)
0
,
fabs
(
delta
)
-boxSize
)
;
__local
ushort*
flag
=
(
__local
ushort*
)
&buffer[tile]
;
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
cutoffSquared
)
flag[group]
=
false
;
for (int j = 0; j < TILE_SIZE; j++) {
real4 delta = pos-posBuffer[j];
interacts |
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
)
;
}
#
ifdef
USE_PERIODIC
}
#
endif
sum[i*WARP_SIZE+indexInWarp]
=
(
interacts
?
1
:
0
)
;
}
for
(
int
i
=
numValid-base+get_local_id
(
0
)
/WARP_SIZE
; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
sum[i*WARP_SIZE+indexInWarp]
=
0
;
//
Compact
the
list
of
atoms.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
flag[0]
|
| flag[1]) {
// This tile contains no interactions.
prefixSum
(
sum,
temp
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
if
(
sum[i]
!=
(
i
==
0
?
0
:
sum[i-1]
))
atoms[*numAtoms+sum[i]-1]
=
buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp
;
//
Store
them
to
global
memory.
numValid--;
int
atomsToStore
=
*numAtoms+sum[BUFFER_SIZE-1]
;
bool
storePartialTile
=
(
finish
&&
base
>=
numValid-BUFFER_SIZE/WARP_SIZE
)
;
int
tilesToStore
=
(
storePartialTile
?
(
atomsToStore+TILE_SIZE-1
)
/TILE_SIZE
:
atomsToStore/TILE_SIZE
)
;
if
(
tilesToStore
>
0
)
{
if
(
get_local_id
(
0
)
==
0
)
*baseIndex
=
atom_add
(
interactionCount,
tilesToStore
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
==
0
)
*numAtoms
=
atomsToStore-tilesToStore*TILE_SIZE
;
if
(
*baseIndex+tilesToStore
<=
maxTiles
)
{
if
(
get_local_id
(
0
)
<
tilesToStore
)
interactingTiles[*baseIndex+get_local_id
(
0
)
]
=
(
ushort2
)
(
x,
singlePeriodicCopy
)
;
for
(
int
i
=
get_local_id
(
0
)
; i < tilesToStore*TILE_SIZE; i += get_local_size(0))
interactingAtoms[*baseIndex*TILE_SIZE+i]
=
(
i
<
atomsToStore
?
atoms[i]
:
NUM_ATOMS
)
;
}
}
else
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
==
0
)
temp[tile] = temp[numValid];
tile--;
*numAtoms
+=
sum[BUFFER_SIZE-1]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
get_local_id
(
0
)
<
*numAtoms
&&
!storePartialTile
)
atoms[get_local_id
(
0
)
]
=
atoms[tilesToStore*TILE_SIZE+get_local_id
(
0
)
]
;
}
#endif
// Store it to global memory.
if
(
numValid
==
0
&&
*numAtoms
>
0
&&
finish
)
{
// We didn't have any more tiles to process, but there were some atoms left over from a
// previous call to this function.  Save them now.
if (get_local_id(0) == 0)
*baseIndex = atom_add(interactionCount, numValid);
barrier(CLK_LOCAL_MEM_FENCE);
if (*baseIndex+numValid <= maxTiles)
for (int i = get_local_id(0); i < numValid; i += GROUP_SIZE)
interactingTiles[*baseIndex+i] = temp[i];
barrier(CLK_LOCAL_MEM_FENCE);
if
(
get_local_id
(
0
)
==
0
)
*baseIndex
=
atom_add
(
interactionCount,
1
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
*baseIndex
<
maxTiles
)
{
if
(
get_local_id
(
0
)
==
0
)
interactingTiles[*baseIndex]
=
(
ushort2
)
(
x,
singlePeriodicCopy
)
;
if
(
get_local_id
(
0
)
<
TILE_SIZE
)
interactingAtoms[*baseIndex*TILE_SIZE+get_local_id
(
0
)
]
=
(
get_local_id
(
0
)
<
*numAtoms
?
atoms[get_local_id
(
0
)
]
:
NUM_ATOMS
)
;
}
}
//
Reset
the
buffer
for
processing
more
tiles.
for
(
int
i
=
get_local_id
(
0
)
; i < BUFFER_SIZE; i += get_local_size(0))
buffer[i]
=
INVALID
;
}
/**
*
Compare
the
bounding
boxes
for
each
pair
of
blocks.
If
they
are
sufficiently
far
apart,
*
mark
them
as
non-interacting.
*/
__kernel void findBlocksWithInteractions(
real cutoffSquared,
real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
__kernel
void
findBlocksWithInteractions
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount,
__global
ushort2*
restrict
interactingTiles,
__global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
__local ushort2 buffer[BUFFER_SIZE];
__local int valid[BUFFER_SIZE];
__global
unsigned
int*
restrict
interactingAtoms,
__global
const
real4*
restrict
posq,
unsigned
int
maxTiles,
unsigned
int
startBlockIndex,
unsigned
int
numBlocks,
__global
real2*
restrict
sortedBlocks,
__global
const
real4*
restrict
sortedBlockCenter,
__global
const
real4*
restrict
sortedBlockBoundingBox,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__global
real4*
restrict
oldPositions,
__global
const
int*
restrict
rebuildNeighborList
)
{
__local
unsigned
short
buffer[BUFFER_SIZE]
;
__local
short
sum[BUFFER_SIZE]
;
__local
ushort2
temp[BUFFER_SIZE]
;
__local
int
atoms[BUFFER_SIZE+TILE_SIZE]
;
__local
real4
posBuffer[TILE_SIZE]
;
__local
int
exclusionsForX[MAX_EXCLUSIONS]
;
__local
int
bufferFull
;
__local
int
globalIndex
;
__local
int
numAtoms
;
#
ifdef
AMD_ATOMIC_WORK_AROUND
//
Do
a
byte
write
to
force
all
memory
accesses
to
interactionCount
to
use
the
complete
path.
//
This
avoids
the
atomic
access
from
causing
all
word
accesses
to
other
buffers
from
using
the
slow
complete
path.
...
...
@@ -173,142 +255,79 @@ __kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSi
if
(
get_global_id
(
0
)
==
get_local_id
(
0
)
+1
)
((
__global
char*
)
interactionCount
)
[sizeof
(
unsigned
int
)
+1]
=
0
;
#
endif
if
(
rebuildNeighborList[0]
==
0
)
return
; // The neighbor list doesn't need to be rebuilt.
int
valuesInBuffer
=
0
;
if
(
get_local_id
(
0
)
==
0
)
bufferFull
=
false
;
for
(
int
i
=
0
; i < BUFFER_GROUPS; ++i)
valid
[i*GROUP_SIZE+get_local_id(0)] =
false
;
buffer
[i*GROUP_SIZE+get_local_id
(
0
)
]
=
INVALID
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
for (int baseIndex = startTileIndex+get_group_id(0)*get_local_size(0); baseIndex < endTileIndex; baseIndex += get_global_size(0)) {
// Identify the pair of blocks to compare.
int index = baseIndex+get_local_id(0);
if (index < endTileIndex) {
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y
+=
(
x
<
y
?
-1
:
1
)
;
x
=
(
index-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
//
Find
the
distance
between
the
bounding
boxes
of
the
two
cells.
//
Loop
over
blocks
sorted
by
size.
for
(
int
i
=
startBlockIndex+get_group_id
(
0
)
; i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
if
(
get_local_id
(
0
)
==
get_local_size
(
0
)
-1
)
numAtoms
=
0
;
real2
sortedKey
=
sortedBlocks[i]
;
unsigned
short
x
=
(
unsigned
short
)
sortedKey.y
;
real4
blockCenterX
=
blockCenter[x]
;
real4
blockSizeX
=
blockBoundingBox[x]
;
#
ifdef
MAC_AMD_WORKAROUND
__global
real*
bc
=
(
__global
real*
)
blockCenter
;
__global
real*
bb
=
(
__global
real*
)
blockBoundingBox
;
real4
bcx
=
(
real4
)
(
bc[4*x],
bc[4*x+1],
bc[4*x+2],
0
)
;
real4
bcy
=
(
real4
)
(
bc[4*y],
bc[4*y+1],
bc[4*y+2],
0
)
;
real4
delta
=
bcx-bcy
;
real4
boxSizea
=
(
real4
)
(
bb[4*x],
bb[4*x+1],
bb[4*x+2],
0
)
;
real4
boxSizeb
=
(
real4
)
(
bb[4*y],
bb[4*y+1],
bb[4*y+2],
0
)
;
#
else
real4
delta
=
blockCenter[x]-blockCenter[y]
;
real4
boxSizea
=
blockBoundingBox[x]
;
real4
boxSizeb
=
blockBoundingBox[y]
;
#
endif
//
Load
exclusion
data
for
block
x.
const
int
exclusionStart
=
exclusionRowIndices[x]
;
const
int
exclusionEnd
=
exclusionRowIndices[x+1]
;
const
int
numExclusions
=
exclusionEnd-exclusionStart
;
for
(
int
j
=
get_local_id
(
0
)
; j < numExclusions; j += get_local_size(0))
exclusionsForX[j]
=
exclusionIndices[exclusionStart+j]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//
Compare
it
to
other
blocks
after
this
one
in
sorted
order.
for
(
int
base
=
i+1
; base < NUM_BLOCKS; base += get_local_size(0)) {
int
j
=
base+get_local_id
(
0
)
;
real2
sortedKey2
=
(
j
<
NUM_BLOCKS
?
sortedBlocks[j]
:
(
real2
)
0
)
;
real4
blockCenterY
=
(
j
<
NUM_BLOCKS
?
sortedBlockCenter[j]
:
(
real4
)
0
)
;
real4
blockSizeY
=
(
j
<
NUM_BLOCKS
?
sortedBlockBoundingBox[j]
:
(
real4
)
0
)
;
unsigned
short
y
=
(
unsigned
short
)
sortedKey2.y
;
real4
delta
=
blockCenterX-blockCenterY
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
delta.x
=
max
((
real
)
0
,
fabs
(
delta.x
)
-boxSizea.x-boxSizeb.x
)
;
delta.y
=
max
((
real
)
0
,
fabs
(
delta.y
)
-boxSizea.y-boxSizeb.y
)
;
delta.z
=
max
((
real
)
0
,
fabs
(
delta.z
)
-boxSizea.z-boxSizeb.z
)
;
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
cutoffSquared
)
{
delta.x
=
max
((
real
)
0
,
fabs
(
delta.x
)
-blockSizeX.x-blockSizeY.x
)
;
delta.y
=
max
((
real
)
0
,
fabs
(
delta.y
)
-blockSizeX.y-blockSizeY.y
)
;
delta.z
=
max
((
real
)
0
,
fabs
(
delta.z
)
-blockSizeX.z-blockSizeY.z
)
;
bool
hasExclusions
=
false
;
for
(
int
k
=
0
; k < numExclusions; k++)
hasExclusions
|
=
(
exclusionsForX[k]
==
y
)
;
if
(
j
<
NUM_BLOCKS
&&
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
&&
!hasExclusions
)
{
//
Add
this
tile
to
the
buffer.
int
bufferIndex
=
valuesInBuffer*GROUP_SIZE+get_local_id
(
0
)
;
valid[bufferIndex]
=
true
;
buffer[bufferIndex]
=
(
ushort2
)
(
x,
y
)
;
buffer[bufferIndex]
=
y
;
valuesInBuffer++
;
if
(
!bufferFull
&&
valuesInBuffer
==
BUFFER_GROUPS
)
bufferFull
=
true
;
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
bufferFull
)
{
storeInteractionData
(
buffer,
valid,
sum,
temp,
&globalIndex,
interactionCount,
interactingTiles,
cutoffSquared,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenter,
blockBoundingBox,
maxTiles
)
;
valuesInBuffer
=
0
;
if
(
get_local_id
(
0
)
==
0
)
bufferFull
=
false
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
storeInteractionData
(
buffer,
valid,
sum,
temp,
&globalIndex,
interactionCount,
interactingTiles,
cutoffSquared,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenter,
blockBoundingBox,
maxTiles
)
;
}
/**
*
Compare
each
atom
in
one
block
to
the
bounding
box
of
another
block,
and
set
*
flags
for
which
ones
are
interacting.
*/
__kernel
void
findInteractionsWithinBlocks
(
real
cutoffSquared,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
posq,
__global
const
ushort2*
restrict
tiles,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionFlags,
__global
const
unsigned
int*
restrict
interactionCount,
__local
volatile
unsigned
int*
restrict
flags,
unsigned
int
maxTiles
)
{
unsigned
int
totalWarps
=
get_global_size
(
0
)
/TILE_SIZE
;
unsigned
int
warp
=
get_global_id
(
0
)
/TILE_SIZE
;
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
warp*numTiles/totalWarps
;
unsigned
int
end
=
(
warp+1
)
*numTiles/totalWarps
;
unsigned
int
index
=
get_local_id
(
0
)
&
(
TILE_SIZE
-
1
)
;
if
(
numTiles
>
maxTiles
)
return
;
unsigned
int
lasty
=
0xFFFFFFFF
;
real4
apos
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
ushort2
tileIndices
=
tiles[pos]
;
unsigned
int
x
=
tileIndices.x
;
unsigned
int
y
=
tileIndices.y
;
if
(
x
==
y
)
{
if
(
index
==
0
)
interactionFlags[pos]
=
0xFFFFFFFF
;
}
else
{
//
Load
the
bounding
box
for
x
and
the
atom
positions
for
y.
real4
center
=
blockCenter[x]
;
real4
boxSize
=
blockBoundingBox[x]
;
if
(
y
!=
lasty
)
apos
=
posq[y*TILE_SIZE+index]
;
//
Find
the
distance
of
the
atom
from
the
bounding
box.
real4
delta
=
apos-center
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
delta
=
max
((
real4
)
0
,
fabs
(
delta
)
-boxSize
)
;
int
thread
=
get_local_id
(
0
)
;
flags[thread]
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
>
cutoffSquared
?
0
:
1
<<
index
)
;
//
Sum
the
flags.
#
ifdef
WARPS_ARE_ATOMIC
if
(
index
%
4
==
0
)
flags[thread]
+=
flags[thread+1]+flags[thread+2]+flags[thread+3]
;
#
else
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
index
%
4
==
0
)
flags[thread]
+=
flags[thread+1]+flags[thread+2]+flags[thread+3]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
#
endif
if
(
index
==
0
)
{
unsigned
int
allFlags
=
flags[thread]+flags[thread+4]+flags[thread+8]+flags[thread+12]+flags[thread+16]+flags[thread+20]+flags[thread+24]+flags[thread+28]
;
//
Count
how
many
flags
are
set,
and
based
on
that
decide
whether
to
compute
all
interactions
//
or
only
a
fraction
of
them.
unsigned
int
bits
=
(
allFlags&0x55555555
)
+
((
allFlags>>1
)
&0x55555555
)
;
bits
=
(
bits&0x33333333
)
+
((
bits>>2
)
&0x33333333
)
;
bits
=
(
bits&0x0F0F0F0F
)
+
((
bits>>4
)
&0x0F0F0F0F
)
;
bits
=
(
bits&0x00FF00FF
)
+
((
bits>>8
)
&0x00FF00FF
)
;
bits
=
(
bits&0x0000FFFF
)
+
((
bits>>16
)
&0x0000FFFF
)
;
interactionFlags[pos]
=
(
bits
>
12
?
0xFFFFFFFF
:
allFlags
)
;
if
(
bufferFull
)
{
storeInteractionData
(
x,
buffer,
sum,
temp,
atoms,
&numAtoms,
&globalIndex,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
posq,
posBuffer,
blockCenterX,
blockSizeX,
maxTiles,
false
)
;
valuesInBuffer
=
0
;
if
(
get_local_id
(
0
)
==
0
)
bufferFull
=
false
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
lasty
=
y
;
}
pos++
;
storeInteractionData
(
x,
buffer,
sum,
temp,
atoms,
&numAtoms,
&globalIndex,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
posq,
posBuffer,
blockCenterX,
blockSizeX,
maxTiles,
true
)
;
}
//
Record
the
positions
the
neighbor
list
is
based
on.
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0))
oldPositions[i]
=
posq[i]
;
}
platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
View file @
93c467b2
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_byte_addressable_store
:
enable
#
define
TILE_SIZE
32
#
define
GROUP_SIZE
64
#
define
BUFFER_GROUPS
4
#
define
BUFFER_SIZE
BUFFER_GROUPS*GROUP_SIZE
/**
*
Find
a
bounding
box
for
the
atoms
in
each
block.
*/
__kernel
void
findBlockBounds
(
int
numAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
posq,
__global
real4*
restrict
blockCenter,
__global
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount
)
{
__kernel
void
findBlockBounds
(
int
numAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
posq,
__global
real4*
restrict
blockCenter,
__global
real4*
restrict
blockBoundingBox,
__global
int*
restrict
rebuildNeighborList,
__global
real2*
restrict
sortedBlocks
)
{
int
index
=
get_global_id
(
0
)
;
int
base
=
index*TILE_SIZE
;
while
(
base
<
numAtoms
)
{
real4
pos
=
posq[base]
;
#
ifdef
USE_PERIODIC
pos.x
-=
floor
(
pos.x*invPeriodicBoxSize.x
)
*periodicBoxSize.x
;
pos.y
-=
floor
(
pos.y*invPeriodicBoxSize.y
)
*periodicBoxSize.y
;
pos.z
-=
floor
(
pos.z*invPeriodicBoxSize.z
)
*periodicBoxSize.z
;
pos.xyz
-=
floor
(
pos.xyz*invPeriodicBoxSize.xyz
)
*periodicBoxSize.xyz
;
real4
firstPoint
=
pos
;
#
endif
real4
minPos
=
pos
;
...
...
@@ -25,143 +22,211 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
for
(
int
i
=
base+1
; i < last; i++) {
pos
=
posq[i]
;
#
ifdef
USE_PERIODIC
pos.x
-=
floor
((
pos.x-firstPoint.x
)
*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
pos.y
-=
floor
((
pos.y-firstPoint.y
)
*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
pos.z
-=
floor
((
pos.z-firstPoint.z
)
*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
pos.xyz
-=
floor
((
pos.xyz-firstPoint.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
minPos
=
min
(
minPos,
pos
)
;
maxPos
=
max
(
maxPos,
pos
)
;
}
blockBoundingBox[index]
=
0.5f*
(
maxPos-minPos
)
;
real4
blockSize
=
0.5f*
(
maxPos-minPos
)
;
blockBoundingBox[index]
=
blockSize
;
blockCenter[index]
=
0.5f*
(
maxPos+minPos
)
;
sortedBlocks[index]
=
(
real2
)
(
blockSize.x+blockSize.y+blockSize.z,
index
)
;
index
+=
get_global_size
(
0
)
;
base
=
index*TILE_SIZE
;
}
if
(
get_global_id
(
0
)
==
0
)
rebuildNeighborList[0]
=
0
;
}
/**
 * Gather the per-block centers and bounding boxes into sorted order, and decide
 * whether the neighbor list has to be rebuilt.
 *
 * For every sorted slot, the .y component of sortedBlock is converted to an int and
 * used as the index of the block whose center and bounding box are copied into
 * sortedBlockCenter / sortedBlockBoundingBox at that slot.
 *
 * Each work-item then compares the current position of its atoms (posq) with
 * oldPositions.  If any of them has a squared displacement larger than
 * 0.25f*PADDING*PADDING, the work-item sets rebuildNeighborList[0] to 1 and
 * resets interactionCount[0] to 0.
 */
__kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter,
        __global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
        __global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq,
        __global const real4* restrict oldPositions, __global unsigned int* restrict interactionCount,
        __global int* restrict rebuildNeighborList) {
    const int first = get_global_id(0);
    const size_t stride = get_global_size(0);

    // Copy the block data into sorted order.
    for (int slot = first; slot < NUM_BLOCKS; slot += stride) {
        const int sourceBlock = (int) sortedBlock[slot].y;
        sortedBlockCenter[slot] = blockCenter[sourceBlock];
        sortedBlockBoundingBox[slot] = blockBoundingBox[sourceBlock];
    }

    // Look for any atom handled by this work-item that has moved too far since
    // oldPositions was recorded.
    bool needsRebuild = false;
    for (int atom = first; atom < NUM_ATOMS; atom += stride) {
        const real4 moved = oldPositions[atom]-posq[atom];
        needsRebuild |= (moved.x*moved.x + moved.y*moved.y + moved.z*moved.z > 0.25f*PADDING*PADDING);
    }

    if (!needsRebuild)
        return;

    // Every work-item that gets here writes the same values.
    rebuildNeighborList[0] = 1;
    interactionCount[0] = 0;
}
/**
*
This
is
called
by
findBlocksWithInteractions
()
.
It
compacts
the
list
of
blocks
and
writes
them
*
to
global
memory.
*/
void
storeInteractionData
(
ushort2*
buffer,
int
numValid,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
__global
unsigned
int*
interactionFlags,
real
cutoffSquared,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
real4*
posq,
__global
real4*
blockCenter,
__global
real4*
blockBoundingBox,
unsigned
int
maxTiles
)
{
//
Filter
the
list
of
tiles
by
comparing
the
distance
from
each
atom
to
the
other
bounding
box.
unsigned
int
flagsBuffer[2*BUFFER_SIZE]
;
real4
atomPositions[TILE_SIZE]
;
int
lasty
=
-1
;
real4
centery,
boxSizey
;
for
(
int
tile
=
0
; tile < numValid; ) {
int
x
=
buffer[tile].x
;
int
y
=
buffer[tile].y
;
if
(
x
==
y
)
{
tile++
;
continue
;
}
//
Load
the
atom
positions
and
bounding
boxes.
real4
centerx
=
blockCenter[x]
;
real4
boxSizex
=
blockBoundingBox[x]
;
if
(
y
!=
lasty
)
{
for
(
int
atom
=
0
; atom < TILE_SIZE; atom++)
atomPositions[atom]
=
posq[y*TILE_SIZE+atom]
;
centery
=
blockCenter[y]
;
boxSizey
=
blockBoundingBox[y]
;
lasty
=
y
;
void
storeInteractionData
(
unsigned
short
x,
unsigned
short*
buffer,
int*
atoms,
int*
numAtoms,
int
numValid,
__global
unsigned
int*
interactionCount,
__global
ushort2*
interactingTiles,
__global
unsigned
int*
interactingAtoms,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
real4*
posq,
real4
blockCenterX,
real4
blockSizeX,
unsigned
int
maxTiles,
bool
finish
)
{
real4
posBuffer[TILE_SIZE]
;
const
bool
singlePeriodicCopy
=
(
0.5f*periodicBoxSize.x-blockSizeX.x
>=
PADDED_CUTOFF
&&
0.5f*periodicBoxSize.y-blockSizeX.y
>=
PADDED_CUTOFF
&&
0.5f*periodicBoxSize.z-blockSizeX.z
>=
PADDED_CUTOFF
)
;
for
(
int
i
=
0
; i < TILE_SIZE; i++) {
real4
pos
=
posq[x*TILE_SIZE+i]
;
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
pos.xyz
-=
floor
((
pos.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
}
#
endif
posBuffer[i]
=
pos
;
}
//
Find
the
distance
of
each
atom
from
the
bounding
box
.
//
Loop
over
the
tiles
and
find
specific
interactions
in
them
.
unsigned
int
flags1
=
0
,
flags2
=
0
;
for
(
int
atom
=
0
; atom < TILE_SIZE; atom++) {
real4
delta
=
atomPositions[atom]-centerx
;
for
(
int
tile
=
0
; tile < numValid; tile++) {
for
(
int
indexInTile
=
0
; indexInTile < TILE_SIZE; indexInTile++) {
//
Check
each
atom
in
block
Y
for
interactions.
int
atom
=
buffer[tile]*TILE_SIZE+indexInTile
;
real4
pos
=
posq[atom]
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
if
(
singlePeriodicCopy
)
pos.xyz
-=
floor
((
pos.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
delta
=
max
((
real4
)
0
,
fabs
(
delta
)
-boxSizex
)
;
if
(
dot
(
delta.xyz,
delta.xyz
)
<
cutoffSquared
)
flags1
+=
1
<<
atom
;
delta
=
posq[x*TILE_SIZE+atom]-centery
;
bool
interacts
=
false
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
if
(
!singlePeriodicCopy
)
{
for
(
int
j
=
0
; j < TILE_SIZE && !interacts; j++) {
real4
delta
=
pos-posBuffer[j]
;
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
interacts
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
)
;
}
}
else
{
#
endif
delta
=
max
((
real4
)
0
,
fabs
(
delta
)
-boxSizey
)
;
if
(
dot
(
delta.xyz,
delta.xyz
)
<
cutoffSquared
)
flags2
+=
1
<<
atom
;
}
if
(
flags1
==
0
|
| flags2 == 0) {
// This tile contains no interactions.
numValid--;
buffer[tile] = buffer[numValid];
}
else {
flagsBuffer[2*tile] = flags1;
flagsBuffer[2*tile+1] = flags2;
tile++;
for
(
int
j
=
0
; j < TILE_SIZE && !interacts; j++) {
real4
delta
=
pos-posBuffer[j]
;
interacts
=
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
)
;
}
#
ifdef
USE_PERIODIC
}
#
endif
if
(
interacts
)
atoms[
(
*numAtoms
)
++]
=
atom
;
if
(
*numAtoms
==
BUFFER_SIZE
)
{
//
The
atoms
buffer
is
full,
so
store
it
to
global
memory.
int
tilesToStore
=
BUFFER_SIZE/TILE_SIZE
;
int
baseIndex
=
atom_add
(
interactionCount,
tilesToStore
)
;
if
(
baseIndex+tilesToStore
<=
maxTiles
)
{
for
(
int
i
=
0
; i < tilesToStore; i++) {
interactingTiles[baseIndex+i]
=
(
ushort2
)
(
x,
singlePeriodicCopy
)
;
for
(
int
j
=
0
; j < TILE_SIZE; j++)
interactingAtoms[
(
baseIndex+i
)
*TILE_SIZE+j]
=
atoms[i*TILE_SIZE+j]
;
}
}
*numAtoms
=
0
;
}
}
}
// Store it to global memory.
int baseIndex = atom_add(interactionCount, numValid);
if (baseIndex+numValid <= maxTiles)
for (int i = 0; i < numValid; i++) {
interactingTiles[baseIndex+i] = buffer[i];
interactionFlags[2*(baseIndex+i)] = flagsBuffer[2*i];
interactionFlags[2*(baseIndex+i)+1] = flagsBuffer[2*i+1];
if
(
*numAtoms
>
0
&&
finish
)
{
//
There
are
some
leftover
atoms,
so
save
them
now.
int
tilesToStore
=
(
*numAtoms+TILE_SIZE-1
)
/TILE_SIZE
;
int
baseIndex
=
atom_add
(
interactionCount,
tilesToStore
)
;
if
(
baseIndex+tilesToStore
<=
maxTiles
)
{
for
(
int
i
=
0
; i < tilesToStore; i++) {
interactingTiles[baseIndex+i]
=
(
ushort2
)
(
x,
singlePeriodicCopy
)
;
for
(
int
j
=
0
; j < TILE_SIZE; j++) {
int
index
=
i*TILE_SIZE+j
;
interactingAtoms[
(
baseIndex+i
)
*TILE_SIZE+j]
=
(
index
<
*numAtoms
?
atoms[index]
:
NUM_ATOMS
)
;
}
}
}
}
}
/**
*
Compare
the
bounding
boxes
for
each
pair
of
blocks.
If
they
are
sufficiently
far
apart,
*
mark
them
as
non-interacting.
*/
__kernel void findBlocksWithInteractions(
real cutoffSquared,
real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
__kernel
void
findBlocksWithInteractions
(
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
__global
const
real4*
restrict
blockCenter,
__global
const
real4*
restrict
blockBoundingBox,
__global
unsigned
int*
restrict
interactionCount,
__global
ushort2*
restrict
interactingTiles,
__global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
ushort2 buffer[BUFFER_SIZE];
int valuesInBuffer = 0;
const int numTiles = endTileIndex-startTileIndex;
unsigned int start = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
for (int index = start; index < end; index++) {
// Identify the pair of blocks to compare.
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y
+=
(
x
<
y
?
-1
:
1
)
;
x
=
(
index-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
//
Find
the
distance
between
the
bounding
boxes
of
the
two
cells.
__global
unsigned
int*
restrict
interactingAtoms,
__global
const
real4*
restrict
posq,
unsigned
int
maxTiles,
unsigned
int
startBlockIndex,
unsigned
int
numBlocks,
__global
real2*
restrict
sortedBlocks,
__global
const
real4*
restrict
sortedBlockCenter,
__global
const
real4*
restrict
sortedBlockBoundingBox,
__global
const
unsigned
int*
restrict
exclusionIndices,
__global
const
unsigned
int*
restrict
exclusionRowIndices,
__global
real4*
restrict
oldPositions,
__global
const
int*
restrict
rebuildNeighborList
)
{
if
(
rebuildNeighborList[0]
==
0
)
return
; // The neighbor list doesn't need to be rebuilt.
unsigned
short
buffer[BUFFER_SIZE]
;
int
atoms[BUFFER_SIZE]
;
int
exclusionsForX[MAX_EXCLUSIONS]
;
int
valuesInBuffer
;
int
numAtoms
;
//
Loop
over
blocks
sorted
by
size.
for
(
int
i
=
startBlockIndex+get_group_id
(
0
)
; i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
valuesInBuffer
=
0
;
numAtoms
=
0
;
real2
sortedKey
=
sortedBlocks[i]
;
unsigned
short
x
=
(
unsigned
short
)
sortedKey.y
;
real4
blockCenterX
=
blockCenter[x]
;
real4
blockSizeX
=
blockBoundingBox[x]
;
real4
delta
=
blockCenter[x]-blockCenter[y]
;
//
Load
exclusion
data
for
block
x.
const
int
exclusionStart
=
exclusionRowIndices[x]
;
const
int
exclusionEnd
=
exclusionRowIndices[x+1]
;
const
int
numExclusions
=
exclusionEnd-exclusionStart
;
for
(
int
j
=
0
; j < numExclusions; j++)
exclusionsForX[j]
=
exclusionIndices[exclusionStart+j]
;
//
Compare
it
to
other
blocks
after
this
one
in
sorted
order.
for
(
int
j
=
i+1
; j < NUM_BLOCKS; j++) {
real2
sortedKey2
=
sortedBlocks[j]
;
unsigned
short
y
=
(
unsigned
short
)
sortedKey2.y
;
bool
hasExclusions
=
false
;
for
(
int
k
=
0
; k < numExclusions; k++)
hasExclusions
|
=
(
exclusionsForX[k]
==
y
)
;
if
(
hasExclusions
)
continue
;
real4
blockCenterY
=
sortedBlockCenter[j]
;
real4
blockSizeY
=
sortedBlockBoundingBox[j]
;
real4
delta
=
blockCenterX-blockCenterY
;
#
ifdef
USE_PERIODIC
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
delta.x
-=
floor
(
delta.x*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
delta.y
-=
floor
(
delta.y*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
delta.z
-=
floor
(
delta.z*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
#
endif
real4
boxSizea
=
blockBoundingBox[x]
;
real4
boxSizeb
=
blockBoundingBox[y]
;
delta.x
=
max
((
real
)
0
,
fabs
(
delta.x
)
-boxSizea.x-boxSizeb.x
)
;
delta.y
=
max
((
real
)
0
,
fabs
(
delta.y
)
-boxSizea.y-boxSizeb.y
)
;
delta.z
=
max
((
real
)
0
,
fabs
(
delta.z
)
-boxSizea.z-boxSizeb.z
)
;
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
cutoffSquared
)
{
//
Add
this
tile
to
the
buffer.
delta.x
=
max
((
real
)
0
,
fabs
(
delta.x
)
-blockSizeX.x-blockSizeY.x
)
;
delta.y
=
max
((
real
)
0
,
fabs
(
delta.y
)
-blockSizeX.y-blockSizeY.y
)
;
delta.z
=
max
((
real
)
0
,
fabs
(
delta.z
)
-blockSizeX.z-blockSizeY.z
)
;
if
(
delta.x*delta.x+delta.y*delta.y+delta.z*delta.z
<
PADDED_CUTOFF_SQUARED
)
{
//
Add
this
tile
to
the
buffer.
buffer[valuesInBuffer++]
=
(
ushort2
)
(
x,
y
)
;
if
(
valuesInBuffer
==
BUFFER_SIZE
)
{
storeInteractionData
(
buffer,
valuesInBuffer,
interactionCount,
interactingTiles,
interactionFlags,
cutoffSquared,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenter,
blockBoundingBox,
maxTiles
)
;
valuesInBuffer
=
0
;
buffer[valuesInBuffer++]
=
y
;
if
(
valuesInBuffer
==
BUFFER_SIZE
)
{
storeInteractionData
(
x,
buffer,
atoms,
&numAtoms,
valuesInBuffer,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenterX,
blockSizeX,
maxTiles,
false
)
;
valuesInBuffer
=
0
;
}
}
}
storeInteractionData
(
x,
buffer,
atoms,
&numAtoms,
valuesInBuffer,
interactionCount,
interactingTiles,
interactingAtoms,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenterX,
blockSizeX,
maxTiles,
true
)
;
}
storeInteractionData
(
buffer,
valuesInBuffer,
interactionCount,
interactingTiles,
interactionFlags,
cutoffSquared,
periodicBoxSize,
invPeriodicBoxSize,
posq,
blockCenter,
blockBoundingBox,
maxTiles
)
;
//
Record
the
positions
the
neighbor
list
is
based
on.
for
(
int
i
=
get_global_id
(
0
)
; i < NUM_ATOMS; i += get_global_size(0))
oldPositions[i]
=
posq[i]
;
}
platforms/opencl/src/kernels/gbsaObc.cl
0 → 100644
View file @
93c467b2
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#
define
WARPS_PER_GROUP
(
FORCE_WORK_GROUP_SIZE/TILE_SIZE
)
/**
 * Per-atom record staged in local memory by computeBornSum (the kernel declares
 * a __local AtomData1 array with one entry per work-item).
 */
typedef struct {
    real x;             // filled from the .x component of the atom's posq entry
    real y;             // filled from the .y component of the atom's posq entry
    real z;             // filled from the .z component of the atom's posq entry
    real q;             // filled from the .w component of the atom's posq entry
    float radius;       // filled from the .x component of the atom's global_params entry
    float scaledRadius; // filled from the .y component of the atom's global_params entry
    real bornSum;       // presumably the Born sum accumulated for this atom -- confirm against computeBornSum
} AtomData1;
/**
 * Compute the Born sum.
 *
 * Work is organised in TILE_SIZE x TILE_SIZE atom tiles. Every run of TILE_SIZE
 * consecutive work-items (a "warp") handles an equal share of the tiles: first the
 * tiles listed in exclusionTiles, then the remaining tiles, which come either from
 * the neighbor list (USE_CUTOFF and the list did not overflow) or from enumerating
 * the whole upper triangle of tile pairs.
 *
 * @param global_bornSum     accumulator for the result. With SUPPORTS_64_BIT_ATOMICS it is a
 *                           fixed-point (value * 2^32) long indexed by atom; otherwise a real
 *                           buffer indexed by atom + warp*PADDED_NUM_ATOMS
 * @param posq               per-atom float4; xyz is the position, w is copied into the q field
 * @param global_params      per-atom (radius, scaledRadius)
 * @param tiles              (USE_CUTOFF) neighbor-list tiles; .x is the tile's x block and .y is
 *                           used as the singlePeriodicCopy flag
 * @param interactionCount   (USE_CUTOFF) element 0 is the number of neighbor-list tiles; if it
 *                           exceeds maxTiles all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are enumerated
 * @param periodicBoxSize    (USE_CUTOFF) periodic box dimensions
 * @param invPeriodicBoxSize (USE_CUTOFF) reciprocal of the box dimensions
 * @param maxTiles           (USE_CUTOFF) capacity of the neighbor list
 * @param blockCenter        (USE_CUTOFF) per-block center used to wrap a tile into one periodic copy
 * @param interactingAtoms   (USE_CUTOFF) atom index for each lane of each neighbor-list tile
 * @param numTiles           (no cutoff) number of tiles to enumerate
 * @param exclusionTiles     (x, y) block indices of the tiles that contain exclusions
 *
 * NOTE(review): SYNC_WARPS is defined outside this file; the statement order around it is
 * kept exactly as in the original.
 */
__kernel void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict global_bornSum,
#else
        __global real* restrict global_bornSum,
#endif
        __global const real4* restrict posq,
        __global const float2* restrict global_params,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles,
        __global const unsigned int* restrict interactionCount,
        real4 periodicBoxSize,
        real4 invPeriodicBoxSize,
        unsigned int maxTiles,
        __global const real4* restrict blockCenter,
        __global const int* restrict interactingAtoms,
#else
        unsigned int numTiles,
#endif
        __global const ushort2* exclusionTiles) {
    const unsigned int localId = get_local_id(0);
    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    const unsigned int warp = get_global_id(0)/TILE_SIZE;
    const unsigned int tgx = localId & (TILE_SIZE-1);  // lane inside the warp
    const unsigned int tbx = localId - tgx;            // first local slot owned by the warp
    __local AtomData1 localData[FORCE_WORK_GROUP_SIZE];
    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
    __local int skipTiles[FORCE_WORK_GROUP_SIZE];

    // Pass 1: the tiles that contain exclusions, divided evenly among the warps.

    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        const unsigned int atom1 = x*TILE_SIZE + tgx;
        const real4 posq1 = posq[atom1];
        const float2 params1 = global_params[atom1];
        real bornSum = 0.0f;
        if (x != y) {
            // Off-diagonal tile: stage the y-block atoms, then visit every pair once,
            // accumulating into both atoms of the pair.

            const unsigned int stagedAtom = y*TILE_SIZE + tgx;
            const real4 tempPosq = posq[stagedAtom];
            const float2 tempParams = global_params[stagedAtom];
            localData[localId].x = tempPosq.x;
            localData[localId].y = tempPosq.y;
            localData[localId].z = tempPosq.z;
            localData[localId].q = tempPosq.w;
            localData[localId].radius = tempParams.x;
            localData[localId].scaledRadius = tempParams.y;
            localData[localId].bornSum = 0.0f;
            SYNC_WARPS;

            // Each lane starts at a different column (tj = tgx) and rotates, so no two
            // lanes update the same localData slot in the same step.

            unsigned int tj = tgx;
            for (unsigned int step = 0; step < TILE_SIZE; step++) {
                const unsigned int slot = tbx+tj;
                real4 delta = (real4) (localData[slot].x-posq1.x, localData[slot].y-posq1.y, localData[slot].z-posq1.z, 0);
#ifdef USE_PERIODIC
                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                const real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                bool interacts = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
#ifdef USE_CUTOFF
                interacts = (interacts && r2 < CUTOFF_SQUARED);
#endif
                if (interacts) {
                    const real invR = RSQRT(r2);
                    const real r = RECIP(invR);
                    const float2 params2 = (float2) (localData[slot].radius, localData[slot].scaledRadius);

                    // Term added to atom1 (this lane's private sum).
                    const real rScaledRadiusJ = r+params2.y;
                    if (params1.x < rScaledRadiusJ) {
                        const real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                        const real u_ij = RECIP(rScaledRadiusJ);
                        const real l_ij2 = l_ij*l_ij;
                        const real u_ij2 = u_ij*u_ij;
                        const real ratio = LOG(u_ij * RECIP(l_ij));
                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                         (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                    }

                    // Term added to the staged atom (shared local sum).
                    const real rScaledRadiusI = r+params1.y;
                    if (params2.x < rScaledRadiusI) {
                        const real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                        const real u_ij = RECIP(rScaledRadiusI);
                        const real l_ij2 = l_ij*l_ij;
                        const real u_ij2 = u_ij*u_ij;
                        const real ratio = LOG(u_ij * RECIP(l_ij));
                        real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                         (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                        term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                        localData[slot].bornSum += term;
                    }
                }
                tj = (tj + 1) & (TILE_SIZE - 1);
                SYNC_WARPS;
            }
        }
        else {
            // Diagonal tile: both atoms come from the same block, so only atom1's sum is
            // accumulated and the self pair (j == tgx) is skipped.

            localData[localId].x = posq1.x;
            localData[localId].y = posq1.y;
            localData[localId].z = posq1.z;
            localData[localId].q = posq1.w;
            localData[localId].radius = params1.x;
            localData[localId].scaledRadius = params1.y;
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                const unsigned int slot = tbx+j;
                real4 delta = (real4) (localData[slot].x-posq1.x, localData[slot].y-posq1.y, localData[slot].z-posq1.z, 0);
#ifdef USE_PERIODIC
                delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                const real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                bool interacts = (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS);
#ifdef USE_CUTOFF
                interacts = (interacts && r2 < CUTOFF_SQUARED);
#endif
                if (interacts) {
                    const real invR = RSQRT(r2);
                    const real r = RECIP(invR);
                    const float2 params2 = (float2) (localData[slot].radius, localData[slot].scaledRadius);
                    const real rScaledRadiusJ = r+params2.y;
                    if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
                        const real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                        const real u_ij = RECIP(rScaledRadiusJ);
                        const real l_ij2 = l_ij*l_ij;
                        const real u_ij2 = u_ij*u_ij;
                        const real ratio = LOG(u_ij * RECIP(l_ij));
                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                         (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                        bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                    }
                }
                SYNC_WARPS;
            }
        }

        // Store the sums for this tile. x*TILE_SIZE+tgx is atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
        atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
        if (x != y) {
            const unsigned int offset = y*TILE_SIZE + tgx;
            atom_add(&global_bornSum[offset], (long) (localData[localId].bornSum*0x100000000));
        }
#else
        const unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
        const unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        global_bornSum[offset1] += bornSum;
        if (x != y)
            global_bornSum[offset2] += localData[localId].bornSum;
#endif
    }

    // Pass 2: tiles without exclusions, either from the neighbor list (with cutoff) or by
    // enumerating all of them (no cutoff, or the neighbor list overflowed).

#ifdef USE_CUTOFF
    const unsigned int numTiles = interactionCount[0];
    const unsigned int tilesToProcess = (numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles);
#else
    const unsigned int tilesToProcess = numTiles;
#endif
    int pos = warp*tilesToProcess/totalWarps;
    int end = (warp+1)*tilesToProcess/totalWarps;
    int skipBase = 0;
    int currentSkipIndex = tbx;
    skipTiles[localId] = -1;

    for (; pos < end; pos++) {
        real bornSum = 0;
        bool includeTile = true;

        // Work out which tile this position refers to.

        unsigned int x, y;
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            // Neighbor-list tile: y is not needed because the partner atoms come from
            // interactingAtoms.
            const ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            singlePeriodicCopy = tileIndices.y;
        }
        else
#endif
        {
            // Invert the triangular numbering pos -> (x, y).
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }

            // Tiles with exclusions were handled in pass 1. Their linear indices are
            // streamed through skipTiles, TILE_SIZE at a time, and compared against pos.

            SYNC_WARPS;
            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
                SYNC_WARPS;
                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                    const ushort2 tile = exclusionTiles[skipBase+tgx];
                    skipTiles[localId] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
                else
                    skipTiles[localId] = end;
                skipBase += TILE_SIZE;
                currentSkipIndex = tbx;
                SYNC_WARPS;
            }
            while (skipTiles[currentSkipIndex] < pos)
                currentSkipIndex++;
            includeTile = (skipTiles[currentSkipIndex] != pos);
        }

        if (includeTile) {
            const unsigned int atom1 = x*TILE_SIZE + tgx;

            // Fetch this lane's atom and stage its partner atom in local memory.

            real4 posq1 = posq[atom1];
            const float2 params1 = global_params[atom1];
#ifdef USE_CUTOFF
            const unsigned int atom2 = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
            const unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
            atomIndices[localId] = atom2;
            if (atom2 < PADDED_NUM_ATOMS) {
                const real4 tempPosq = posq[atom2];
                const float2 tempParams = global_params[atom2];
                localData[localId].x = tempPosq.x;
                localData[localId].y = tempPosq.y;
                localData[localId].z = tempPosq.z;
                localData[localId].q = tempPosq.w;
                localData[localId].radius = tempParams.x;
                localData[localId].scaledRadius = tempParams.y;
                localData[localId].bornSum = 0.0f;
            }
            SYNC_WARPS;
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that all atoms of the tile can be translated into a
                // single periodic copy up front, so no wrapping is needed per interaction.

                const real4 blockCenterX = blockCenter[x];
                posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
                localData[localId].x -= floor((localData[localId].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                localData[localId].y -= floor((localData[localId].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                localData[localId].z -= floor((localData[localId].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (unsigned int step = 0; step < TILE_SIZE; step++) {
                    const unsigned int slot = tbx+tj;
                    const real4 delta = (real4) (localData[slot].x-posq1.x, localData[slot].y-posq1.y, localData[slot].z-posq1.z, 0);
                    const real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    const int partnerAtom = atomIndices[slot];
                    if (atom1 < NUM_ATOMS && partnerAtom < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                        const real invR = RSQRT(r2);
                        const real r = RECIP(invR);
                        const float2 params2 = (float2) (localData[slot].radius, localData[slot].scaledRadius);
                        const real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
                            const real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                            const real u_ij = RECIP(rScaledRadiusJ);
                            const real l_ij2 = l_ij*l_ij;
                            const real u_ij2 = u_ij*u_ij;
                            const real ratio = LOG(u_ij * RECIP(l_ij));
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                        }
                        const real rScaledRadiusI = r+params1.y;
                        if (params2.x < rScaledRadiusI) {
                            const real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                            const real u_ij = RECIP(rScaledRadiusI);
                            const real l_ij2 = l_ij*l_ij;
                            const real u_ij2 = u_ij*u_ij;
                            const real ratio = LOG(u_ij * RECIP(l_ij));
                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                            localData[slot].bornSum += term;
                        }
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }
            else
#endif
            {
                // Periodic boundary conditions (if any) are applied to each pair separately.

                unsigned int tj = tgx;
                for (unsigned int step = 0; step < TILE_SIZE; step++) {
                    const unsigned int slot = tbx+tj;
                    real4 delta = (real4) (localData[slot].x-posq1.x, localData[slot].y-posq1.y, localData[slot].z-posq1.z, 0);
#ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                    const real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                    const int partnerAtom = atomIndices[slot];
                    bool interacts = (atom1 < NUM_ATOMS && partnerAtom < NUM_ATOMS);
#ifdef USE_CUTOFF
                    interacts = (interacts && r2 < CUTOFF_SQUARED);
#endif
                    if (interacts) {
                        const real invR = RSQRT(r2);
                        const real r = RECIP(invR);
                        const float2 params2 = (float2) (localData[slot].radius, localData[slot].scaledRadius);
                        const real rScaledRadiusJ = r+params2.y;
                        if (params1.x < rScaledRadiusJ) {
                            const real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                            const real u_ij = RECIP(rScaledRadiusJ);
                            const real l_ij2 = l_ij*l_ij;
                            const real u_ij2 = u_ij*u_ij;
                            const real ratio = LOG(u_ij * RECIP(l_ij));
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                            bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
                        }
                        const real rScaledRadiusI = r+params1.y;
                        if (params2.x < rScaledRadiusI) {
                            const real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                            const real u_ij = RECIP(rScaledRadiusI);
                            const real l_ij2 = l_ij*l_ij;
                            const real u_ij2 = u_ij*u_ij;
                            const real ratio = LOG(u_ij * RECIP(l_ij));
                            real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                            term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
                            localData[slot].bornSum += term;
                        }
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }

            // Store the sums for this tile. atom2 is the index this lane staged above.

#ifdef SUPPORTS_64_BIT_ATOMICS
            atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
            if (atom2 < PADDED_NUM_ATOMS)
                atom_add(&global_bornSum[atom2], (long) (localData[localId].bornSum*0x100000000));
#else
            const unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
            const unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
            global_bornSum[offset1] += bornSum;
            if (atom2 < PADDED_NUM_ATOMS)
                global_bornSum[offset2] += localData[localId].bornSum;
#endif
        }
    }
}
/**
 * Per-atom record that computeGBSAForce1 stages in local memory for the second
 * atom of each pair in a tile.
 */
typedef struct {
    real x;           // position, copied from posq.x
    real y;           // position, copied from posq.y
    real z;           // position, copied from posq.z
    real q;           // copied from posq.w
    real fx;          // force accumulated on this atom within the current tile
    real fy;
    real fz;
    real fw;          // accumulated Born-force term, written to global_bornForce
    real bornRadius;  // copied from global_bornRadii
} AtomData2;
/**
 * First part of computing the GBSA interaction.
 *
 * For every atom pair in every tile this evaluates the pair energy
 * PREFACTOR*q1*q2/sqrt(r^2 + R1*R2*exp(-r^2/(4*R1*R2))), accumulates the resulting
 * Cartesian force on both atoms, and accumulates the derivative with respect to the
 * Born radii (force.w / fw) into global_bornForce.
 *
 * Tiles are split among warps of TILE_SIZE work-items exactly as in computeBornSum:
 * first the tiles listed in exclusionTiles, then the remaining tiles taken from the
 * neighbor list (USE_CUTOFF) or enumerated over the upper triangle of tile pairs.
 *
 * @param forceBuffers       force accumulator. With SUPPORTS_64_BIT_ATOMICS it is a fixed-point
 *                           (value * 2^32) long array laid out as x, y, z planes of
 *                           PADDED_NUM_ATOMS entries; otherwise a real4 array indexed by
 *                           atom + warp*PADDED_NUM_ATOMS
 * @param global_bornForce   accumulator for the Born-radius derivative, same indexing scheme
 * @param energyBuffer       per-work-item energy accumulator (indexed by global id)
 * @param posq               per-atom float4; xyz is the position, w is used as the charge
 * @param global_bornRadii   per-atom Born radius
 * @param tiles              (USE_CUTOFF) neighbor-list tiles; .x is the tile's x block and .y is
 *                           used as the singlePeriodicCopy flag
 * @param interactionCount   (USE_CUTOFF) element 0 is the number of neighbor-list tiles; if it
 *                           exceeds maxTiles all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are enumerated
 * @param periodicBoxSize    (USE_CUTOFF) periodic box dimensions
 * @param invPeriodicBoxSize (USE_CUTOFF) reciprocal of the box dimensions
 * @param maxTiles           (USE_CUTOFF) capacity of the neighbor list
 * @param blockCenter        (USE_CUTOFF) per-block center used to wrap a tile into one periodic copy
 * @param interactingAtoms   (USE_CUTOFF) atom index for each lane of each neighbor-list tile
 * @param numTiles           (no cutoff) number of tiles to enumerate
 * @param exclusionTiles     (x, y) block indices of the tiles that contain exclusions
 *
 * Synchronisation: every lane reads localData slots written by the other lanes of its
 * warp, so a SYNC_WARPS follows each staging of localData and each loop step, always
 * outside per-lane conditionals (the same placement computeBornSum uses).
 * NOTE(review): SYNC_WARPS is defined outside this file; presumably it expands to a
 * local-memory barrier on devices whose SIMD width is below TILE_SIZE - confirm.
 */
__kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif
        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms,
#else
        unsigned int numTiles,
#endif
        __global const ushort2* exclusionTiles) {
    const unsigned int localId = get_local_id(0);
    const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    const unsigned int warp = get_global_id(0)/TILE_SIZE;
    const unsigned int tgx = localId & (TILE_SIZE-1);  // lane inside the warp
    const unsigned int tbx = localId - tgx;            // first local slot owned by the warp
    real energy = 0.0f;
    __local AtomData2 localData[FORCE_WORK_GROUP_SIZE];

    // First loop: process tiles that contain exclusions.

    const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
    for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
        const ushort2 tileIndices = exclusionTiles[pos];
        const unsigned int x = tileIndices.x;
        const unsigned int y = tileIndices.y;
        real4 force = 0.0f;
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 posq1 = posq[atom1];
        real bornRadius1 = global_bornRadii[atom1];
        if (x == y) {
            // This tile is on the diagonal.

            localData[localId].x = posq1.x;
            localData[localId].y = posq1.y;
            localData[localId].z = posq1.z;
            localData[localId].q = posq1.w;
            localData[localId].bornRadius = bornRadius1;

            // Make the staged atoms visible to the whole warp before anyone reads them.
            SYNC_WARPS;
            for (unsigned int j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
                    real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real bornRadius2 = localData[tbx+j].bornRadius;
                        real alpha2_ij = bornRadius1*bornRadius2;
                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
                        real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                        force.w += dGpol_dalpha2_ij*bornRadius2;
                        // Half weight: on a diagonal tile each pair is visited from both of
                        // its atoms (and j == tgx is the self term).
                        energy += 0.5f*tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
                    }
#endif
                }
                // Kept outside the bounds check so every lane of the warp reaches it.
                SYNC_WARPS;
            }
        }
        else {
            // This is an off-diagonal tile.

            unsigned int j = y*TILE_SIZE + tgx;
            real4 tempPosq = posq[j];
            localData[localId].x = tempPosq.x;
            localData[localId].y = tempPosq.y;
            localData[localId].z = tempPosq.z;
            localData[localId].q = tempPosq.w;
            localData[localId].bornRadius = global_bornRadii[j];
            localData[localId].fx = 0.0f;
            localData[localId].fy = 0.0f;
            localData[localId].fz = 0.0f;
            localData[localId].fw = 0.0f;

            // Make the staged atoms visible to the whole warp before anyone reads them.
            SYNC_WARPS;

            // Each lane starts at a different column (tj = tgx) and rotates, so no two
            // lanes update the same localData slot in the same step.
            unsigned int tj = tgx;
            for (j = 0; j < TILE_SIZE; j++) {
                if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
                    real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
                    real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                    delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (r2 < CUTOFF_SQUARED) {
#endif
                        real bornRadius2 = localData[tbx+tj].bornRadius;
                        real alpha2_ij = bornRadius1*bornRadius2;
                        real D_ij = r2*RECIP(4.0f*alpha2_ij);
                        real expTerm = EXP(-D_ij);
                        real denominator2 = r2 + alpha2_ij*expTerm;
                        real denominator = SQRT(denominator2);
                        real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                        real Gpol = tempEnergy*RECIP(denominator2);
                        real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                        real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                        force.w += dGpol_dalpha2_ij*bornRadius2;
                        energy += tempEnergy;
                        delta.xyz *= dEdR;
                        force.xyz -= delta.xyz;
                        localData[tbx+tj].fx += delta.x;
                        localData[tbx+tj].fy += delta.y;
                        localData[tbx+tj].fz += delta.z;
                        localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
                    }
#endif
                }
                tj = (tj + 1) & (TILE_SIZE - 1);
                SYNC_WARPS;
            }
        }

        // Write results.

#ifdef SUPPORTS_64_BIT_ATOMICS
        unsigned int offset = x*TILE_SIZE + tgx;
        atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
        atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
        atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
        atom_add(&global_bornForce[offset], (long) (force.w*0x100000000));
        if (x != y) {
            offset = y*TILE_SIZE + tgx;
            atom_add(&forceBuffers[offset], (long) (localData[localId].fx*0x100000000));
            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[localId].fy*0x100000000));
            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[localId].fz*0x100000000));
            atom_add(&global_bornForce[offset], (long) (localData[localId].fw*0x100000000));
        }
#else
        unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
        forceBuffers[offset1].xyz += force.xyz;
        global_bornForce[offset1] += force.w;
        if (x != y) {
            forceBuffers[offset2] += (real4) (localData[localId].fx, localData[localId].fy, localData[localId].fz, 0.0f);
            global_bornForce[offset2] += localData[localId].fw;
        }
#endif
    }

    // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
    // of them (no cutoff).

#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
    int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
    int pos = warp*numTiles/totalWarps;
    int end = (warp+1)*numTiles/totalWarps;
#endif
    int skipBase = 0;
    int currentSkipIndex = tbx;
    __local int atomIndices[FORCE_WORK_GROUP_SIZE];
    __local int skipTiles[FORCE_WORK_GROUP_SIZE];
    skipTiles[localId] = -1;

    while (pos < end) {
        real4 force = 0;
        bool includeTile = true;

        // Extract the coordinates of this tile.

        unsigned int x, y;
        bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            // Neighbor-list tile: y is not needed because the partner atoms come from
            // interactingAtoms.
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            singlePeriodicCopy = tileIndices.y;
        }
        else
#endif
        {
            // Invert the triangular numbering pos -> (x, y).
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }

            // Skip over tiles that have exclusions, since they were already processed.

            SYNC_WARPS;
            while (skipTiles[tbx+TILE_SIZE-1] < pos) {
                SYNC_WARPS;
                if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
                    ushort2 tile = exclusionTiles[skipBase+tgx];
                    skipTiles[localId] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
                }
                else
                    skipTiles[localId] = end;
                skipBase += TILE_SIZE;
                currentSkipIndex = tbx;
                SYNC_WARPS;
            }
            while (skipTiles[currentSkipIndex] < pos)
                currentSkipIndex++;
            includeTile = (skipTiles[currentSkipIndex] != pos);
        }
        if (includeTile) {
            unsigned int atom1 = x*TILE_SIZE + tgx;

            // Load atom data for this tile.

            real4 posq1 = posq[atom1];
            real bornRadius1 = global_bornRadii[atom1];
#ifdef USE_CUTOFF
            unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
            unsigned int j = y*TILE_SIZE + tgx;
#endif
            atomIndices[localId] = j;
            if (j < PADDED_NUM_ATOMS) {
                real4 tempPosq = posq[j];
                localData[localId].x = tempPosq.x;
                localData[localId].y = tempPosq.y;
                localData[localId].z = tempPosq.z;
                localData[localId].q = tempPosq.w;
                localData[localId].bornRadius = global_bornRadii[j];
                localData[localId].fx = 0.0f;
                localData[localId].fy = 0.0f;
                localData[localId].fz = 0.0f;
                localData[localId].fw = 0.0f;
            }

            // Make atomIndices and the staged atoms visible to the whole warp; the
            // non-singlePeriodicCopy path below reads them with no other barrier first.
            SYNC_WARPS;
#ifdef USE_PERIODIC
            if (singlePeriodicCopy) {
                // The box is small enough that we can just translate all the atoms into a single periodic
                // box, then skip having to apply periodic boundary conditions later.

                real4 blockCenterX = blockCenter[x];
                posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
                localData[localId].x -= floor((localData[localId].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                localData[localId].y -= floor((localData[localId].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                localData[localId].z -= floor((localData[localId].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
                SYNC_WARPS;
                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                        real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                        if (r2 < CUTOFF_SQUARED) {
                            real bornRadius2 = localData[tbx+tj].bornRadius;
                            real alpha2_ij = bornRadius1*bornRadius2;
                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
                            real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                            force.w += dGpol_dalpha2_ij*bornRadius2;
                            energy += tempEnergy;
                            delta.xyz *= dEdR;
                            force.xyz -= delta.xyz;
                            localData[tbx+tj].fx += delta.x;
                            localData[tbx+tj].fy += delta.y;
                            localData[tbx+tj].fz += delta.z;
                            localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
                        }
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }
            else
#endif
            {
                // We need to apply periodic boundary conditions separately for each interaction.

                unsigned int tj = tgx;
                for (j = 0; j < TILE_SIZE; j++) {
                    int atom2 = atomIndices[tbx+tj];
                    if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
                        real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                        delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                        if (r2 < CUTOFF_SQUARED) {
#endif
                            real bornRadius2 = localData[tbx+tj].bornRadius;
                            real alpha2_ij = bornRadius1*bornRadius2;
                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
                            real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                            force.w += dGpol_dalpha2_ij*bornRadius2;
                            energy += tempEnergy;
                            delta.xyz *= dEdR;
                            force.xyz -= delta.xyz;
                            localData[tbx+tj].fx += delta.x;
                            localData[tbx+tj].fy += delta.y;
                            localData[tbx+tj].fz += delta.z;
                            localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
                        }
#endif
                    }
                    tj = (tj + 1) & (TILE_SIZE - 1);
                    SYNC_WARPS;
                }
            }

            // Write results.

#ifdef USE_CUTOFF
            unsigned int atom2 = atomIndices[localId];
#else
            unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
            atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
            atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
            atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
            atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
            if (atom2 < PADDED_NUM_ATOMS) {
                atom_add(&forceBuffers[atom2], (long) (localData[localId].fx*0x100000000));
                atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[localId].fy*0x100000000));
                atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[localId].fz*0x100000000));
                atom_add(&global_bornForce[atom2], (long) (localData[localId].fw*0x100000000));
            }
#else
            unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
            unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
            forceBuffers[offset1].xyz += force.xyz;
            global_bornForce[offset1] += force.w;
            if (atom2 < PADDED_NUM_ATOMS) {
                forceBuffers[offset2] += (real4) (localData[localId].fx, localData[localId].fy, localData[localId].fz, 0.0f);
                global_bornForce[offset2] += localData[localId].fw;
            }
#endif
        }
        pos++;
    }
    energyBuffer[get_global_id(0)] += energy;
}
platforms/opencl/src/kernels/gbsaObc_cpu.cl
View file @
93c467b2
#
define
TILE_SIZE
32
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
typedef
struct
{
real
x,
y,
z
;
...
...
@@ -10,58 +12,42 @@ typedef struct {
/**
*
Compute
the
Born
sum.
*/
__kernel
void
computeBornSum
(
__global
real*
restrict
global_bornSum,
__global
const
real4*
restrict
posq,
__global
const
float2*
restrict
global_params,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
unsigned
int*
restrict
interactionFlags
)
{
__kernel
void
computeBornSum
(
#
ifdef
SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
global_bornSum,
#
else
unsigned
int
numTiles
)
{
__global
real*
restrict
global_bornSum,
#
endif
__global
const
real4*
restrict
posq,
__global
const
float2*
restrict
global_params,
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles,
__global
const
real4*
restrict
blockCenter,
__global
const
int*
restrict
interactingAtoms,
#
else
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
numTiles,
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
__global
const
ushort2*
exclusionTiles
)
{
__local
AtomData1
localData[TILE_SIZE]
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x,
y
;
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
// Load the data for this tile if we don't already have it cached.
if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[localAtomIndex].radius = tempParams.x;
localData[localAtomIndex].scaledRadius = tempParams.y;
}
//
First
loop:
process
tiles
that
contain
exclusions.
const
unsigned
int
firstExclusionTile
=
FIRST_EXCLUSION_TILE+get_group_id
(
0
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
const
unsigned
int
lastExclusionTile
=
FIRST_EXCLUSION_TILE+
(
get_group_id
(
0
)
+1
)
*
(
LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE
)
/get_num_groups
(
0
)
;
for
(
int
pos
=
firstExclusionTile
; pos < lastExclusionTile; pos++) {
const
ushort2
tileIndices
=
exclusionTiles[pos]
;
const
unsigned
int
x
=
tileIndices.x
;
const
unsigned
int
y
=
tileIndices.y
;
//
Load
the
data
for
this
tile.
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned
int
j
=
y*TILE_SIZE
+
localAtomIndex
;
real4
tempPosq
=
posq[j]
;
localData[localAtomIndex].x
=
tempPosq.x
;
localData[localAtomIndex].y
=
tempPosq.y
;
localData[localAtomIndex].z
=
tempPosq.z
;
localData[localAtomIndex].q
=
tempPosq.w
;
float2
tempParams
=
global_params[j]
;
localData[localAtomIndex].radius
=
tempParams.x
;
localData[localAtomIndex].scaledRadius
=
tempParams.y
;
}
if
(
x
==
y
)
{
//
This
tile
is
on
the
diagonal.
...
...
@@ -93,31 +79,31 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
real
l_ij2
=
l_ij*l_ij
;
real
u_ij2
=
u_ij*u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
))
;
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
bornSum
+=
l_ij
-
u_ij
+
(
0.50f*invR*ratio
)
+
0.25f*
(
r*
(
u_ij2-l_ij2
)
+
(
params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
))
;
bornSum
+=
(
params1.x
<
params2.y-r
?
2.0f*
(
RECIP
(
params1.x
)
-l_ij
)
:
0
)
;
}
}
}
//
Write
results.
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&global_bornSum[atom1],
(
long
)
(
bornSum*0x100000000
))
;
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_bornSum[offset]
+=
bornSum
;
#
endif
}
}
else
{
//
This
is
an
off-diagonal
tile.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++)
localData[tgx].bornSum = 0.0f;
// Compute the full set of interactions in this tile.
localData[tgx].bornSum
=
0
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real bornSum = 0
.0f
;
real
bornSum
=
0
;
real4
posq1
=
posq[atom1]
;
float2
params1
=
global_params[atom1]
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
...
...
@@ -126,7 +112,7 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
real r2 = d
ot(
delta.x
yz,
delta.
xyz)
;
real
r2
=
d
elta.x*
delta.x
+
delta.
y*delta.y
+
delta.z*delta.z
;
#
ifdef
USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y*TILE_SIZE+j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
...
...
@@ -134,8 +120,6 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
float2
params2
=
(
float2
)
(
localData[j].radius,
localData[j].scaledRadius
)
;
real
rScaledRadiusJ
=
r+params2.y
;
if
(
params1.x
<
rScaledRadiusJ
)
{
...
...
@@ -144,10 +128,9 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
real
l_ij2
=
l_ij*l_ij
;
real
u_ij2
=
u_ij*u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
))
;
bornSum += l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2);
if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
bornSum
+=
l_ij
-
u_ij
+
(
0.50f*invR*ratio
)
+
0.25f*
(
r*
(
u_ij2-l_ij2
)
+
(
params2.y*params2.y*invR
)
*
(
l_ij2-u_ij2
))
;
bornSum
+=
(
params1.x
<
params2.y-r
?
2.0f*
(
RECIP
(
params1.x
)
-l_ij
)
:
0
)
;
}
real
rScaledRadiusI
=
r+params1.y
;
if
(
params2.x
<
rScaledRadiusI
)
{
...
...
@@ -156,10 +139,9 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
real
l_ij2
=
l_ij*l_ij
;
real
u_ij2
=
u_ij*u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
))
;
real term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
if (params2.x < params1.y-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
real
term
=
l_ij
-
u_ij
+
(
0.50f*invR*ratio
)
+
0.25f*
(
r*
(
u_ij2-l_ij2
)
+
(
params1.y*params1.y*invR
)
*
(
l_ij2-u_ij2
))
;
term
+=
(
params2.x
<
params1.y-r
?
2.0f*
(
RECIP
(
params2.x
)
-l_ij
)
:
0
)
;
localData[j].bornSum
+=
term
;
}
}
...
...
@@ -167,91 +149,299 @@ __kernel void computeBornSum(__global real* restrict global_bornSum, __global co
//
Write
results
for
atom1.
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&global_bornSum[atom1],
(
long
)
(
bornSum*0x100000000
))
;
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_bornSum[offset]
+=
bornSum
;
#
endif
}
// Write results
//
Write
results
.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
#
ifdef
SUPPORTS_64_BIT_ATOMICS
unsigned
int
offset
=
y*TILE_SIZE
+
tgx
;
atom_add
(
&global_bornSum[offset],
(
long
)
(
localData[tgx].bornSum*0x100000000
))
;
#
else
unsigned
int
offset
=
y*TILE_SIZE+tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
global_bornSum[offset]
+=
localData[tgx].bornSum
;
#
endif
}
}
lasty = y;
pos++;
}
}
typedef struct {
real x, y, z;
real q;
real fx, fy, fz, fw;
real bornRadius;
} AtomData2;
//
Second
loop:
tiles
without
exclusions,
either
from
the
neighbor
list
(
with
cutoff
)
or
just
enumerating
all
//
of
them
(
no
cutoff
)
.
/**
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags) {
#else
unsigned int numTiles) {
#endif
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
unsigned
int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
int
pos
=
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
int
end
=
(
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
#
else
unsigned
int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned
int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
real energy = 0.0f
;
unsigned int lasty = 0xFFFFFFFF
;
__local
AtomData2 localData
[TILE_SIZE];
int
nextToSkip
=
-1
;
int
currentSkipIndex
=
0
;
__local
int
atomIndices
[TILE_SIZE]
;
while
(
pos
<
end
)
{
// Extract the coordinates of this tile
bool
includeTile
=
true
;
//
Extract
the
coordinates
of
this
tile.
unsigned
int
x,
y
;
bool
singlePeriodicCopy
=
false
;
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y = tileIndices.y;
singlePeriodicCop
y
=
tileIndices.y
;
}
else
#
endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-
sqrt
((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-
SQRT
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
//
Load
the
data
for
this
tile
if
we
don
't
already
have
it
cached.
// Skip over tiles that have exclusions, since they were already processed.
while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
nextToSkip = end;
}
includeTile = (nextToSkip != pos);
}
if (includeTile) {
// Load the data for this tile.
if
(
lasty
!=
y
)
{
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned
int
j
=
y*TILE_SIZE
+
localAtomIndex
;
real4
tempPosq
=
posq[j]
;
localData[localAtomIndex].x
=
tempPosq.x
;
localData[localAtomIndex].y
=
tempPosq.y
;
localData[localAtomIndex].z
=
tempPosq.z
;
localData[localAtomIndex].q
=
tempPosq.w
;
localData[localAtomIndex].bornRadius
=
global_bornRadii[j]
;
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
#else
unsigned int j = y*TILE_SIZE+localAtomIndex;
#endif
atomIndices[localAtomIndex] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[localAtomIndex].radius = tempParams.x;
localData[localAtomIndex].scaledRadius = tempParams.y;
localData[localAtomIndex].bornSum = 0.0f;
}
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real bornSum = 0;
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
localData[j].bornSum += term;
}
}
}
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real bornSum = 0;
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j];
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
localData[j].bornSum += term;
}
}
}
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum;
#endif
}
}
// Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[tgx];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom2], (long) (localData[tgx].bornSum*0x100000000));
#else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum;
#endif
}
}
}
pos++;
}
}
typedef struct {
real x, y, z;
real q;
real fx, fy, fz, fw;
real bornRadius;
} AtomData2;
/**
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
__global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms,
#else
unsigned int numTiles,
#endif
__global const ushort2* exclusionTiles) {
real energy = 0.0f;
__local AtomData2 localData[TILE_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
// Load the data for this tile.
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
localData[localAtomIndex].bornRadius = global_bornRadii[j];
}
if (x == y) {
// This tile is on the diagonal.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4
force
=
0
.0f
;
real4 force = 0;
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
...
...
@@ -260,7 +450,7 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real
r2
=
d
ot
(
delta.x
yz,
delta.
xyz
)
;
real r2 = d
elta.x*
delta.x
+
delta.
y*delta.y + delta.z*delta.z
;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
...
...
@@ -277,35 +467,40 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += 0.5f*tempEnergy;
force.xyz
-=
delta.xyz*dEdR
;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
}
}
// Write results.
unsigned
int
offset
=
x*TILE_SIZE
+
tgx
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
global_bornForce[offset] += force.w;
#endif
}
}
else {
// This is an off-diagonal tile.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].fx
=
0
.0f
;
localData[tgx].fy
=
0
.0f
;
localData[tgx].fz
=
0
.0f
;
localData[tgx].fw
=
0
.0f
;
localData[tgx].fx = 0;
localData[tgx].fy = 0;
localData[tgx].fz = 0;
localData[tgx].fw = 0;
}
//
Compute
the
full
set
of
interactions
in
this
tile.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4
force
=
0
.0f
;
real4 force = 0;
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
...
...
@@ -314,7 +509,7 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real
r2
=
d
ot
(
delta.x
yz,
delta.
xyz
)
;
real r2 = d
elta.x*
delta.x
+
delta.
y*delta.y + delta.z*delta.z
;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
...
...
@@ -331,8 +526,8 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
...
...
@@ -343,16 +538,30 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
}
}
//
Write
results
for
atom1.
// Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
global_bornForce[offset] += force.w;
#endif
}
//
Write
results
// Write results
.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[tgx].fx*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
atom_add(&global_bornForce[offset], (long) (localData[tgx].fw*0x100000000));
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
...
...
@@ -360,9 +569,231 @@ __kernel void computeGBSAForce1(__global real4* restrict forceBuffers, __global
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
global_bornForce[offset] += localData[tgx].fw;
#endif
}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
int pos = get_group_id(0)*numTiles/get_num_groups(0);
int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
int nextToSkip = -1;
int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE];
while (pos < end) {
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y |
|
x
>=
NUM_BLOCKS
)
{
//
Occasionally
happens
due
to
roundoff
error.
y
+=
(
x
<
y
?
-1
:
1
)
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
}
//
Skip
over
tiles
that
have
exclusions,
since
they
were
already
processed.
while
(
nextToSkip
<
pos
)
{
if
(
currentSkipIndex
<
NUM_TILES_WITH_EXCLUSIONS
)
{
ushort2
tile
=
exclusionTiles[currentSkipIndex++]
;
nextToSkip
=
tile.x
+
tile.y*NUM_BLOCKS
-
tile.y*
(
tile.y+1
)
/2
;
}
else
nextToSkip
=
end
;
}
includeTile
=
(
nextToSkip
!=
pos
)
;
}
if
(
includeTile
)
{
//
Load
the
data
for
this
tile.
for
(
int
localAtomIndex
=
0
; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#
ifdef
USE_CUTOFF
unsigned
int
j
=
(
numTiles
<=
maxTiles
?
interactingAtoms[pos*TILE_SIZE+localAtomIndex]
:
y*TILE_SIZE+localAtomIndex
)
;
#
else
unsigned
int
j
=
y*TILE_SIZE+localAtomIndex
;
#
endif
atomIndices[localAtomIndex]
=
j
;
if
(
j
<
PADDED_NUM_ATOMS
)
{
real4
tempPosq
=
posq[j]
;
localData[localAtomIndex].x
=
tempPosq.x
;
localData[localAtomIndex].y
=
tempPosq.y
;
localData[localAtomIndex].z
=
tempPosq.z
;
localData[localAtomIndex].q
=
tempPosq.w
;
localData[localAtomIndex].bornRadius
=
global_bornRadii[j]
;
localData[localAtomIndex].fx
=
0.0f
;
localData[localAtomIndex].fy
=
0.0f
;
localData[localAtomIndex].fz
=
0.0f
;
localData[localAtomIndex].fw
=
0.0f
;
}
}
#
ifdef
USE_PERIODIC
if
(
singlePeriodicCopy
)
{
//
The
box
is
small
enough
that
we
can
just
translate
all
the
atoms
into
a
single
periodic
//
box,
then
skip
having
to
apply
periodic
boundary
conditions
later.
real4
blockCenterX
=
blockCenter[x]
;
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
localData[tgx].x
-=
floor
((
localData[tgx].x-blockCenterX.x
)
*invPeriodicBoxSize.x+0.5f
)
*periodicBoxSize.x
;
localData[tgx].y
-=
floor
((
localData[tgx].y-blockCenterX.y
)
*invPeriodicBoxSize.y+0.5f
)
*periodicBoxSize.y
;
localData[tgx].z
-=
floor
((
localData[tgx].z-blockCenterX.z
)
*invPeriodicBoxSize.z+0.5f
)
*periodicBoxSize.z
;
}
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real4
force
=
0
;
real4
posq1
=
posq[atom1]
;
posq1.xyz
-=
floor
((
posq1.xyz-blockCenterX.xyz
)
*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
float
bornRadius1
=
global_bornRadii[atom1]
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
real4
posq2
=
(
real4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
int
atom2
=
atomIndices[j]
;
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
real
bornRadius2
=
localData[j].bornRadius
;
real
alpha2_ij
=
bornRadius1*bornRadius2
;
real
D_ij
=
r2*RECIP
(
4.0f*alpha2_ij
)
;
real
expTerm
=
EXP
(
-D_ij
)
;
real
denominator2
=
r2
+
alpha2_ij*expTerm
;
real
denominator
=
SQRT
(
denominator2
)
;
real
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
*RECIP
(
denominator
)
;
real
Gpol
=
tempEnergy*RECIP
(
denominator2
)
;
real
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
real
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[j].fx
+=
delta.x
;
localData[j].fy
+=
delta.y
;
localData[j].fz
+=
delta.z
;
localData[j].fw
+=
dGpol_dalpha2_ij*bornRadius1
;
}
}
//
Write
results
for
atom1.
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom1],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
atom_add
(
&global_bornForce[atom1],
(
long
)
(
force.w*0x100000000
))
;
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz
;
global_bornForce[offset]
+=
force.w
;
#
endif
}
}
else
#
endif
{
//
We
need
to
apply
periodic
boundary
conditions
separately
for
each
interaction.
for
(
unsigned
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
unsigned
int
atom1
=
x*TILE_SIZE+tgx
;
real4
force
=
0
;
real4
posq1
=
posq[atom1]
;
float
bornRadius1
=
global_bornRadii[atom1]
;
for
(
unsigned
int
j
=
0
; j < TILE_SIZE; j++) {
real4
posq2
=
(
real4
)
(
localData[j].x,
localData[j].y,
localData[j].z,
localData[j].q
)
;
real4
delta
=
(
real4
)
(
posq2.xyz
-
posq1.xyz,
0
)
;
#
ifdef
USE_PERIODIC
delta.xyz
-=
floor
(
delta.xyz*invPeriodicBoxSize.xyz+0.5f
)
*periodicBoxSize.xyz
;
#
endif
real
r2
=
delta.x*delta.x
+
delta.y*delta.y
+
delta.z*delta.z
;
int
atom2
=
atomIndices[j]
;
#
ifdef
USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#
else
if
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
)
{
#
endif
real
invR
=
RSQRT
(
r2
)
;
real
r
=
RECIP
(
invR
)
;
real
bornRadius2
=
localData[j].bornRadius
;
real
alpha2_ij
=
bornRadius1*bornRadius2
;
real
D_ij
=
r2*RECIP
(
4.0f*alpha2_ij
)
;
real
expTerm
=
EXP
(
-D_ij
)
;
real
denominator2
=
r2
+
alpha2_ij*expTerm
;
real
denominator
=
SQRT
(
denominator2
)
;
real
tempEnergy
=
(
PREFACTOR*posq1.w*posq2.w
)
*RECIP
(
denominator
)
;
real
Gpol
=
tempEnergy*RECIP
(
denominator2
)
;
real
dGpol_dalpha2_ij
=
-0.5f*Gpol*expTerm*
(
1.0f+D_ij
)
;
real
dEdR
=
Gpol*
(
1.0f
-
0.25f*expTerm
)
;
force.w
+=
dGpol_dalpha2_ij*bornRadius2
;
energy
+=
tempEnergy
;
delta.xyz
*=
dEdR
;
force.xyz
-=
delta.xyz
;
localData[j].fx
+=
delta.x
;
localData[j].fy
+=
delta.y
;
localData[j].fz
+=
delta.z
;
localData[j].fw
+=
dGpol_dalpha2_ij*bornRadius1
;
}
}
//
Write
results
for
atom1.
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom1],
(
long
)
(
force.x*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+PADDED_NUM_ATOMS],
(
long
)
(
force.y*0x100000000
))
;
atom_add
(
&forceBuffers[atom1+2*PADDED_NUM_ATOMS],
(
long
)
(
force.z*0x100000000
))
;
atom_add
(
&global_bornForce[atom1],
(
long
)
(
force.w*0x100000000
))
;
#
else
unsigned
int
offset
=
atom1
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
forceBuffers[offset].xyz
=
forceBuffers[offset].xyz+force.xyz
;
global_bornForce[offset]
+=
force.w
;
#
endif
}
}
//
Write
results.
for
(
int
tgx
=
0
; tgx < TILE_SIZE; tgx++) {
#
ifdef
USE_CUTOFF
unsigned
int
atom2
=
atomIndices[tgx]
;
#
else
unsigned
int
atom2
=
y*TILE_SIZE
+
tgx
;
#
endif
if
(
atom2
<
PADDED_NUM_ATOMS
)
{
#
ifdef
SUPPORTS_64_BIT_ATOMICS
atom_add
(
&forceBuffers[atom2],
(
long
)
(
localData[tgx].fx*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+PADDED_NUM_ATOMS],
(
long
)
(
localData[tgx].fy*0x100000000
))
;
atom_add
(
&forceBuffers[atom2+2*PADDED_NUM_ATOMS],
(
long
)
(
localData[tgx].fz*0x100000000
))
;
atom_add
(
&global_bornForce[atom2],
(
long
)
(
localData[tgx].fw*0x100000000
))
;
#
else
unsigned
int
offset
=
atom2
+
get_group_id
(
0
)
*PADDED_NUM_ATOMS
;
real4
f
=
forceBuffers[offset]
;
f.x
+=
localData[tgx].fx
;
f.y
+=
localData[tgx].fy
;
f.z
+=
localData[tgx].fz
;
forceBuffers[offset]
=
f
;
global_bornForce[offset]
+=
localData[tgx].fw
;
#
endif
}
}
}
lasty
=
y
;
pos++
;
}
energyBuffer[get_global_id
(
0
)
]
+=
energy
;
...
...
platforms/opencl/src/kernels/gbsaObc_default.cl
deleted
100644 → 0
View file @
f6d4557d
#
ifdef
SUPPORTS_64_BIT_ATOMICS
#
pragma
OPENCL
EXTENSION
cl_khr_global_int32_base_atomics
:
enable
#
pragma
OPENCL
EXTENSION
cl_khr_int64_base_atomics
:
enable
#
endif
#
define
TILE_SIZE
32
typedef
struct
{
real
x,
y,
z
;
float
radius,
scaledRadius
;
}
AtomData1
;
/**
*
Compute
the
Born
sum.
*/
__kernel
__attribute__
((
reqd_work_group_size
(
FORCE_WORK_GROUP_SIZE,
1
,
1
)))
void
computeBornSum
(
#
ifdef
SUPPORTS_64_BIT_ATOMICS
__global
long*
restrict
global_bornSum,
#
else
__global
real*
restrict
global_bornSum,
#
endif
__global
const
real4*
restrict
posq,
__global
const
float2*
restrict
global_params,
#
ifdef
USE_CUTOFF
__global
const
ushort2*
restrict
tiles,
__global
const
unsigned
int*
restrict
interactionCount,
real4
periodicBoxSize,
real4
invPeriodicBoxSize,
unsigned
int
maxTiles
)
{
#
else
unsigned
int
numTiles
)
{
#
endif
#
ifdef
USE_CUTOFF
unsigned
int
numTiles
=
interactionCount[0]
;
unsigned
int
pos
=
get_group_id
(
0
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS*
(
NUM_BLOCKS+1
)
/2
:
numTiles
)
/get_num_groups
(
0
)
;
#
else
unsigned
int
pos
=
get_group_id
(
0
)
*numTiles/get_num_groups
(
0
)
;
unsigned
int
end
=
(
get_group_id
(
0
)
+1
)
*numTiles/get_num_groups
(
0
)
;
#
endif
unsigned
int
lasty
=
0xFFFFFFFF
;
__local
AtomData1
localData[TILE_SIZE]
;
__local
real
localBornSum[FORCE_WORK_GROUP_SIZE]
;
__local
real
localTemp[TILE_SIZE]
;
while
(
pos
<
end
)
{
//
Extract
the
coordinates
of
this
tile
unsigned
int
x,
y
;
#
ifdef
USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles[pos]
;
x
=
tileIndices.x
;
y
=
tileIndices.y
;
}
else
#
endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS+0.5f-sqrt
((
NUM_BLOCKS+0.5f
)
*
(
NUM_BLOCKS+0.5f
)
-2*pos
))
;
x
=
(
pos-y*NUM_BLOCKS+y*
(
y+1
)
/2
)
;
if
(
x
<
y
|
| x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx;
real bornSum = 0.0f;
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
if (x == y) {
// This tile is on the diagonal.
if (get_local_id(0) < TILE_SIZE) {
localData[get_local_id(0)].x = posq1.x;
localData[get_local_id(0)].y = posq1.y;
localData[get_local_id(0)].z = posq1.z;
localData[get_local_id(0)].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y;
}
barrier(CLK_LOCAL_MEM_FENCE);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
real4 delta = (real4) (localData[baseLocalAtom+j].x-posq1.x, localData[baseLocalAtom+j].y-posq1.y, localData[baseLocalAtom+j].z-posq1.z, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[baseLocalAtom+j].radius, localData[baseLocalAtom+j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && r2 < CUTOFF_SQUARED && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS && (j+baseLocalAtom != tgx) && (params1.x < rScaledRadiusJ));
#endif
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += (includeInteraction ? l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2) : (real) 0);
bornSum += (includeInteraction && params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : (real) 0);
}
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
localTemp[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) ((bornSum + localTemp[tgx])*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
global_bornSum[offset] += bornSum + localTemp[tgx];
#endif
}
// barrier not required here as localTemp is not accessed before encountering another barrier.
}
else {
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z;
float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y;
}
localBornSum[get_local_id(0)] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
real4 delta = (real4) (localData[tj].x-posq1.x, localData[tj].y-posq1.y, localData[tj].z-posq1.z, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED);
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = (float2) (localData[tj].radius, localData[tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
{
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
unsigned int includeTerm = (includeInteraction && params1.x < rScaledRadiusJ);
bornSum += (includeTerm ? l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params2.y*params2.y*invR)*(l_ij2-u_ij2) : (real) 0);
bornSum += (includeTerm && params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : (real) 0);
}
real rScaledRadiusI = r+params1.y;
{
real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + 0.25f*r*(u_ij2-l_ij2) + (0.50f*invR*ratio) +
(0.25f*params1.y*params1.y*invR)*(l_ij2-u_ij2);
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : (real) 0);
localBornSum[tj+localForceOffset] += (includeInteraction && params2.x < rScaledRadiusI ? term : (real) 0);
}
barrier(CLK_LOCAL_MEM_FENCE);
tj = (tj+1) & (TILE_SIZE-1);
}
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE)
localTemp[tgx] = bornSum;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset1], (long) ((bornSum + localTemp[tgx])*0x100000000));
atom_add(&global_bornSum[offset2], (long) ((localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE])*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Do both loads before both stores to minimize store-load waits.
real sum1 = global_bornSum[offset1];
real sum2 = global_bornSum[offset2];
sum1 += bornSum + localTemp[tgx];
sum2 += localBornSum[get_local_id(0)] + localBornSum[get_local_id(0)+TILE_SIZE];
global_bornSum[offset1] = sum1;
global_bornSum[offset2] = sum2;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
}
lasty = y;
pos++;
}
}
/**
 * Four-component accumulator (x, y, z, w) followed by one extra element.
 * The kernel code visible in this file only touches x, y, z and w; the
 * trailing element is never read or written there.
 * NOTE(review): the extra element presumably exists to stagger local-memory
 * accesses between neighbouring entries - confirm before changing the layout.
 */
typedef struct {
    real x;
    real y;
    real z;
    real w;
    real padding;
} PaddedUnalignedFloat4;
/**
 * Per-atom record held in local memory by computeGBSAForce1.
 */
typedef struct {
    // Atom position (filled from posq.x/.y/.z).
    real x;
    real y;
    real z;
    // Filled from posq.w; enters the energy as PREFACTOR*q1*q2.
    real q;
    // Filled from global_bornRadii.
    real bornRadius;
    // Scratch slots used to hand the partial force of the upper half of the
    // work group to the lower half before the result is written out.
    real temp_x;
    real temp_y;
    real temp_z;
    real temp_w;
} AtomData2;
/**
 * First part of computing the GBSA interaction.
 *
 * Every work group processes a contiguous range [pos, end) of atom tiles.  For each
 * tile (x, y) the work-items accumulate, for the atoms of block x (and, on off-diagonal
 * tiles, also for the atoms of block y), a force vector (x, y, z) plus a fourth
 * component (w) that is written to global_bornForce, and they accumulate the pair
 * energy into a private sum that is added to energyBuffer at the end.
 *
 * Work-items with local id < TILE_SIZE handle one half of the atoms of block y, the
 * remaining work-items handle the other half; the two partial results are combined
 * through local memory before being written out.
 * NOTE(review): the indexing (localForce[id+TILE_SIZE], baseLocalAtom) appears to
 * assume FORCE_WORK_GROUP_SIZE == 2*TILE_SIZE - confirm against the host code.
 *
 * @param forceBuffers       force accumulator.  With SUPPORTS_64_BIT_ATOMICS it is an array of
 *                           longs (components stored PADDED_NUM_ATOMS apart, values scaled by
 *                           0x100000000 and added atomically); otherwise an array of real4
 *                           that is read, incremented and stored back.
 * @param global_bornForce   accumulator for the w component (same storage scheme as above).
 * @param energyBuffer       energy accumulator, indexed by get_global_id(0).
 * @param posq               per-atom xyz position; the w component is used as the charge.
 * @param global_bornRadii   per-atom Born radius.
 * @param tiles              (USE_CUTOFF only) list of (x, y) tile indices to process.
 * @param interactionCount   (USE_CUTOFF only) element 0 holds the number of entries in tiles.
 * @param periodicBoxSize    (USE_CUTOFF only) box size used for the USE_PERIODIC wrap.
 * @param invPeriodicBoxSize (USE_CUTOFF only) reciprocal of the box size.
 * @param maxTiles           (USE_CUTOFF only) if interactionCount[0] exceeds this value the
 *                           tile list is ignored and all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles
 *                           are enumerated instead.
 * @param numTiles           (no cutoff) total number of tiles to distribute over the groups.
 */
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif
        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles) {
#else
        unsigned int numTiles) {
#endif
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
    unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
    unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
    real energy = 0.0f;
    // Sentinel that never equals a real block index, so the first off-diagonal tile always loads block y.
    unsigned int lasty = 0xFFFFFFFF;
    __local AtomData2 localData[TILE_SIZE];
    __local PaddedUnalignedFloat4 localForce[FORCE_WORK_GROUP_SIZE];

    while (pos < end) {
        // Extract the coordinates of this tile
        unsigned int x, y;
#ifdef USE_CUTOFF
        if (numTiles <= maxTiles) {
            ushort2 tileIndices = tiles[pos];
            x = tileIndices.x;
            y = tileIndices.y;
        }
        else
#endif
        {
            // Map the linear tile index onto (x, y) with x >= y.
            y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
            x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                y += (x < y ? -1 : 1);
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
            }
        }
        // First atom of block y handled by this half of the work group.
        unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
        // Index of this work-item's atom within block x.
        unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
        // Start of this half's private region inside localForce.
        unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
        unsigned int atom1 = x*TILE_SIZE + tgx;
        real4 force = 0.0f;
        real4 posq1 = posq[atom1];
        real bornRadius1 = global_bornRadii[atom1];
        if (x == y) {
            // This tile is on the diagonal.

            if (get_local_id(0) < TILE_SIZE) {
                localData[get_local_id(0)].x = posq1.x;
                localData[get_local_id(0)].y = posq1.y;
                localData[get_local_id(0)].z = posq1.z;
                localData[get_local_id(0)].q = posq1.w;
                localData[get_local_id(0)].bornRadius = bornRadius1;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+baseLocalAtom+j < NUM_ATOMS);
                real4 posq2 = (real4) (localData[baseLocalAtom+j].x, localData[baseLocalAtom+j].y, localData[baseLocalAtom+j].z, localData[baseLocalAtom+j].q);
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                real invR = RSQRT(r2);
                real r = RECIP(invR);
                real bornRadius2 = localData[baseLocalAtom+j].bornRadius;
                real alpha2_ij = bornRadius1*bornRadius2;
                real D_ij = r2*RECIP(4.0f*alpha2_ij);
                real expTerm = EXP(-D_ij);
                real denominator2 = r2 + alpha2_ij*expTerm;
                real denominator = SQRT(denominator2);
                real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                real Gpol = tempEnergy*RECIP(denominator2);
                real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
                dEdR = (r2 > CUTOFF_SQUARED ? (real) 0 : dEdR);
                tempEnergy = (r2 > CUTOFF_SQUARED ? (real) 0 : tempEnergy);
                dGpol_dalpha2_ij = (r2 > CUTOFF_SQUARED ? (real) 0 : dGpol_dalpha2_ij);
#endif
                force.w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius2 : (real) 0);
                // Each pair of a diagonal tile is visited from both sides, hence the factor 0.5.
                energy += (includeInteraction ? 0.5f*tempEnergy : (real) 0);
                delta.xyz *= (includeInteraction ? dEdR : (real) 0);
                force.xyz -= delta.xyz;
            }

            // Sum the forces and write results.

            if (get_local_id(0) >= TILE_SIZE) {
                localData[tgx].temp_x = force.x;
                localData[tgx].temp_y = force.y;
                localData[tgx].temp_z = force.z;
                localData[tgx].temp_w = force.w;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
                const unsigned int offset = x*TILE_SIZE + tgx;
                atom_add(&forceBuffers[offset], (long) ((force.x + localData[tgx].temp_x)*0x100000000));
                atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0x100000000));
                atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0x100000000));
                atom_add(&global_bornForce[offset], (long) ((force.w + localData[tgx].temp_w)*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                const unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
                const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
                // Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
                real4 sum = forceBuffers[offset];
                real global_sum = global_bornForce[offset];
                sum.x += force.x + localData[tgx].temp_x;
                sum.y += force.y + localData[tgx].temp_y;
                sum.z += force.z + localData[tgx].temp_z;
                global_sum += force.w + localData[tgx].temp_w;
                forceBuffers[offset] = sum;
                global_bornForce[offset] = global_sum;
#endif
            }
            // barrier not required here as localData[*]/temp_* is not accessed before encountering another barrier.
        }
        else {
            // This is an off-diagonal tile.

            if (lasty != y && get_local_id(0) < TILE_SIZE) {
                unsigned int j = y*TILE_SIZE + tgx;
                real4 tempPosq = posq[j];
                localData[get_local_id(0)].x = tempPosq.x;
                localData[get_local_id(0)].y = tempPosq.y;
                localData[get_local_id(0)].z = tempPosq.z;
                localData[get_local_id(0)].q = tempPosq.w;
                localData[get_local_id(0)].bornRadius = global_bornRadii[j];
            }
            localForce[get_local_id(0)].x = 0.0f;
            localForce[get_local_id(0)].y = 0.0f;
            localForce[get_local_id(0)].z = 0.0f;
            localForce[get_local_id(0)].w = 0.0f;
            barrier(CLK_LOCAL_MEM_FENCE);

            // Compute the full set of interactions in this tile.

            unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
            for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
                unsigned int includeInteraction = (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS);
                real4 posq2 = (real4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                real invR = RSQRT(r2);
                real r = RECIP(invR);
                real bornRadius2 = localData[tj].bornRadius;
                real alpha2_ij = bornRadius1*bornRadius2;
                real D_ij = r2*RECIP(4.0f*alpha2_ij);
                real expTerm = EXP(-D_ij);
                real denominator2 = r2 + alpha2_ij*expTerm;
                real denominator = SQRT(denominator2);
                real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                real Gpol = tempEnergy*RECIP(denominator2);
                real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
                dEdR = (r2 > CUTOFF_SQUARED ? (real) 0 : dEdR);
                tempEnergy = (r2 > CUTOFF_SQUARED ? (real) 0 : tempEnergy);
                dGpol_dalpha2_ij = (r2 > CUTOFF_SQUARED ? (real) 0 : dGpol_dalpha2_ij);
#endif
                force.w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius2 : (real) 0);
                energy += (includeInteraction ? tempEnergy : (real) 0);
                delta.xyz *= (includeInteraction ? dEdR : (real) 0);
                force.xyz -= delta.xyz;
                // Opposite contribution for the atom of block y, kept in this half's region of localForce.
                localForce[tj+localForceOffset].x += delta.x;
                localForce[tj+localForceOffset].y += delta.y;
                localForce[tj+localForceOffset].z += delta.z;
                localForce[tj+localForceOffset].w += (includeInteraction ? dGpol_dalpha2_ij*bornRadius1 : (real) 0);
                barrier(CLK_LOCAL_MEM_FENCE);
                tj = (tj+1) & (TILE_SIZE-1);
            }

            // Sum the forces and write results.

            if (get_local_id(0) >= TILE_SIZE) {
                localData[tgx].temp_x = force.x;
                localData[tgx].temp_y = force.y;
                localData[tgx].temp_z = force.z;
                localData[tgx].temp_w = force.w;
            }
            barrier(CLK_LOCAL_MEM_FENCE);
            if (get_local_id(0) < TILE_SIZE) {
                // No barrier may appear inside this branch: only part of the work group enters it,
                // and a barrier must be reached by every work-item of the group.  The barrier just
                // above already makes the temp_* and localForce writes visible.
#ifdef SUPPORTS_64_BIT_ATOMICS
                const unsigned int offset1 = x*TILE_SIZE + tgx;
                const unsigned int offset2 = y*TILE_SIZE + tgx;
                atom_add(&forceBuffers[offset1], (long) ((force.x + localData[tgx].temp_x)*0x100000000));
                atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].temp_y)*0x100000000));
                atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].temp_z)*0x100000000));
                atom_add(&global_bornForce[offset1], (long) ((force.w + localData[tgx].temp_w)*0x100000000));
                atom_add(&forceBuffers[offset2], (long) ((localForce[get_local_id(0)].x + localForce[get_local_id(0)+TILE_SIZE].x)*0x100000000));
                atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localForce[get_local_id(0)].y + localForce[get_local_id(0)+TILE_SIZE].y)*0x100000000));
                atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localForce[get_local_id(0)].z + localForce[get_local_id(0)+TILE_SIZE].z)*0x100000000));
                atom_add(&global_bornForce[offset2], (long) ((localForce[get_local_id(0)].w + localForce[get_local_id(0)+TILE_SIZE].w)*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
                const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
                const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
                // Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
                real4 sum1 = forceBuffers[offset1];
                real4 sum2 = forceBuffers[offset2];
                real global_sum1 = global_bornForce[offset1];
                real global_sum2 = global_bornForce[offset2];
                sum1.x += force.x + localData[tgx].temp_x;
                sum1.y += force.y + localData[tgx].temp_y;
                sum1.z += force.z + localData[tgx].temp_z;
                global_sum1 += force.w + localData[tgx].temp_w;
                sum2.x += localForce[get_local_id(0)].x + localForce[get_local_id(0)+TILE_SIZE].x;
                sum2.y += localForce[get_local_id(0)].y + localForce[get_local_id(0)+TILE_SIZE].y;
                sum2.z += localForce[get_local_id(0)].z + localForce[get_local_id(0)+TILE_SIZE].z;
                global_sum2 += localForce[get_local_id(0)].w + localForce[get_local_id(0)+TILE_SIZE].w;
                forceBuffers[offset1] = sum1;
                forceBuffers[offset2] = sum2;
                global_bornForce[offset1] = global_sum1;
                global_bornForce[offset2] = global_sum2;
#endif
            }
            // Keep localForce/localData stable until every work-item has finished reading them.
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        lasty = y;
        pos++;
    }
    energyBuffer[get_global_id(0)] += energy;
}
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment