Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
b21e3182
Commit
b21e3182
authored
Apr 20, 2015
by
Jason Swails
Browse files
Merge branch 'master' into psfinscode
parents
7b30da6e
3946c025
Changes
29
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
614 additions
and
223 deletions
+614
-223
CMakeLists.txt
CMakeLists.txt
+6
-4
openmmapi/include/openmm/internal/vectorize8.h
openmmapi/include/openmm/internal/vectorize8.h
+12
-0
openmmapi/include/openmm/internal/vectorize_neon.h
openmmapi/include/openmm/internal/vectorize_neon.h
+6
-2
openmmapi/include/openmm/internal/vectorize_pnacl.h
openmmapi/include/openmm/internal/vectorize_pnacl.h
+6
-2
openmmapi/include/openmm/internal/vectorize_sse.h
openmmapi/include/openmm/internal/vectorize_sse.h
+12
-0
platforms/cpu/include/CpuNeighborList.h
platforms/cpu/include/CpuNeighborList.h
+2
-1
platforms/cpu/include/CpuNonbondedForceVec4.h
platforms/cpu/include/CpuNonbondedForceVec4.h
+6
-6
platforms/cpu/include/CpuNonbondedForceVec8.h
platforms/cpu/include/CpuNonbondedForceVec8.h
+6
-6
platforms/cpu/src/CpuCustomManyParticleForce.cpp
platforms/cpu/src/CpuCustomManyParticleForce.cpp
+1
-1
platforms/cpu/src/CpuNeighborList.cpp
platforms/cpu/src/CpuNeighborList.cpp
+71
-41
platforms/cpu/src/CpuNonbondedForceVec4.cpp
platforms/cpu/src/CpuNonbondedForceVec4.cpp
+137
-43
platforms/cpu/src/CpuNonbondedForceVec8.cpp
platforms/cpu/src/CpuNonbondedForceVec8.cpp
+139
-45
platforms/opencl/include/OpenCLFFT3D.h
platforms/opencl/include/OpenCLFFT3D.h
+14
-6
platforms/opencl/include/OpenCLKernels.h
platforms/opencl/include/OpenCLKernels.h
+1
-0
platforms/opencl/src/OpenCLContext.cpp
platforms/opencl/src/OpenCLContext.cpp
+4
-5
platforms/opencl/src/OpenCLFFT3D.cpp
platforms/opencl/src/OpenCLFFT3D.cpp
+136
-30
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+41
-20
platforms/opencl/src/OpenCLPlatform.cpp
platforms/opencl/src/OpenCLPlatform.cpp
+5
-7
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+6
-3
platforms/opencl/src/kernels/customManyParticle.cl
platforms/opencl/src/kernels/customManyParticle.cl
+3
-1
No files found.
CMakeLists.txt
View file @
b21e3182
...
@@ -296,7 +296,9 @@ SET(OPENMM_BUILD_SHARED_LIB ON CACHE BOOL "Whether to build shared OpenMM librar
...
@@ -296,7 +296,9 @@ SET(OPENMM_BUILD_SHARED_LIB ON CACHE BOOL "Whether to build shared OpenMM librar
SET
(
EXTRA_LINK_FLAGS
${
EXTRA_COMPILE_FLAGS
}
)
SET
(
EXTRA_LINK_FLAGS
${
EXTRA_COMPILE_FLAGS
}
)
IF
(
CMAKE_SYSTEM_NAME MATCHES
"Linux"
)
IF
(
CMAKE_SYSTEM_NAME MATCHES
"Linux"
)
IF
(
NOT ANDROID
)
SET
(
EXTRA_LINK_FLAGS
"
${
EXTRA_LINK_FLAGS
}
-Wl,--no-as-needed -lrt"
)
SET
(
EXTRA_LINK_FLAGS
"
${
EXTRA_LINK_FLAGS
}
-Wl,--no-as-needed -lrt"
)
ENDIF
(
NOT ANDROID
)
ENDIF
(
CMAKE_SYSTEM_NAME MATCHES
"Linux"
)
ENDIF
(
CMAKE_SYSTEM_NAME MATCHES
"Linux"
)
IF
(
OPENMM_BUILD_SHARED_LIB
)
IF
(
OPENMM_BUILD_SHARED_LIB
)
...
@@ -403,11 +405,11 @@ MARK_AS_ADVANCED(CUDA_BUILD_CUBIN)
...
@@ -403,11 +405,11 @@ MARK_AS_ADVANCED(CUDA_BUILD_CUBIN)
MARK_AS_ADVANCED
(
CUDA_BUILD_EMULATION
)
MARK_AS_ADVANCED
(
CUDA_BUILD_EMULATION
)
FIND_PACKAGE
(
OpenCL QUIET
)
FIND_PACKAGE
(
OpenCL QUIET
)
IF
(
OPENCL_FOUND
AND NOT APPLE
)
IF
(
OPENCL_FOUND
)
SET
(
OPENMM_BUILD_OPENCL_LIB ON CACHE BOOL
"Build OpenMMOpenCL library"
)
SET
(
OPENMM_BUILD_OPENCL_LIB ON CACHE BOOL
"Build OpenMMOpenCL library"
)
ELSE
(
OPENCL_FOUND
AND NOT APPLE
)
ELSE
(
OPENCL_FOUND
)
SET
(
OPENMM_BUILD_OPENCL_LIB OFF CACHE BOOL
"Build OpenMMOpenCL library"
)
SET
(
OPENMM_BUILD_OPENCL_LIB OFF CACHE BOOL
"Build OpenMMOpenCL library"
)
ENDIF
(
OPENCL_FOUND
AND NOT APPLE
)
ENDIF
(
OPENCL_FOUND
)
IF
(
OPENMM_BUILD_OPENCL_LIB
)
IF
(
OPENMM_BUILD_OPENCL_LIB
)
ADD_SUBDIRECTORY
(
platforms/opencl
)
ADD_SUBDIRECTORY
(
platforms/opencl
)
ENDIF
(
OPENMM_BUILD_OPENCL_LIB
)
ENDIF
(
OPENMM_BUILD_OPENCL_LIB
)
...
...
openmmapi/include/openmm/internal/vectorize8.h
View file @
b21e3182
...
@@ -191,6 +191,18 @@ static inline fvec8 sqrt(const fvec8& v) {
...
@@ -191,6 +191,18 @@ static inline fvec8 sqrt(const fvec8& v) {
return
fvec8
(
_mm256_sqrt_ps
(
v
.
val
));
return
fvec8
(
_mm256_sqrt_ps
(
v
.
val
));
}
}
static
inline
fvec8
rsqrt
(
const
fvec8
&
v
)
{
// Initial estimate of rsqrt().
fvec8
y
(
_mm256_rsqrt_ps
(
v
.
val
));
// Perform an iteration of Newton refinement.
fvec8
x2
=
v
*
0.5
f
;
y
*=
fvec8
(
1.5
f
)
-
x2
*
y
*
y
;
return
y
;
}
static
inline
float
dot8
(
const
fvec8
&
v1
,
const
fvec8
&
v2
)
{
static
inline
float
dot8
(
const
fvec8
&
v1
,
const
fvec8
&
v2
)
{
fvec8
result
=
_mm256_dp_ps
(
v1
,
v2
,
0xF1
);
fvec8
result
=
_mm256_dp_ps
(
v1
,
v2
,
0xF1
);
return
_mm_cvtss_f32
(
result
.
lowerVec
())
+
_mm_cvtss_f32
(
result
.
upperVec
());
return
_mm_cvtss_f32
(
result
.
lowerVec
())
+
_mm_cvtss_f32
(
result
.
upperVec
());
...
...
openmmapi/include/openmm/internal/vectorize_neon.h
View file @
b21e3182
...
@@ -251,11 +251,15 @@ static inline fvec4 abs(const fvec4& v) {
...
@@ -251,11 +251,15 @@ static inline fvec4 abs(const fvec4& v) {
return
vabsq_f32
(
v
);
return
vabsq_f32
(
v
);
}
}
static
inline
fvec4
sqrt
(
const
fvec4
&
v
)
{
static
inline
fvec4
r
sqrt
(
const
fvec4
&
v
)
{
float32x4_t
recipSqrt
=
vrsqrteq_f32
(
v
);
float32x4_t
recipSqrt
=
vrsqrteq_f32
(
v
);
recipSqrt
=
vmulq_f32
(
recipSqrt
,
vrsqrtsq_f32
(
vmulq_f32
(
recipSqrt
,
v
),
recipSqrt
));
recipSqrt
=
vmulq_f32
(
recipSqrt
,
vrsqrtsq_f32
(
vmulq_f32
(
recipSqrt
,
v
),
recipSqrt
));
recipSqrt
=
vmulq_f32
(
recipSqrt
,
vrsqrtsq_f32
(
vmulq_f32
(
recipSqrt
,
v
),
recipSqrt
));
recipSqrt
=
vmulq_f32
(
recipSqrt
,
vrsqrtsq_f32
(
vmulq_f32
(
recipSqrt
,
v
),
recipSqrt
));
return
vmulq_f32
(
v
,
recipSqrt
);
return
recipSqrt
;
}
static
inline
fvec4
sqrt
(
const
fvec4
&
v
)
{
return
rsqrt
(
v
)
*
v
;
}
}
static
inline
float
dot3
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
float
dot3
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
...
...
openmmapi/include/openmm/internal/vectorize_pnacl.h
View file @
b21e3182
...
@@ -330,7 +330,7 @@ static inline fvec4 ceil(const fvec4& v) {
...
@@ -330,7 +330,7 @@ static inline fvec4 ceil(const fvec4& v) {
return
truncated
+
blend
(
0.0
f
,
1.0
f
,
truncated
<
v
);
return
truncated
+
blend
(
0.0
f
,
1.0
f
,
truncated
<
v
);
}
}
static
inline
fvec4
sqrt
(
const
fvec4
&
v
)
{
static
inline
fvec4
r
sqrt
(
const
fvec4
&
v
)
{
// Initial estimate of rsqrt().
// Initial estimate of rsqrt().
ivec4
i
=
(
__m128i
)
v
;
ivec4
i
=
(
__m128i
)
v
;
...
@@ -343,7 +343,11 @@ static inline fvec4 sqrt(const fvec4& v) {
...
@@ -343,7 +343,11 @@ static inline fvec4 sqrt(const fvec4& v) {
y
*=
1.5
f
-
x2
*
y
*
y
;
y
*=
1.5
f
-
x2
*
y
*
y
;
y
*=
1.5
f
-
x2
*
y
*
y
;
y
*=
1.5
f
-
x2
*
y
*
y
;
y
*=
1.5
f
-
x2
*
y
*
y
;
y
*=
1.5
f
-
x2
*
y
*
y
;
return
y
*
v
;
return
y
;
}
static
inline
fvec4
sqrt
(
const
fvec4
&
v
)
{
return
rsqrt
(
v
)
*
v
;
}
}
#endif
/*OPENMM_VECTORIZE_PNACL_H_*/
#endif
/*OPENMM_VECTORIZE_PNACL_H_*/
...
...
openmmapi/include/openmm/internal/vectorize_sse.h
View file @
b21e3182
...
@@ -241,6 +241,18 @@ static inline fvec4 sqrt(const fvec4& v) {
...
@@ -241,6 +241,18 @@ static inline fvec4 sqrt(const fvec4& v) {
return
fvec4
(
_mm_sqrt_ps
(
v
.
val
));
return
fvec4
(
_mm_sqrt_ps
(
v
.
val
));
}
}
static
inline
fvec4
rsqrt
(
const
fvec4
&
v
)
{
// Initial estimate of rsqrt().
fvec4
y
(
_mm_rsqrt_ps
(
v
.
val
));
// Perform an iteration of Newton refinement.
fvec4
x2
=
v
*
0.5
f
;
y
*=
fvec4
(
1.5
f
)
-
x2
*
y
*
y
;
return
y
;
}
static
inline
float
dot3
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
static
inline
float
dot3
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0x71
));
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0x71
));
}
}
...
...
platforms/cpu/include/CpuNeighborList.h
View file @
b21e3182
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2013 Stanford University and the Authors.
*
* Portions copyright (c) 2013
-2015
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -61,6 +61,7 @@ public:
...
@@ -61,6 +61,7 @@ public:
private:
private:
int
blockSize
;
int
blockSize
;
std
::
vector
<
int
>
sortedAtoms
;
std
::
vector
<
int
>
sortedAtoms
;
std
::
vector
<
float
>
sortedPositions
;
std
::
vector
<
std
::
vector
<
int
>
>
blockNeighbors
;
std
::
vector
<
std
::
vector
<
int
>
>
blockNeighbors
;
std
::
vector
<
std
::
vector
<
char
>
>
blockExclusions
;
std
::
vector
<
std
::
vector
<
char
>
>
blockExclusions
;
// The following variables are used to make information accessible to the individual threads.
// The following variables are used to make information accessible to the individual threads.
...
...
platforms/cpu/include/CpuNonbondedForceVec4.h
View file @
b21e3182
...
@@ -56,8 +56,8 @@ protected:
...
@@ -56,8 +56,8 @@ protected:
/**
/**
* Templatized implementation of calculateBlockIxn.
* Templatized implementation of calculateBlockIxn.
*/
*/
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
calculateBlockIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
);
void
calculateBlockIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
,
const
fvec4
&
blockCenter
);
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
...
@@ -74,15 +74,15 @@ protected:
...
@@ -74,15 +74,15 @@ protected:
/**
/**
* Templatized implementation of calculateBlockEwaldIxn.
* Templatized implementation of calculateBlockEwaldIxn.
*/
*/
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
calculateBlockEwaldIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
);
void
calculateBlockEwaldIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
,
const
fvec4
&
blockCenter
);
/**
/**
* Compute the displacement and squared distance between a collection of points, optionally using
* Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions.
* periodic boundary conditions.
*/
*/
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
getDeltaR
(
const
f
loat
*
posI
,
const
fvec4
&
x
,
const
fvec4
&
y
,
const
fvec4
&
z
,
fvec4
&
dx
,
fvec4
&
dy
,
fvec4
&
dz
,
fvec4
&
r2
,
bool
periodic
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
const
;
void
getDeltaR
(
const
f
vec4
&
posI
,
const
fvec4
&
x
,
const
fvec4
&
y
,
const
fvec4
&
z
,
fvec4
&
dx
,
fvec4
&
dy
,
fvec4
&
dz
,
fvec4
&
r2
,
bool
periodic
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
const
;
/**
/**
* Compute a fast approximation to erfc(x).
* Compute a fast approximation to erfc(x).
...
...
platforms/cpu/include/CpuNonbondedForceVec8.h
View file @
b21e3182
...
@@ -55,8 +55,8 @@ protected:
...
@@ -55,8 +55,8 @@ protected:
/**
/**
* Templatized implementation of calculateBlockIxn.
* Templatized implementation of calculateBlockIxn.
*/
*/
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
calculateBlockIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
);
void
calculateBlockIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
,
const
fvec4
&
blockCenter
);
/**---------------------------------------------------------------------------------------
/**---------------------------------------------------------------------------------------
...
@@ -73,15 +73,15 @@ protected:
...
@@ -73,15 +73,15 @@ protected:
/**
/**
* Templatized implementation of calculateBlockEwaldIxn.
* Templatized implementation of calculateBlockEwaldIxn.
*/
*/
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
calculateBlockEwaldIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
);
void
calculateBlockEwaldIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
,
const
fvec4
&
blockCenter
);
/**
/**
* Compute the displacement and squared distance between a collection of points, optionally using
* Compute the displacement and squared distance between a collection of points, optionally using
* periodic boundary conditions.
* periodic boundary conditions.
*/
*/
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
getDeltaR
(
const
f
loat
*
posI
,
const
fvec8
&
x
,
const
fvec8
&
y
,
const
fvec8
&
z
,
fvec8
&
dx
,
fvec8
&
dy
,
fvec8
&
dz
,
fvec8
&
r2
,
bool
periodic
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
const
;
void
getDeltaR
(
const
f
vec4
&
posI
,
const
fvec8
&
x
,
const
fvec8
&
y
,
const
fvec8
&
z
,
fvec8
&
dx
,
fvec8
&
dy
,
fvec8
&
dz
,
fvec8
&
r2
,
bool
periodic
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
const
;
/**
/**
* Compute a fast approximation to erfc(x).
* Compute a fast approximation to erfc(x).
...
...
platforms/cpu/src/CpuCustomManyParticleForce.cpp
View file @
b21e3182
...
@@ -199,7 +199,7 @@ void CpuCustomManyParticleForce::setUseCutoff(RealOpenMM distance) {
...
@@ -199,7 +199,7 @@ void CpuCustomManyParticleForce::setUseCutoff(RealOpenMM distance) {
}
}
void
CpuCustomManyParticleForce
::
setPeriodic
(
RealVec
*
periodicBoxVectors
)
{
void
CpuCustomManyParticleForce
::
setPeriodic
(
RealVec
*
periodicBoxVectors
)
{
assert
(
c
utoff
);
assert
(
useC
utoff
);
assert
(
periodicBoxVectors
[
0
][
0
]
>=
2.0
*
cutoffDistance
);
assert
(
periodicBoxVectors
[
0
][
0
]
>=
2.0
*
cutoffDistance
);
assert
(
periodicBoxVectors
[
1
][
1
]
>=
2.0
*
cutoffDistance
);
assert
(
periodicBoxVectors
[
1
][
1
]
>=
2.0
*
cutoffDistance
);
assert
(
periodicBoxVectors
[
2
][
2
]
>=
2.0
*
cutoffDistance
);
assert
(
periodicBoxVectors
[
2
][
2
]
>=
2.0
*
cutoffDistance
);
...
...
platforms/cpu/src/CpuNeighborList.cpp
View file @
b21e3182
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2013 Stanford University and the Authors.
*
* Portions copyright (c) 2013
-2015
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -165,7 +165,7 @@ public:
...
@@ -165,7 +165,7 @@ public:
return
VoxelIndex
(
y
,
z
);
return
VoxelIndex
(
y
,
z
);
}
}
void
getNeighbors
(
vector
<
int
>&
neighbors
,
int
blockIndex
,
const
fvec4
&
blockCenter
,
const
fvec4
&
blockWidth
,
const
vector
<
int
>&
sortedAtoms
,
vector
<
char
>&
exclusions
,
float
maxDistance
,
const
vector
<
int
>&
blockAtoms
,
const
float
*
atomLoca
tions
,
const
vector
<
VoxelIndex
>&
atomVoxelIndex
)
const
{
void
getNeighbors
(
vector
<
int
>&
neighbors
,
int
blockIndex
,
const
fvec4
&
blockCenter
,
const
fvec4
&
blockWidth
,
const
vector
<
int
>&
sortedAtoms
,
vector
<
char
>&
exclusions
,
float
maxDistance
,
const
vector
<
int
>&
blockAtoms
,
const
vector
<
float
>&
blockAtomX
,
const
vector
<
float
>&
blockAtomY
,
const
vector
<
float
>&
blockAtomZ
,
const
vector
<
float
>&
sortedPosi
tions
,
const
vector
<
VoxelIndex
>&
atomVoxelIndex
)
const
{
neighbors
.
resize
(
0
);
neighbors
.
resize
(
0
);
exclusions
.
resize
(
0
);
exclusions
.
resize
(
0
);
fvec4
boxSize
(
periodicBoxSize
[
0
],
periodicBoxSize
[
1
],
periodicBoxSize
[
2
],
0
);
fvec4
boxSize
(
periodicBoxSize
[
0
],
periodicBoxSize
[
1
],
periodicBoxSize
[
2
],
0
);
...
@@ -233,24 +233,34 @@ public:
...
@@ -233,24 +233,34 @@ public:
float
minx
=
centerPos
[
0
];
float
minx
=
centerPos
[
0
];
float
maxx
=
centerPos
[
0
];
float
maxx
=
centerPos
[
0
];
f
vec4
offset
(
-
xoffset
,
-
yoffset
+
voxelSizeY
*
y
+
(
usePeriodic
?
0.0
f
:
miny
),
voxelSizeZ
*
z
+
(
usePeriodic
?
0.0
f
:
minz
)
,
0
)
;
f
loat
offset
[
3
]
=
{
-
xoffset
,
-
yoffset
+
voxelSizeY
*
y
+
(
usePeriodic
?
0.0
f
:
miny
),
voxelSizeZ
*
z
+
(
usePeriodic
?
0.0
f
:
minz
)
}
;
for
(
int
k
=
0
;
k
<
(
int
)
blockAtoms
.
size
();
k
++
)
{
for
(
int
k
=
0
;
k
<
(
int
)
blockAtoms
.
size
();
k
+=
4
)
{
const
float
*
atomPos
=
&
atomLocations
[
4
*
blockAtoms
[
k
]]
;
fvec4
dist2
=
maxDistanceSquared
;
fvec4
posVec
(
atomPos
);
if
(
y
!=
atomVoxelIndex
[
k
].
y
)
{
fvec4
d
elta
1
=
offset
-
posVec
;
fvec4
d
y
1
=
offset
[
1
]
-
fvec4
(
&
blockAtomY
[
k
])
;
fvec4
d
elta
2
=
d
elta1
+
fvec4
(
0
,
voxelSizeY
,
voxelSizeZ
,
0
)
;
fvec4
d
y
2
=
d
y1
+
voxelSizeY
;
if
(
usePeriodic
)
{
if
(
usePeriodic
)
{
delta
1
-=
round
(
d
elta
1
*
invBoxSize
)
*
boxSize
;
dy
1
-=
round
(
d
y
1
*
invBoxSize
[
1
]
)
*
boxSize
[
1
]
;
delta
2
-=
round
(
d
elta
2
*
invBoxSize
)
*
boxSize
;
dy
2
-=
round
(
d
y
2
*
invBoxSize
[
1
]
)
*
boxSize
[
1
]
;
}
}
fvec4
delta
=
min
(
abs
(
delta1
),
abs
(
delta2
));
fvec4
dy
=
min
(
abs
(
dy1
),
abs
(
dy2
));
float
dy
=
(
y
==
atomVoxelIndex
[
k
].
y
?
0.0
f
:
delta
[
1
]);
dist2
-=
dy
*
dy
;
float
dz
=
(
z
==
atomVoxelIndex
[
k
].
z
?
0.0
f
:
delta
[
2
]);
}
float
dist2
=
maxDistanceSquared
-
dy
*
dy
-
dz
*
dz
;
if
(
z
!=
atomVoxelIndex
[
k
].
z
)
{
if
(
dist2
>
0
)
{
fvec4
dz1
=
offset
[
2
]
-
fvec4
(
&
blockAtomZ
[
k
]);
float
dist
=
sqrtf
(
dist2
);
fvec4
dz2
=
dz1
+
voxelSizeZ
;
minx
=
min
(
minx
,
atomPos
[
0
]
-
dist
-
xoffset
);
if
(
usePeriodic
)
{
maxx
=
max
(
maxx
,
atomPos
[
0
]
+
dist
-
xoffset
);
dz1
-=
round
(
dz1
*
invBoxSize
[
2
])
*
boxSize
[
2
];
dz2
-=
round
(
dz2
*
invBoxSize
[
2
])
*
boxSize
[
2
];
}
fvec4
dz
=
min
(
abs
(
dz1
),
abs
(
dz2
));
dist2
-=
dz
*
dz
;
}
fvec4
dist
=
sqrt
(
dist2
);
int
numToCheck
=
min
(
4
,
(
int
)
(
blockAtoms
.
size
()
-
k
));
for
(
int
m
=
0
;
m
<
numToCheck
;
m
++
)
{
minx
=
min
(
minx
,
blockAtomX
[
k
+
m
]
-
dist
[
m
]
-
xoffset
);
maxx
=
max
(
maxx
,
blockAtomX
[
k
+
m
]
+
dist
[
m
]
-
xoffset
);
}
}
}
}
if
(
minx
==
maxx
)
if
(
minx
==
maxx
)
...
@@ -290,12 +300,10 @@ public:
...
@@ -290,12 +300,10 @@ public:
if
(
sortedIndex
>=
lastSortedIndex
)
if
(
sortedIndex
>=
lastSortedIndex
)
continue
;
continue
;
fvec4
atomPos
(
atomLocations
+
4
*
sortedAtoms
[
sortedIndex
]);
fvec4
atomPos
(
&
sortedPositions
[
4
*
sortedIndex
]);
fvec4
delta
=
atomPos
-
centerPos
;
fvec4
delta
=
atomPos
-
centerPos
;
if
(
periodicRectangular
)
{
if
(
periodicRectangular
)
fvec4
base
=
round
(
delta
*
invBoxSize
)
*
boxSize
;
delta
-=
round
(
delta
*
invBoxSize
)
*
boxSize
;
delta
=
delta
-
base
;
}
else
if
(
needPeriodic
)
{
else
if
(
needPeriodic
)
{
delta
-=
periodicBoxVec4
[
2
]
*
floorf
(
delta
[
2
]
*
recipBoxSize
[
2
]
+
0.5
f
);
delta
-=
periodicBoxVec4
[
2
]
*
floorf
(
delta
[
2
]
*
recipBoxSize
[
2
]
+
0.5
f
);
delta
-=
periodicBoxVec4
[
1
]
*
floorf
(
delta
[
1
]
*
recipBoxSize
[
1
]
+
0.5
f
);
delta
-=
periodicBoxVec4
[
1
]
*
floorf
(
delta
[
1
]
*
recipBoxSize
[
1
]
+
0.5
f
);
...
@@ -310,26 +318,34 @@ public:
...
@@ -310,26 +318,34 @@ public:
// The distance is large enough that there might not be any actual interactions.
// The distance is large enough that there might not be any actual interactions.
// Check individual atom pairs to be sure.
// Check individual atom pairs to be sure.
bool
any
=
false
;
bool
anyInteraction
=
false
;
for
(
int
k
=
0
;
k
<
(
int
)
blockAtoms
.
size
();
k
++
)
{
for
(
int
k
=
0
;
k
<
(
int
)
blockAtoms
.
size
();
k
+=
4
)
{
fvec4
pos1
(
&
atomLocations
[
4
*
blockAtoms
[
k
]]);
fvec4
dx
=
fvec4
(
&
blockAtomX
[
k
])
-
atomPos
[
0
];
delta
=
atomPos
-
pos1
;
fvec4
dy
=
fvec4
(
&
blockAtomY
[
k
])
-
atomPos
[
1
];
fvec4
dz
=
fvec4
(
&
blockAtomZ
[
k
])
-
atomPos
[
2
];
if
(
periodicRectangular
)
{
if
(
periodicRectangular
)
{
fvec4
base
=
round
(
delta
*
invBoxSize
)
*
boxSize
;
dx
-=
round
(
dx
*
invBoxSize
[
0
])
*
boxSize
[
0
];
delta
=
delta
-
base
;
dy
-=
round
(
dy
*
invBoxSize
[
1
])
*
boxSize
[
1
];
dz
-=
round
(
dz
*
invBoxSize
[
2
])
*
boxSize
[
2
];
}
}
else
if
(
needPeriodic
)
{
else
if
(
needPeriodic
)
{
delta
-=
periodicBoxVec4
[
2
]
*
floorf
(
delta
[
2
]
*
recipBoxSize
[
2
]
+
0.5
f
);
fvec4
scale3
=
floor
(
dz
*
recipBoxSize
[
2
]
+
0.5
f
);
delta
-=
periodicBoxVec4
[
1
]
*
floorf
(
delta
[
1
]
*
recipBoxSize
[
1
]
+
0.5
f
);
dx
-=
scale3
*
periodicBoxVectors
[
2
][
0
];
delta
-=
periodicBoxVec4
[
0
]
*
floorf
(
delta
[
0
]
*
recipBoxSize
[
0
]
+
0.5
f
);
dy
-=
scale3
*
periodicBoxVectors
[
2
][
1
];
}
dz
-=
scale3
*
periodicBoxVectors
[
2
][
2
];
float
r2
=
dot3
(
delta
,
delta
);
fvec4
scale2
=
floor
(
dy
*
recipBoxSize
[
1
]
+
0.5
f
);
if
(
r2
<
maxDistanceSquared
)
{
dx
-=
scale2
*
periodicBoxVectors
[
1
][
0
];
any
=
true
;
dy
-=
scale2
*
periodicBoxVectors
[
1
][
1
];
fvec4
scale1
=
floor
(
dx
*
recipBoxSize
[
0
]
+
0.5
f
);
dx
-=
scale1
*
periodicBoxVectors
[
0
][
0
];
}
fvec4
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
if
(
any
(
r2
<
maxDistanceSquared
))
{
anyInteraction
=
true
;
break
;
break
;
}
}
}
}
if
(
!
any
)
if
(
!
any
Interaction
)
continue
;
continue
;
}
}
...
@@ -379,6 +395,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
...
@@ -379,6 +395,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
blockNeighbors
.
resize
(
numBlocks
);
blockNeighbors
.
resize
(
numBlocks
);
blockExclusions
.
resize
(
numBlocks
);
blockExclusions
.
resize
(
numBlocks
);
sortedAtoms
.
resize
(
numAtoms
);
sortedAtoms
.
resize
(
numAtoms
);
sortedPositions
.
resize
(
4
*
numAtoms
);
// Record the parameters for the threads.
// Record the parameters for the threads.
...
@@ -428,6 +445,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
...
@@ -428,6 +445,8 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
for
(
int
i
=
0
;
i
<
numAtoms
;
i
++
)
{
int
atomIndex
=
atomBins
[
i
].
second
;
int
atomIndex
=
atomBins
[
i
].
second
;
sortedAtoms
[
i
]
=
atomIndex
;
sortedAtoms
[
i
]
=
atomIndex
;
fvec4
atomPos
(
&
atomLocations
[
4
*
atomIndex
]);
atomPos
.
store
(
&
sortedPositions
[
4
*
i
]);
voxels
.
insert
(
i
,
&
atomLocations
[
4
*
atomIndex
]);
voxels
.
insert
(
i
,
&
atomLocations
[
4
*
atomIndex
]);
}
}
voxels
.
sortItems
();
voxels
.
sortItems
();
...
@@ -489,6 +508,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
...
@@ -489,6 +508,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
int
numBlocks
=
blockNeighbors
.
size
();
int
numBlocks
=
blockNeighbors
.
size
();
vector
<
int
>
blockAtoms
;
vector
<
int
>
blockAtoms
;
vector
<
float
>
blockAtomX
(
blockSize
),
blockAtomY
(
blockSize
),
blockAtomZ
(
blockSize
);
vector
<
VoxelIndex
>
atomVoxelIndex
;
vector
<
VoxelIndex
>
atomVoxelIndex
;
for
(
int
i
=
threadIndex
;
i
<
numBlocks
;
i
+=
numThreads
)
{
for
(
int
i
=
threadIndex
;
i
<
numBlocks
;
i
+=
numThreads
)
{
// Find the atoms in this block and compute their bounding box.
// Find the atoms in this block and compute their bounding box.
...
@@ -501,14 +521,24 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
...
@@ -501,14 +521,24 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
blockAtoms
[
j
]
=
sortedAtoms
[
firstIndex
+
j
];
blockAtoms
[
j
]
=
sortedAtoms
[
firstIndex
+
j
];
atomVoxelIndex
[
j
]
=
voxels
->
getVoxelIndex
(
&
atomLocations
[
4
*
blockAtoms
[
j
]]);
atomVoxelIndex
[
j
]
=
voxels
->
getVoxelIndex
(
&
atomLocations
[
4
*
blockAtoms
[
j
]]);
}
}
fvec4
minPos
(
&
atomLocations
[
4
*
sortedAtoms
[
firstIndex
]
]
);
fvec4
minPos
(
&
sortedPositions
[
4
*
firstIndex
]);
fvec4
maxPos
=
minPos
;
fvec4
maxPos
=
minPos
;
for
(
int
j
=
1
;
j
<
atomsInBlock
;
j
++
)
{
for
(
int
j
=
1
;
j
<
atomsInBlock
;
j
++
)
{
fvec4
pos
(
&
atomLocations
[
4
*
sortedAtoms
[
firstIndex
+
j
]
]);
fvec4
pos
(
&
sortedPositions
[
4
*
(
firstIndex
+
j
)
]);
minPos
=
min
(
minPos
,
pos
);
minPos
=
min
(
minPos
,
pos
);
maxPos
=
max
(
maxPos
,
pos
);
maxPos
=
max
(
maxPos
,
pos
);
}
}
voxels
->
getNeighbors
(
blockNeighbors
[
i
],
i
,
(
maxPos
+
minPos
)
*
0.5
f
,
(
maxPos
-
minPos
)
*
0.5
f
,
sortedAtoms
,
blockExclusions
[
i
],
maxDistance
,
blockAtoms
,
atomLocations
,
atomVoxelIndex
);
for
(
int
j
=
0
;
j
<
atomsInBlock
;
j
++
)
{
blockAtomX
[
j
]
=
sortedPositions
[
4
*
(
firstIndex
+
j
)];
blockAtomY
[
j
]
=
sortedPositions
[
4
*
(
firstIndex
+
j
)
+
1
];
blockAtomZ
[
j
]
=
sortedPositions
[
4
*
(
firstIndex
+
j
)
+
2
];
}
for
(
int
j
=
atomsInBlock
;
j
<
blockSize
;
j
++
)
{
blockAtomX
[
j
]
=
1e10
;
blockAtomY
[
j
]
=
1e10
;
blockAtomZ
[
j
]
=
1e10
;
}
voxels
->
getNeighbors
(
blockNeighbors
[
i
],
i
,
(
maxPos
+
minPos
)
*
0.5
f
,
(
maxPos
-
minPos
)
*
0.5
f
,
sortedAtoms
,
blockExclusions
[
i
],
maxDistance
,
blockAtoms
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
sortedPositions
,
atomVoxelIndex
);
// Record the exclusions for this block.
// Record the exclusions for this block.
...
...
platforms/cpu/src/CpuNonbondedForceVec4.cpp
View file @
b21e3182
...
@@ -44,30 +44,76 @@ CpuNonbondedForce* createCpuNonbondedForceVec4() {
...
@@ -44,30 +44,76 @@ CpuNonbondedForce* createCpuNonbondedForceVec4() {
CpuNonbondedForceVec4
::
CpuNonbondedForceVec4
()
{
CpuNonbondedForceVec4
::
CpuNonbondedForceVec4
()
{
}
}
enum
PeriodicType
{
NoPeriodic
,
PeriodicPerAtom
,
PeriodicPerInteraction
,
PeriodicTriclinic
};
void
CpuNonbondedForceVec4
::
calculateBlockIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
void
CpuNonbondedForceVec4
::
calculateBlockIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
if
(
triclinic
)
// Determine whether we need to apply periodic boundary conditions.
calculateBlockIxnImpl
<
true
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
);
PeriodicType
periodicType
;
fvec4
blockCenter
;
if
(
!
periodic
)
{
periodicType
=
NoPeriodic
;
blockCenter
=
0.0
f
;
}
else
{
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
4
*
blockIndex
];
float
minx
,
maxx
,
miny
,
maxy
,
minz
,
maxz
;
minx
=
maxx
=
posq
[
4
*
blockAtom
[
0
]];
miny
=
maxy
=
posq
[
4
*
blockAtom
[
0
]
+
1
];
minz
=
maxz
=
posq
[
4
*
blockAtom
[
0
]
+
2
];
for
(
int
i
=
1
;
i
<
4
;
i
++
)
{
minx
=
min
(
minx
,
posq
[
4
*
blockAtom
[
i
]]);
maxx
=
max
(
maxx
,
posq
[
4
*
blockAtom
[
i
]]);
miny
=
min
(
miny
,
posq
[
4
*
blockAtom
[
i
]
+
1
]);
maxy
=
max
(
maxy
,
posq
[
4
*
blockAtom
[
i
]
+
1
]);
minz
=
min
(
minz
,
posq
[
4
*
blockAtom
[
i
]
+
2
]);
maxz
=
max
(
maxz
,
posq
[
4
*
blockAtom
[
i
]
+
2
]);
}
blockCenter
=
fvec4
(
0.5
f
*
(
minx
+
maxx
),
0.5
f
*
(
miny
+
maxy
),
0.5
f
*
(
minz
+
maxz
),
0.0
f
);
if
(
!
(
minx
<
cutoffDistance
||
miny
<
cutoffDistance
||
minz
<
cutoffDistance
||
maxx
>
boxSize
[
0
]
-
cutoffDistance
||
maxy
>
boxSize
[
1
]
-
cutoffDistance
||
maxz
>
boxSize
[
2
]
-
cutoffDistance
))
periodicType
=
NoPeriodic
;
else
if
(
triclinic
)
periodicType
=
PeriodicTriclinic
;
else
if
(
0.5
f
*
(
boxSize
[
0
]
-
(
maxx
-
minx
))
>=
cutoffDistance
&&
0.5
f
*
(
boxSize
[
1
]
-
(
maxy
-
miny
))
>=
cutoffDistance
&&
0.5
f
*
(
boxSize
[
2
]
-
(
maxz
-
minz
))
>=
cutoffDistance
)
periodicType
=
PeriodicPerAtom
;
else
else
calculateBlockIxnImpl
<
false
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
);
periodicType
=
PeriodicPerInteraction
;
}
// Call the appropriate version depending on what calculation is required for periodic boundary conditions.
if
(
periodicType
==
NoPeriodic
)
calculateBlockIxnImpl
<
NoPeriodic
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicPerAtom
)
calculateBlockIxnImpl
<
PeriodicPerAtom
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicPerInteraction
)
calculateBlockIxnImpl
<
PeriodicPerInteraction
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicTriclinic
)
calculateBlockIxnImpl
<
PeriodicTriclinic
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
}
}
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
CpuNonbondedForceVec4
::
calculateBlockIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
void
CpuNonbondedForceVec4
::
calculateBlockIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
,
const
fvec4
&
blockCenter
)
{
// Load the positions and parameters of the atoms in the block.
// Load the positions and parameters of the atoms in the block.
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
4
*
blockIndex
];
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
4
*
blockIndex
];
fvec4
blockAtomPosq
[
4
];
fvec4
blockAtomPosq
[
4
];
fvec4
blockAtomForceX
(
0.0
f
),
blockAtomForceY
(
0.0
f
),
blockAtomForceZ
(
0.0
f
);
fvec4
blockAtomForceX
(
0.0
f
),
blockAtomForceY
(
0.0
f
),
blockAtomForceZ
(
0.0
f
);
for
(
int
i
=
0
;
i
<
4
;
i
++
)
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
blockAtomPosq
[
i
]
=
fvec4
(
posq
+
4
*
blockAtom
[
i
]);
blockAtomPosq
[
i
]
=
fvec4
(
posq
+
4
*
blockAtom
[
i
]);
if
(
PERIODIC_TYPE
==
PeriodicPerAtom
)
blockAtomPosq
[
i
]
-=
floor
((
blockAtomPosq
[
i
]
-
blockCenter
)
*
invBoxSize
+
0.5
f
)
*
boxSize
;
}
fvec4
blockAtomX
=
fvec4
(
blockAtomPosq
[
0
][
0
],
blockAtomPosq
[
1
][
0
],
blockAtomPosq
[
2
][
0
],
blockAtomPosq
[
3
][
0
]);
fvec4
blockAtomX
=
fvec4
(
blockAtomPosq
[
0
][
0
],
blockAtomPosq
[
1
][
0
],
blockAtomPosq
[
2
][
0
],
blockAtomPosq
[
3
][
0
]);
fvec4
blockAtomY
=
fvec4
(
blockAtomPosq
[
0
][
1
],
blockAtomPosq
[
1
][
1
],
blockAtomPosq
[
2
][
1
],
blockAtomPosq
[
3
][
1
]);
fvec4
blockAtomY
=
fvec4
(
blockAtomPosq
[
0
][
1
],
blockAtomPosq
[
1
][
1
],
blockAtomPosq
[
2
][
1
],
blockAtomPosq
[
3
][
1
]);
fvec4
blockAtomZ
=
fvec4
(
blockAtomPosq
[
0
][
2
],
blockAtomPosq
[
1
][
2
],
blockAtomPosq
[
2
][
2
],
blockAtomPosq
[
3
][
2
]);
fvec4
blockAtomZ
=
fvec4
(
blockAtomPosq
[
0
][
2
],
blockAtomPosq
[
1
][
2
],
blockAtomPosq
[
2
][
2
],
blockAtomPosq
[
3
][
2
]);
fvec4
blockAtomCharge
=
fvec4
(
ONE_4PI_EPS0
)
*
fvec4
(
blockAtomPosq
[
0
][
3
],
blockAtomPosq
[
1
][
3
],
blockAtomPosq
[
2
][
3
],
blockAtomPosq
[
3
][
3
]);
fvec4
blockAtomCharge
=
fvec4
(
ONE_4PI_EPS0
)
*
fvec4
(
blockAtomPosq
[
0
][
3
],
blockAtomPosq
[
1
][
3
],
blockAtomPosq
[
2
][
3
],
blockAtomPosq
[
3
][
3
]);
fvec4
blockAtomSigma
(
atomParameters
[
blockAtom
[
0
]].
first
,
atomParameters
[
blockAtom
[
1
]].
first
,
atomParameters
[
blockAtom
[
2
]].
first
,
atomParameters
[
blockAtom
[
3
]].
first
);
fvec4
blockAtomSigma
(
atomParameters
[
blockAtom
[
0
]].
first
,
atomParameters
[
blockAtom
[
1
]].
first
,
atomParameters
[
blockAtom
[
2
]].
first
,
atomParameters
[
blockAtom
[
3
]].
first
);
fvec4
blockAtomEpsilon
(
atomParameters
[
blockAtom
[
0
]].
second
,
atomParameters
[
blockAtom
[
1
]].
second
,
atomParameters
[
blockAtom
[
2
]].
second
,
atomParameters
[
blockAtom
[
3
]].
second
);
fvec4
blockAtomEpsilon
(
atomParameters
[
blockAtom
[
0
]].
second
,
atomParameters
[
blockAtom
[
1
]].
second
,
atomParameters
[
blockAtom
[
2
]].
second
,
atomParameters
[
blockAtom
[
3
]].
second
);
bool
needPeriodic
=
(
periodic
&&
(
any
(
blockAtomX
<
cutoffDistance
)
||
any
(
blockAtomY
<
cutoffDistance
)
||
any
(
blockAtomZ
<
cutoffDistance
)
||
const
bool
needPeriodic
=
(
PERIODIC_TYPE
==
PeriodicPerInteraction
||
PERIODIC_TYPE
==
PeriodicTriclinic
);
any
(
blockAtomX
>
boxSize
[
0
]
-
cutoffDistance
)
||
any
(
blockAtomY
>
boxSize
[
1
]
-
cutoffDistance
)
||
any
(
blockAtomZ
>
boxSize
[
2
]
-
cutoffDistance
)));
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
// Loop over neighbors for this block.
// Loop over neighbors for this block.
...
@@ -82,7 +128,10 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
...
@@ -82,7 +128,10 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
// Compute the distances to the block atoms.
// Compute the distances to the block atoms.
fvec4
dx
,
dy
,
dz
,
r2
;
fvec4
dx
,
dy
,
dz
,
r2
;
getDeltaR
<
TRICLINIC
>
(
posq
+
4
*
atom
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
fvec4
atomPos
(
posq
+
4
*
atom
);
if
(
PERIODIC_TYPE
==
PeriodicPerAtom
)
atomPos
-=
floor
((
atomPos
-
blockCenter
)
*
invBoxSize
+
0.5
f
)
*
boxSize
;
getDeltaR
<
PERIODIC_TYPE
>
(
atomPos
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
ivec4
include
;
ivec4
include
;
char
excl
=
exclusions
[
i
];
char
excl
=
exclusions
[
i
];
if
(
excl
==
0
)
if
(
excl
==
0
)
...
@@ -95,8 +144,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
...
@@ -95,8 +144,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
// Compute the interactions.
// Compute the interactions.
fvec4
r
=
sqrt
(
r2
);
fvec4
inverseR
=
rsqrt
(
r2
);
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
energy
,
dEdR
;
fvec4
energy
,
dEdR
;
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
if
(
atomEpsilon
!=
0.0
f
)
{
if
(
atomEpsilon
!=
0.0
f
)
{
...
@@ -108,6 +156,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
...
@@ -108,6 +156,7 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
dEdR
=
epsSig6
*
(
12.0
f
*
sig6
-
6.0
f
);
dEdR
=
epsSig6
*
(
12.0
f
*
sig6
-
6.0
f
);
energy
=
epsSig6
*
(
sig6
-
1.0
f
);
energy
=
epsSig6
*
(
sig6
-
1.0
f
);
if
(
useSwitch
)
{
if
(
useSwitch
)
{
fvec4
r
=
r2
*
inverseR
;
fvec4
t
=
blend
(
0.0
f
,
(
r
-
switchingDistance
)
*
invSwitchingInterval
,
r
>
switchingDistance
);
fvec4
t
=
blend
(
0.0
f
,
(
r
-
switchingDistance
)
*
invSwitchingInterval
,
r
>
switchingDistance
);
fvec4
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec4
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec4
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
*
invSwitchingInterval
;
fvec4
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
*
invSwitchingInterval
;
...
@@ -162,29 +211,73 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
...
@@ -162,29 +211,73 @@ void CpuNonbondedForceVec4::calculateBlockIxnImpl(int blockIndex, float* forces,
}
}
void
CpuNonbondedForceVec4
::
calculateBlockEwaldIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
void
CpuNonbondedForceVec4
::
calculateBlockEwaldIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
if
(
triclinic
)
// Determine whether we need to apply periodic boundary conditions.
calculateBlockEwaldIxnImpl
<
true
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
);
PeriodicType
periodicType
;
fvec4
blockCenter
;
if
(
!
periodic
)
{
periodicType
=
NoPeriodic
;
blockCenter
=
0.0
f
;
}
else
{
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
4
*
blockIndex
];
float
minx
,
maxx
,
miny
,
maxy
,
minz
,
maxz
;
minx
=
maxx
=
posq
[
4
*
blockAtom
[
0
]];
miny
=
maxy
=
posq
[
4
*
blockAtom
[
0
]
+
1
];
minz
=
maxz
=
posq
[
4
*
blockAtom
[
0
]
+
2
];
for
(
int
i
=
1
;
i
<
4
;
i
++
)
{
minx
=
min
(
minx
,
posq
[
4
*
blockAtom
[
i
]]);
maxx
=
max
(
maxx
,
posq
[
4
*
blockAtom
[
i
]]);
miny
=
min
(
miny
,
posq
[
4
*
blockAtom
[
i
]
+
1
]);
maxy
=
max
(
maxy
,
posq
[
4
*
blockAtom
[
i
]
+
1
]);
minz
=
min
(
minz
,
posq
[
4
*
blockAtom
[
i
]
+
2
]);
maxz
=
max
(
maxz
,
posq
[
4
*
blockAtom
[
i
]
+
2
]);
}
blockCenter
=
fvec4
(
0.5
f
*
(
minx
+
maxx
),
0.5
f
*
(
miny
+
maxy
),
0.5
f
*
(
minz
+
maxz
),
0.0
f
);
if
(
!
(
minx
<
cutoffDistance
||
miny
<
cutoffDistance
||
minz
<
cutoffDistance
||
maxx
>
boxSize
[
0
]
-
cutoffDistance
||
maxy
>
boxSize
[
1
]
-
cutoffDistance
||
maxz
>
boxSize
[
2
]
-
cutoffDistance
))
periodicType
=
NoPeriodic
;
else
if
(
triclinic
)
periodicType
=
PeriodicTriclinic
;
else
if
(
0.5
f
*
(
boxSize
[
0
]
-
(
maxx
-
minx
))
>=
cutoffDistance
&&
0.5
f
*
(
boxSize
[
1
]
-
(
maxy
-
miny
))
>=
cutoffDistance
&&
0.5
f
*
(
boxSize
[
2
]
-
(
maxz
-
minz
))
>=
cutoffDistance
)
periodicType
=
PeriodicPerAtom
;
else
else
calculateBlockEwaldIxnImpl
<
false
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
);
periodicType
=
PeriodicPerInteraction
;
}
// Call the appropriate version depending on what calculation is required for periodic boundary conditions.
if
(
periodicType
==
NoPeriodic
)
calculateBlockEwaldIxnImpl
<
NoPeriodic
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicPerAtom
)
calculateBlockEwaldIxnImpl
<
PeriodicPerAtom
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicPerInteraction
)
calculateBlockEwaldIxnImpl
<
PeriodicPerInteraction
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicTriclinic
)
calculateBlockEwaldIxnImpl
<
PeriodicTriclinic
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
}
}
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
CpuNonbondedForceVec4
::
calculateBlockEwaldIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
void
CpuNonbondedForceVec4
::
calculateBlockEwaldIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
,
const
fvec4
&
blockCenter
)
{
// Load the positions and parameters of the atoms in the block.
// Load the positions and parameters of the atoms in the block.
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
4
*
blockIndex
];
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
4
*
blockIndex
];
fvec4
blockAtomPosq
[
4
];
fvec4
blockAtomPosq
[
4
];
fvec4
blockAtomForceX
(
0.0
f
),
blockAtomForceY
(
0.0
f
),
blockAtomForceZ
(
0.0
f
);
fvec4
blockAtomForceX
(
0.0
f
),
blockAtomForceY
(
0.0
f
),
blockAtomForceZ
(
0.0
f
);
for
(
int
i
=
0
;
i
<
4
;
i
++
)
for
(
int
i
=
0
;
i
<
4
;
i
++
)
{
blockAtomPosq
[
i
]
=
fvec4
(
posq
+
4
*
blockAtom
[
i
]);
blockAtomPosq
[
i
]
=
fvec4
(
posq
+
4
*
blockAtom
[
i
]);
if
(
PERIODIC_TYPE
==
PeriodicPerAtom
)
blockAtomPosq
[
i
]
-=
floor
((
blockAtomPosq
[
i
]
-
blockCenter
)
*
invBoxSize
+
0.5
f
)
*
boxSize
;
}
fvec4
blockAtomX
=
fvec4
(
blockAtomPosq
[
0
][
0
],
blockAtomPosq
[
1
][
0
],
blockAtomPosq
[
2
][
0
],
blockAtomPosq
[
3
][
0
]);
fvec4
blockAtomX
=
fvec4
(
blockAtomPosq
[
0
][
0
],
blockAtomPosq
[
1
][
0
],
blockAtomPosq
[
2
][
0
],
blockAtomPosq
[
3
][
0
]);
fvec4
blockAtomY
=
fvec4
(
blockAtomPosq
[
0
][
1
],
blockAtomPosq
[
1
][
1
],
blockAtomPosq
[
2
][
1
],
blockAtomPosq
[
3
][
1
]);
fvec4
blockAtomY
=
fvec4
(
blockAtomPosq
[
0
][
1
],
blockAtomPosq
[
1
][
1
],
blockAtomPosq
[
2
][
1
],
blockAtomPosq
[
3
][
1
]);
fvec4
blockAtomZ
=
fvec4
(
blockAtomPosq
[
0
][
2
],
blockAtomPosq
[
1
][
2
],
blockAtomPosq
[
2
][
2
],
blockAtomPosq
[
3
][
2
]);
fvec4
blockAtomZ
=
fvec4
(
blockAtomPosq
[
0
][
2
],
blockAtomPosq
[
1
][
2
],
blockAtomPosq
[
2
][
2
],
blockAtomPosq
[
3
][
2
]);
fvec4
blockAtomCharge
=
fvec4
(
ONE_4PI_EPS0
)
*
fvec4
(
blockAtomPosq
[
0
][
3
],
blockAtomPosq
[
1
][
3
],
blockAtomPosq
[
2
][
3
],
blockAtomPosq
[
3
][
3
]);
fvec4
blockAtomCharge
=
fvec4
(
ONE_4PI_EPS0
)
*
fvec4
(
blockAtomPosq
[
0
][
3
],
blockAtomPosq
[
1
][
3
],
blockAtomPosq
[
2
][
3
],
blockAtomPosq
[
3
][
3
]);
fvec4
blockAtomSigma
(
atomParameters
[
blockAtom
[
0
]].
first
,
atomParameters
[
blockAtom
[
1
]].
first
,
atomParameters
[
blockAtom
[
2
]].
first
,
atomParameters
[
blockAtom
[
3
]].
first
);
fvec4
blockAtomSigma
(
atomParameters
[
blockAtom
[
0
]].
first
,
atomParameters
[
blockAtom
[
1
]].
first
,
atomParameters
[
blockAtom
[
2
]].
first
,
atomParameters
[
blockAtom
[
3
]].
first
);
fvec4
blockAtomEpsilon
(
atomParameters
[
blockAtom
[
0
]].
second
,
atomParameters
[
blockAtom
[
1
]].
second
,
atomParameters
[
blockAtom
[
2
]].
second
,
atomParameters
[
blockAtom
[
3
]].
second
);
fvec4
blockAtomEpsilon
(
atomParameters
[
blockAtom
[
0
]].
second
,
atomParameters
[
blockAtom
[
1
]].
second
,
atomParameters
[
blockAtom
[
2
]].
second
,
atomParameters
[
blockAtom
[
3
]].
second
);
bool
needPeriodic
=
(
periodic
&&
(
any
(
blockAtomX
<
cutoffDistance
)
||
any
(
blockAtomY
<
cutoffDistance
)
||
any
(
blockAtomZ
<
cutoffDistance
)
||
const
bool
needPeriodic
=
(
PERIODIC_TYPE
==
PeriodicPerInteraction
||
PERIODIC_TYPE
==
PeriodicTriclinic
);
any
(
blockAtomX
>
boxSize
[
0
]
-
cutoffDistance
)
||
any
(
blockAtomY
>
boxSize
[
1
]
-
cutoffDistance
)
||
any
(
blockAtomZ
>
boxSize
[
2
]
-
cutoffDistance
)));
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
// Loop over neighbors for this block.
// Loop over neighbors for this block.
...
@@ -199,7 +292,10 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
...
@@ -199,7 +292,10 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
// Compute the distances to the block atoms.
// Compute the distances to the block atoms.
fvec4
dx
,
dy
,
dz
,
r2
;
fvec4
dx
,
dy
,
dz
,
r2
;
getDeltaR
<
TRICLINIC
>
(
posq
+
4
*
atom
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
fvec4
atomPos
(
posq
+
4
*
atom
);
if
(
PERIODIC_TYPE
==
PeriodicPerAtom
)
atomPos
-=
floor
((
atomPos
-
blockCenter
)
*
invBoxSize
+
0.5
f
)
*
boxSize
;
getDeltaR
<
PERIODIC_TYPE
>
(
atomPos
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
ivec4
include
;
ivec4
include
;
char
excl
=
exclusions
[
i
];
char
excl
=
exclusions
[
i
];
if
(
excl
==
0
)
if
(
excl
==
0
)
...
@@ -212,8 +308,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
...
@@ -212,8 +308,8 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
// Compute the interactions.
// Compute the interactions.
fvec4
r
=
sqrt
(
r2
);
fvec4
inverseR
=
r
sqrt
(
r2
);
fvec4
inverseR
=
fvec4
(
1.0
f
)
/
r
;
fvec4
r
=
r2
*
inverseR
;
fvec4
energy
,
dEdR
;
fvec4
energy
,
dEdR
;
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
if
(
atomEpsilon
!=
0.0
f
)
{
if
(
atomEpsilon
!=
0.0
f
)
{
...
@@ -272,13 +368,12 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
...
@@ -272,13 +368,12 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
(
fvec4
(
forces
+
4
*
blockAtom
[
j
])
+
f
[
j
]).
store
(
forces
+
4
*
blockAtom
[
j
]);
(
fvec4
(
forces
+
4
*
blockAtom
[
j
])
+
f
[
j
]).
store
(
forces
+
4
*
blockAtom
[
j
]);
}
}
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
CpuNonbondedForceVec4
::
getDeltaR
(
const
f
loat
*
posI
,
const
fvec4
&
x
,
const
fvec4
&
y
,
const
fvec4
&
z
,
fvec4
&
dx
,
fvec4
&
dy
,
fvec4
&
dz
,
fvec4
&
r2
,
bool
periodic
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
const
{
void
CpuNonbondedForceVec4
::
getDeltaR
(
const
f
vec4
&
posI
,
const
fvec4
&
x
,
const
fvec4
&
y
,
const
fvec4
&
z
,
fvec4
&
dx
,
fvec4
&
dy
,
fvec4
&
dz
,
fvec4
&
r2
,
bool
periodic
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
const
{
dx
=
x
-
posI
[
0
];
dx
=
x
-
posI
[
0
];
dy
=
y
-
posI
[
1
];
dy
=
y
-
posI
[
1
];
dz
=
z
-
posI
[
2
];
dz
=
z
-
posI
[
2
];
if
(
periodic
)
{
if
(
PERIODIC_TYPE
==
PeriodicTriclinic
)
{
if
(
TRICLINIC
)
{
fvec4
scale3
=
floor
(
dz
*
recipBoxSize
[
2
]
+
0.5
f
);
fvec4
scale3
=
floor
(
dz
*
recipBoxSize
[
2
]
+
0.5
f
);
dx
-=
scale3
*
periodicBoxVectors
[
2
][
0
];
dx
-=
scale3
*
periodicBoxVectors
[
2
][
0
];
dy
-=
scale3
*
periodicBoxVectors
[
2
][
1
];
dy
-=
scale3
*
periodicBoxVectors
[
2
][
1
];
...
@@ -289,12 +384,11 @@ void CpuNonbondedForceVec4::getDeltaR(const float* posI, const fvec4& x, const f
...
@@ -289,12 +384,11 @@ void CpuNonbondedForceVec4::getDeltaR(const float* posI, const fvec4& x, const f
fvec4
scale1
=
floor
(
dx
*
recipBoxSize
[
0
]
+
0.5
f
);
fvec4
scale1
=
floor
(
dx
*
recipBoxSize
[
0
]
+
0.5
f
);
dx
-=
scale1
*
periodicBoxVectors
[
0
][
0
];
dx
-=
scale1
*
periodicBoxVectors
[
0
][
0
];
}
}
else
{
else
if
(
PERIODIC_TYPE
==
PeriodicPerInteraction
)
{
dx
-=
round
(
dx
*
invBoxSize
[
0
])
*
boxSize
[
0
];
dx
-=
round
(
dx
*
invBoxSize
[
0
])
*
boxSize
[
0
];
dy
-=
round
(
dy
*
invBoxSize
[
1
])
*
boxSize
[
1
];
dy
-=
round
(
dy
*
invBoxSize
[
1
])
*
boxSize
[
1
];
dz
-=
round
(
dz
*
invBoxSize
[
2
])
*
boxSize
[
2
];
dz
-=
round
(
dz
*
invBoxSize
[
2
])
*
boxSize
[
2
];
}
}
}
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
}
}
...
...
platforms/cpu/src/CpuNonbondedForceVec8.cpp
View file @
b21e3182
...
@@ -76,29 +76,75 @@ CpuNonbondedForce* createCpuNonbondedForceVec8() {
...
@@ -76,29 +76,75 @@ CpuNonbondedForce* createCpuNonbondedForceVec8() {
CpuNonbondedForceVec8
::
CpuNonbondedForceVec8
()
{
CpuNonbondedForceVec8
::
CpuNonbondedForceVec8
()
{
}
}
enum
PeriodicType
{
NoPeriodic
,
PeriodicPerAtom
,
PeriodicPerInteraction
,
PeriodicTriclinic
};
void
CpuNonbondedForceVec8
::
calculateBlockIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
void
CpuNonbondedForceVec8
::
calculateBlockIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
if
(
triclinic
)
// Determine whether we need to apply periodic boundary conditions.
calculateBlockIxnImpl
<
true
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
);
PeriodicType
periodicType
;
fvec4
blockCenter
;
if
(
!
periodic
)
{
periodicType
=
NoPeriodic
;
blockCenter
=
0.0
f
;
}
else
{
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
8
*
blockIndex
];
float
minx
,
maxx
,
miny
,
maxy
,
minz
,
maxz
;
minx
=
maxx
=
posq
[
4
*
blockAtom
[
0
]];
miny
=
maxy
=
posq
[
4
*
blockAtom
[
0
]
+
1
];
minz
=
maxz
=
posq
[
4
*
blockAtom
[
0
]
+
2
];
for
(
int
i
=
1
;
i
<
8
;
i
++
)
{
minx
=
min
(
minx
,
posq
[
4
*
blockAtom
[
i
]]);
maxx
=
max
(
maxx
,
posq
[
4
*
blockAtom
[
i
]]);
miny
=
min
(
miny
,
posq
[
4
*
blockAtom
[
i
]
+
1
]);
maxy
=
max
(
maxy
,
posq
[
4
*
blockAtom
[
i
]
+
1
]);
minz
=
min
(
minz
,
posq
[
4
*
blockAtom
[
i
]
+
2
]);
maxz
=
max
(
maxz
,
posq
[
4
*
blockAtom
[
i
]
+
2
]);
}
blockCenter
=
fvec4
(
0.5
f
*
(
minx
+
maxx
),
0.5
f
*
(
miny
+
maxy
),
0.5
f
*
(
minz
+
maxz
),
0.0
f
);
if
(
!
(
minx
<
cutoffDistance
||
miny
<
cutoffDistance
||
minz
<
cutoffDistance
||
maxx
>
boxSize
[
0
]
-
cutoffDistance
||
maxy
>
boxSize
[
1
]
-
cutoffDistance
||
maxz
>
boxSize
[
2
]
-
cutoffDistance
))
periodicType
=
NoPeriodic
;
else
if
(
triclinic
)
periodicType
=
PeriodicTriclinic
;
else
if
(
0.5
f
*
(
boxSize
[
0
]
-
(
maxx
-
minx
))
>=
cutoffDistance
&&
0.5
f
*
(
boxSize
[
1
]
-
(
maxy
-
miny
))
>=
cutoffDistance
&&
0.5
f
*
(
boxSize
[
2
]
-
(
maxz
-
minz
))
>=
cutoffDistance
)
periodicType
=
PeriodicPerAtom
;
else
else
calculateBlockIxnImpl
<
false
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
);
periodicType
=
PeriodicPerInteraction
;
}
// Call the appropriate version depending on what calculation is required for periodic boundary conditions.
if
(
periodicType
==
NoPeriodic
)
calculateBlockIxnImpl
<
NoPeriodic
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicPerAtom
)
calculateBlockIxnImpl
<
PeriodicPerAtom
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicPerInteraction
)
calculateBlockIxnImpl
<
PeriodicPerInteraction
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicTriclinic
)
calculateBlockIxnImpl
<
PeriodicTriclinic
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
}
}
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
CpuNonbondedForceVec8
::
calculateBlockIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
void
CpuNonbondedForceVec8
::
calculateBlockIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
,
const
fvec4
&
blockCenter
)
{
// Load the positions and parameters of the atoms in the block.
// Load the positions and parameters of the atoms in the block.
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
8
*
blockIndex
];
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
8
*
blockIndex
];
fvec4
blockAtomPosq
[
8
];
fvec4
blockAtomPosq
[
8
];
fvec8
blockAtomForceX
(
0.0
f
),
blockAtomForceY
(
0.0
f
),
blockAtomForceZ
(
0.0
f
);
fvec8
blockAtomForceX
(
0.0
f
),
blockAtomForceY
(
0.0
f
),
blockAtomForceZ
(
0.0
f
);
fvec8
blockAtomX
,
blockAtomY
,
blockAtomZ
,
blockAtomCharge
;
fvec8
blockAtomX
,
blockAtomY
,
blockAtomZ
,
blockAtomCharge
;
for
(
int
i
=
0
;
i
<
8
;
i
++
)
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
blockAtomPosq
[
i
]
=
fvec4
(
posq
+
4
*
blockAtom
[
i
]);
blockAtomPosq
[
i
]
=
fvec4
(
posq
+
4
*
blockAtom
[
i
]);
if
(
PERIODIC_TYPE
==
PeriodicPerAtom
)
blockAtomPosq
[
i
]
-=
floor
((
blockAtomPosq
[
i
]
-
blockCenter
)
*
invBoxSize
+
0.5
f
)
*
boxSize
;
}
transpose
(
blockAtomPosq
[
0
],
blockAtomPosq
[
1
],
blockAtomPosq
[
2
],
blockAtomPosq
[
3
],
blockAtomPosq
[
4
],
blockAtomPosq
[
5
],
blockAtomPosq
[
6
],
blockAtomPosq
[
7
],
blockAtomX
,
blockAtomY
,
blockAtomZ
,
blockAtomCharge
);
transpose
(
blockAtomPosq
[
0
],
blockAtomPosq
[
1
],
blockAtomPosq
[
2
],
blockAtomPosq
[
3
],
blockAtomPosq
[
4
],
blockAtomPosq
[
5
],
blockAtomPosq
[
6
],
blockAtomPosq
[
7
],
blockAtomX
,
blockAtomY
,
blockAtomZ
,
blockAtomCharge
);
blockAtomCharge
*=
ONE_4PI_EPS0
;
blockAtomCharge
*=
ONE_4PI_EPS0
;
fvec8
blockAtomSigma
(
atomParameters
[
blockAtom
[
0
]].
first
,
atomParameters
[
blockAtom
[
1
]].
first
,
atomParameters
[
blockAtom
[
2
]].
first
,
atomParameters
[
blockAtom
[
3
]].
first
,
atomParameters
[
blockAtom
[
4
]].
first
,
atomParameters
[
blockAtom
[
5
]].
first
,
atomParameters
[
blockAtom
[
6
]].
first
,
atomParameters
[
blockAtom
[
7
]].
first
);
fvec8
blockAtomSigma
(
atomParameters
[
blockAtom
[
0
]].
first
,
atomParameters
[
blockAtom
[
1
]].
first
,
atomParameters
[
blockAtom
[
2
]].
first
,
atomParameters
[
blockAtom
[
3
]].
first
,
atomParameters
[
blockAtom
[
4
]].
first
,
atomParameters
[
blockAtom
[
5
]].
first
,
atomParameters
[
blockAtom
[
6
]].
first
,
atomParameters
[
blockAtom
[
7
]].
first
);
fvec8
blockAtomEpsilon
(
atomParameters
[
blockAtom
[
0
]].
second
,
atomParameters
[
blockAtom
[
1
]].
second
,
atomParameters
[
blockAtom
[
2
]].
second
,
atomParameters
[
blockAtom
[
3
]].
second
,
atomParameters
[
blockAtom
[
4
]].
second
,
atomParameters
[
blockAtom
[
5
]].
second
,
atomParameters
[
blockAtom
[
6
]].
second
,
atomParameters
[
blockAtom
[
7
]].
second
);
fvec8
blockAtomEpsilon
(
atomParameters
[
blockAtom
[
0
]].
second
,
atomParameters
[
blockAtom
[
1
]].
second
,
atomParameters
[
blockAtom
[
2
]].
second
,
atomParameters
[
blockAtom
[
3
]].
second
,
atomParameters
[
blockAtom
[
4
]].
second
,
atomParameters
[
blockAtom
[
5
]].
second
,
atomParameters
[
blockAtom
[
6
]].
second
,
atomParameters
[
blockAtom
[
7
]].
second
);
bool
needPeriodic
=
(
periodic
&&
(
any
(
blockAtomX
<
cutoffDistance
)
||
any
(
blockAtomY
<
cutoffDistance
)
||
any
(
blockAtomZ
<
cutoffDistance
)
||
const
bool
needPeriodic
=
(
PERIODIC_TYPE
==
PeriodicPerInteraction
||
PERIODIC_TYPE
==
PeriodicTriclinic
);
any
(
blockAtomX
>
boxSize
[
0
]
-
cutoffDistance
)
||
any
(
blockAtomY
>
boxSize
[
1
]
-
cutoffDistance
)
||
any
(
blockAtomZ
>
boxSize
[
2
]
-
cutoffDistance
)));
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
// Loop over neighbors for this block.
// Loop over neighbors for this block.
...
@@ -113,7 +159,10 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
...
@@ -113,7 +159,10 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
// Compute the distances to the block atoms.
// Compute the distances to the block atoms.
fvec8
dx
,
dy
,
dz
,
r2
;
fvec8
dx
,
dy
,
dz
,
r2
;
getDeltaR
<
TRICLINIC
>
(
&
posq
[
4
*
atom
],
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
fvec4
atomPos
(
posq
+
4
*
atom
);
if
(
PERIODIC_TYPE
==
PeriodicPerAtom
)
atomPos
-=
floor
((
atomPos
-
blockCenter
)
*
invBoxSize
+
0.5
f
)
*
boxSize
;
getDeltaR
<
PERIODIC_TYPE
>
(
atomPos
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
ivec8
include
;
ivec8
include
;
char
excl
=
exclusions
[
i
];
char
excl
=
exclusions
[
i
];
if
(
excl
==
0
)
if
(
excl
==
0
)
...
@@ -126,8 +175,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
...
@@ -126,8 +175,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
// Compute the interactions.
// Compute the interactions.
fvec8
r
=
sqrt
(
r2
);
fvec8
inverseR
=
rsqrt
(
r2
);
fvec8
inverseR
=
fvec8
(
1.0
f
)
/
r
;
fvec8
energy
,
dEdR
;
fvec8
energy
,
dEdR
;
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
if
(
atomEpsilon
!=
0.0
f
)
{
if
(
atomEpsilon
!=
0.0
f
)
{
...
@@ -139,6 +187,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
...
@@ -139,6 +187,7 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
dEdR
=
epsSig6
*
(
12.0
f
*
sig6
-
6.0
f
);
dEdR
=
epsSig6
*
(
12.0
f
*
sig6
-
6.0
f
);
energy
=
epsSig6
*
(
sig6
-
1.0
f
);
energy
=
epsSig6
*
(
sig6
-
1.0
f
);
if
(
useSwitch
)
{
if
(
useSwitch
)
{
fvec8
r
=
r2
*
inverseR
;
fvec8
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
*
invSwitchingInterval
);
fvec8
t
=
(
r
>
switchingDistance
)
&
((
r
-
switchingDistance
)
*
invSwitchingInterval
);
fvec8
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec8
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec8
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
*
invSwitchingInterval
;
fvec8
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
*
invSwitchingInterval
;
...
@@ -193,28 +242,72 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
...
@@ -193,28 +242,72 @@ void CpuNonbondedForceVec8::calculateBlockIxnImpl(int blockIndex, float* forces,
}
}
void
CpuNonbondedForceVec8
::
calculateBlockEwaldIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
void
CpuNonbondedForceVec8
::
calculateBlockEwaldIxn
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
if
(
triclinic
)
// Determine whether we need to apply periodic boundary conditions.
calculateBlockEwaldIxnImpl
<
true
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
);
PeriodicType
periodicType
;
fvec4
blockCenter
;
if
(
!
periodic
)
{
periodicType
=
NoPeriodic
;
blockCenter
=
0.0
f
;
}
else
{
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
8
*
blockIndex
];
float
minx
,
maxx
,
miny
,
maxy
,
minz
,
maxz
;
minx
=
maxx
=
posq
[
4
*
blockAtom
[
0
]];
miny
=
maxy
=
posq
[
4
*
blockAtom
[
0
]
+
1
];
minz
=
maxz
=
posq
[
4
*
blockAtom
[
0
]
+
2
];
for
(
int
i
=
1
;
i
<
8
;
i
++
)
{
minx
=
min
(
minx
,
posq
[
4
*
blockAtom
[
i
]]);
maxx
=
max
(
maxx
,
posq
[
4
*
blockAtom
[
i
]]);
miny
=
min
(
miny
,
posq
[
4
*
blockAtom
[
i
]
+
1
]);
maxy
=
max
(
maxy
,
posq
[
4
*
blockAtom
[
i
]
+
1
]);
minz
=
min
(
minz
,
posq
[
4
*
blockAtom
[
i
]
+
2
]);
maxz
=
max
(
maxz
,
posq
[
4
*
blockAtom
[
i
]
+
2
]);
}
blockCenter
=
fvec4
(
0.5
f
*
(
minx
+
maxx
),
0.5
f
*
(
miny
+
maxy
),
0.5
f
*
(
minz
+
maxz
),
0.0
f
);
if
(
!
(
minx
<
cutoffDistance
||
miny
<
cutoffDistance
||
minz
<
cutoffDistance
||
maxx
>
boxSize
[
0
]
-
cutoffDistance
||
maxy
>
boxSize
[
1
]
-
cutoffDistance
||
maxz
>
boxSize
[
2
]
-
cutoffDistance
))
periodicType
=
NoPeriodic
;
else
if
(
triclinic
)
periodicType
=
PeriodicTriclinic
;
else
if
(
0.5
f
*
(
boxSize
[
0
]
-
(
maxx
-
minx
))
>=
cutoffDistance
&&
0.5
f
*
(
boxSize
[
1
]
-
(
maxy
-
miny
))
>=
cutoffDistance
&&
0.5
f
*
(
boxSize
[
2
]
-
(
maxz
-
minz
))
>=
cutoffDistance
)
periodicType
=
PeriodicPerAtom
;
else
else
calculateBlockEwaldIxnImpl
<
false
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
);
periodicType
=
PeriodicPerInteraction
;
}
// Call the appropriate version depending on what calculation is required for periodic boundary conditions.
if
(
periodicType
==
NoPeriodic
)
calculateBlockEwaldIxnImpl
<
NoPeriodic
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicPerAtom
)
calculateBlockEwaldIxnImpl
<
PeriodicPerAtom
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicPerInteraction
)
calculateBlockEwaldIxnImpl
<
PeriodicPerInteraction
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
else
if
(
periodicType
==
PeriodicTriclinic
)
calculateBlockEwaldIxnImpl
<
PeriodicTriclinic
>
(
blockIndex
,
forces
,
totalEnergy
,
boxSize
,
invBoxSize
,
blockCenter
);
}
}
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
CpuNonbondedForceVec8
::
calculateBlockEwaldIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
{
void
CpuNonbondedForceVec8
::
calculateBlockEwaldIxnImpl
(
int
blockIndex
,
float
*
forces
,
double
*
totalEnergy
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
,
const
fvec4
&
blockCenter
)
{
// Load the positions and parameters of the atoms in the block.
// Load the positions and parameters of the atoms in the block.
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
8
*
blockIndex
];
const
int
*
blockAtom
=
&
neighborList
->
getSortedAtoms
()[
8
*
blockIndex
];
fvec4
blockAtomPosq
[
8
];
fvec4
blockAtomPosq
[
8
];
fvec8
blockAtomForceX
(
0.0
f
),
blockAtomForceY
(
0.0
f
),
blockAtomForceZ
(
0.0
f
);
fvec8
blockAtomForceX
(
0.0
f
),
blockAtomForceY
(
0.0
f
),
blockAtomForceZ
(
0.0
f
);
fvec8
blockAtomX
,
blockAtomY
,
blockAtomZ
,
blockAtomCharge
;
fvec8
blockAtomX
,
blockAtomY
,
blockAtomZ
,
blockAtomCharge
;
for
(
int
i
=
0
;
i
<
8
;
i
++
)
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
blockAtomPosq
[
i
]
=
fvec4
(
posq
+
4
*
blockAtom
[
i
]);
blockAtomPosq
[
i
]
=
fvec4
(
posq
+
4
*
blockAtom
[
i
]);
if
(
PERIODIC_TYPE
==
PeriodicPerAtom
)
blockAtomPosq
[
i
]
-=
floor
((
blockAtomPosq
[
i
]
-
blockCenter
)
*
invBoxSize
+
0.5
f
)
*
boxSize
;
}
transpose
(
blockAtomPosq
[
0
],
blockAtomPosq
[
1
],
blockAtomPosq
[
2
],
blockAtomPosq
[
3
],
blockAtomPosq
[
4
],
blockAtomPosq
[
5
],
blockAtomPosq
[
6
],
blockAtomPosq
[
7
],
blockAtomX
,
blockAtomY
,
blockAtomZ
,
blockAtomCharge
);
transpose
(
blockAtomPosq
[
0
],
blockAtomPosq
[
1
],
blockAtomPosq
[
2
],
blockAtomPosq
[
3
],
blockAtomPosq
[
4
],
blockAtomPosq
[
5
],
blockAtomPosq
[
6
],
blockAtomPosq
[
7
],
blockAtomX
,
blockAtomY
,
blockAtomZ
,
blockAtomCharge
);
blockAtomCharge
*=
ONE_4PI_EPS0
;
blockAtomCharge
*=
ONE_4PI_EPS0
;
fvec8
blockAtomSigma
(
atomParameters
[
blockAtom
[
0
]].
first
,
atomParameters
[
blockAtom
[
1
]].
first
,
atomParameters
[
blockAtom
[
2
]].
first
,
atomParameters
[
blockAtom
[
3
]].
first
,
atomParameters
[
blockAtom
[
4
]].
first
,
atomParameters
[
blockAtom
[
5
]].
first
,
atomParameters
[
blockAtom
[
6
]].
first
,
atomParameters
[
blockAtom
[
7
]].
first
);
fvec8
blockAtomSigma
(
atomParameters
[
blockAtom
[
0
]].
first
,
atomParameters
[
blockAtom
[
1
]].
first
,
atomParameters
[
blockAtom
[
2
]].
first
,
atomParameters
[
blockAtom
[
3
]].
first
,
atomParameters
[
blockAtom
[
4
]].
first
,
atomParameters
[
blockAtom
[
5
]].
first
,
atomParameters
[
blockAtom
[
6
]].
first
,
atomParameters
[
blockAtom
[
7
]].
first
);
fvec8
blockAtomEpsilon
(
atomParameters
[
blockAtom
[
0
]].
second
,
atomParameters
[
blockAtom
[
1
]].
second
,
atomParameters
[
blockAtom
[
2
]].
second
,
atomParameters
[
blockAtom
[
3
]].
second
,
atomParameters
[
blockAtom
[
4
]].
second
,
atomParameters
[
blockAtom
[
5
]].
second
,
atomParameters
[
blockAtom
[
6
]].
second
,
atomParameters
[
blockAtom
[
7
]].
second
);
fvec8
blockAtomEpsilon
(
atomParameters
[
blockAtom
[
0
]].
second
,
atomParameters
[
blockAtom
[
1
]].
second
,
atomParameters
[
blockAtom
[
2
]].
second
,
atomParameters
[
blockAtom
[
3
]].
second
,
atomParameters
[
blockAtom
[
4
]].
second
,
atomParameters
[
blockAtom
[
5
]].
second
,
atomParameters
[
blockAtom
[
6
]].
second
,
atomParameters
[
blockAtom
[
7
]].
second
);
bool
needPeriodic
=
(
periodic
&&
(
any
(
blockAtomX
<
cutoffDistance
)
||
any
(
blockAtomY
<
cutoffDistance
)
||
any
(
blockAtomZ
<
cutoffDistance
)
||
const
bool
needPeriodic
=
(
PERIODIC_TYPE
==
PeriodicPerInteraction
||
PERIODIC_TYPE
==
PeriodicTriclinic
);
any
(
blockAtomX
>
boxSize
[
0
]
-
cutoffDistance
)
||
any
(
blockAtomY
>
boxSize
[
1
]
-
cutoffDistance
)
||
any
(
blockAtomZ
>
boxSize
[
2
]
-
cutoffDistance
)));
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
const
float
invSwitchingInterval
=
1
/
(
cutoffDistance
-
switchingDistance
);
// Loop over neighbors for this block.
// Loop over neighbors for this block.
...
@@ -229,7 +322,10 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
...
@@ -229,7 +322,10 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
// Compute the distances to the block atoms.
// Compute the distances to the block atoms.
fvec8
dx
,
dy
,
dz
,
r2
;
fvec8
dx
,
dy
,
dz
,
r2
;
getDeltaR
<
TRICLINIC
>
(
&
posq
[
4
*
atom
],
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
fvec4
atomPos
(
posq
+
4
*
atom
);
if
(
PERIODIC_TYPE
==
PeriodicPerAtom
)
atomPos
-=
floor
((
atomPos
-
blockCenter
)
*
invBoxSize
+
0.5
f
)
*
boxSize
;
getDeltaR
<
PERIODIC_TYPE
>
(
atomPos
,
blockAtomX
,
blockAtomY
,
blockAtomZ
,
dx
,
dy
,
dz
,
r2
,
needPeriodic
,
boxSize
,
invBoxSize
);
ivec8
include
;
ivec8
include
;
char
excl
=
exclusions
[
i
];
char
excl
=
exclusions
[
i
];
if
(
excl
==
0
)
if
(
excl
==
0
)
...
@@ -242,8 +338,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
...
@@ -242,8 +338,8 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
// Compute the interactions.
// Compute the interactions.
fvec8
r
=
sqrt
(
r2
);
fvec8
inverseR
=
r
sqrt
(
r2
);
fvec8
inverseR
=
fvec8
(
1.0
f
)
/
r
;
fvec8
r
=
r2
*
inverseR
;
fvec8
energy
,
dEdR
;
fvec8
energy
,
dEdR
;
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
float
atomEpsilon
=
atomParameters
[
atom
].
second
;
if
(
atomEpsilon
!=
0.0
f
)
{
if
(
atomEpsilon
!=
0.0
f
)
{
...
@@ -302,13 +398,12 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
...
@@ -302,13 +398,12 @@ void CpuNonbondedForceVec8::calculateBlockEwaldIxnImpl(int blockIndex, float* fo
(
fvec4
(
forces
+
4
*
blockAtom
[
j
])
+
f
[
j
]).
store
(
forces
+
4
*
blockAtom
[
j
]);
(
fvec4
(
forces
+
4
*
blockAtom
[
j
])
+
f
[
j
]).
store
(
forces
+
4
*
blockAtom
[
j
]);
}
}
template
<
bool
TRICLINIC
>
template
<
int
PERIODIC_TYPE
>
void
CpuNonbondedForceVec8
::
getDeltaR
(
const
f
loat
*
posI
,
const
fvec8
&
x
,
const
fvec8
&
y
,
const
fvec8
&
z
,
fvec8
&
dx
,
fvec8
&
dy
,
fvec8
&
dz
,
fvec8
&
r2
,
bool
periodic
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
const
{
void
CpuNonbondedForceVec8
::
getDeltaR
(
const
f
vec4
&
posI
,
const
fvec8
&
x
,
const
fvec8
&
y
,
const
fvec8
&
z
,
fvec8
&
dx
,
fvec8
&
dy
,
fvec8
&
dz
,
fvec8
&
r2
,
bool
periodic
,
const
fvec4
&
boxSize
,
const
fvec4
&
invBoxSize
)
const
{
dx
=
x
-
posI
[
0
];
dx
=
x
-
posI
[
0
];
dy
=
y
-
posI
[
1
];
dy
=
y
-
posI
[
1
];
dz
=
z
-
posI
[
2
];
dz
=
z
-
posI
[
2
];
if
(
periodic
)
{
if
(
PERIODIC_TYPE
==
PeriodicTriclinic
)
{
if
(
TRICLINIC
)
{
fvec8
scale3
=
floor
(
dz
*
recipBoxSize
[
2
]
+
0.5
f
);
fvec8
scale3
=
floor
(
dz
*
recipBoxSize
[
2
]
+
0.5
f
);
dx
-=
scale3
*
periodicBoxVectors
[
2
][
0
];
dx
-=
scale3
*
periodicBoxVectors
[
2
][
0
];
dy
-=
scale3
*
periodicBoxVectors
[
2
][
1
];
dy
-=
scale3
*
periodicBoxVectors
[
2
][
1
];
...
@@ -319,12 +414,11 @@ void CpuNonbondedForceVec8::getDeltaR(const float* posI, const fvec8& x, const f
...
@@ -319,12 +414,11 @@ void CpuNonbondedForceVec8::getDeltaR(const float* posI, const fvec8& x, const f
fvec8
scale1
=
floor
(
dx
*
recipBoxSize
[
0
]
+
0.5
f
);
fvec8
scale1
=
floor
(
dx
*
recipBoxSize
[
0
]
+
0.5
f
);
dx
-=
scale1
*
periodicBoxVectors
[
0
][
0
];
dx
-=
scale1
*
periodicBoxVectors
[
0
][
0
];
}
}
else
{
else
if
(
PERIODIC_TYPE
==
PeriodicPerInteraction
)
{
dx
-=
round
(
dx
*
invBoxSize
[
0
])
*
boxSize
[
0
];
dx
-=
round
(
dx
*
invBoxSize
[
0
])
*
boxSize
[
0
];
dy
-=
round
(
dy
*
invBoxSize
[
1
])
*
boxSize
[
1
];
dy
-=
round
(
dy
*
invBoxSize
[
1
])
*
boxSize
[
1
];
dz
-=
round
(
dz
*
invBoxSize
[
2
])
*
boxSize
[
2
];
dz
-=
round
(
dz
*
invBoxSize
[
2
])
*
boxSize
[
2
];
}
}
}
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
r2
=
dx
*
dx
+
dy
*
dy
+
dz
*
dz
;
}
}
...
...
platforms/opencl/include/OpenCLFFT3D.h
View file @
b21e3182
...
@@ -9,7 +9,7 @@
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2009 Stanford University and the Authors.
*
* Portions copyright (c) 2009
-2015
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -40,7 +40,7 @@ namespace OpenMM {
...
@@ -40,7 +40,7 @@ namespace OpenMM {
* 15, 207–228 (2000).
* 15, 207–228 (2000).
* <p>
* <p>
* This class places certain restrictions on the allowed dimensions of the grid. First,
* This class places certain restrictions on the allowed dimensions of the grid. First,
* the size of each dimension may have no prime factors other than 2, 3, and
5
. You
* the size of each dimension may have no prime factors other than 2, 3,
5,
and
7
. You
* can call findLegalDimension() to determine the smallest size that satisfies this
* can call findLegalDimension() to determine the smallest size that satisfies this
* requirement and is greater than or equal to a specified minimum size. Second, the size
* requirement and is greater than or equal to a specified minimum size. Second, the size
* of each dimension must be small enough to compute each 1D transform entirely in local
* of each dimension must be small enough to compute each 1D transform entirely in local
...
@@ -61,12 +61,17 @@ public:
...
@@ -61,12 +61,17 @@ public:
* @param xsize the first dimension of the data sets on which FFTs will be performed
* @param xsize the first dimension of the data sets on which FFTs will be performed
* @param ysize the second dimension of the data sets on which FFTs will be performed
* @param ysize the second dimension of the data sets on which FFTs will be performed
* @param zsize the third dimension of the data sets on which FFTs will be performed
* @param zsize the third dimension of the data sets on which FFTs will be performed
* @param realToComplex if true, a real-to-complex transform will be done. Otherwise, it is complex-to-complex.
*/
*/
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
);
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
,
bool
realToComplex
=
false
);
/**
/**
* Perform a Fourier transform. The transform cannot be done in-place: the input and output
* Perform a Fourier transform. The transform cannot be done in-place: the input and output
* arrays must be different. Also, the input array is used as workspace, so its contents
* arrays must be different. Also, the input array is used as workspace, so its contents
* are destroyed.
* are destroyed. This also means that both arrays must be large enough to hold complex values,
* even when performing a real-to-complex transform.
* <p>
* When performing a real-to-complex transform, the output data is of size xsize*ysize*(zsize/2+1)
* and contains only the non-redundant elements.
*
*
* @param in the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
* @param in the data to transform, ordered such that in[x*ysize*zsize + y*zsize + z] contains element (x, y, z)
* @param out on exit, this contains the transformed data
* @param out on exit, this contains the transformed data
...
@@ -75,17 +80,20 @@ public:
...
@@ -75,17 +80,20 @@ public:
void
execFFT
(
OpenCLArray
&
in
,
OpenCLArray
&
out
,
bool
forward
=
true
);
void
execFFT
(
OpenCLArray
&
in
,
OpenCLArray
&
out
,
bool
forward
=
true
);
/**
/**
* Get the smallest legal size for a dimension of the grid (that is, a size with no prime
* Get the smallest legal size for a dimension of the grid (that is, a size with no prime
* factors other than 2, 3, and
5
).
* factors other than 2, 3,
5,
and
7
).
*
*
* @param minimum the minimum size the return value must be greater than or equal to
* @param minimum the minimum size the return value must be greater than or equal to
*/
*/
static
int
findLegalDimension
(
int
minimum
);
static
int
findLegalDimension
(
int
minimum
);
private:
private:
cl
::
Kernel
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
);
cl
::
Kernel
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
,
int
axis
,
bool
forward
,
bool
inputIsReal
);
int
xsize
,
ysize
,
zsize
;
int
xsize
,
ysize
,
zsize
;
int
xthreads
,
ythreads
,
zthreads
;
int
xthreads
,
ythreads
,
zthreads
;
bool
packRealAsComplex
;
OpenCLContext
&
context
;
OpenCLContext
&
context
;
cl
::
Kernel
xkernel
,
ykernel
,
zkernel
;
cl
::
Kernel
xkernel
,
ykernel
,
zkernel
;
cl
::
Kernel
invxkernel
,
invykernel
,
invzkernel
;
cl
::
Kernel
packForwardKernel
,
unpackForwardKernel
,
packBackwardKernel
,
unpackBackwardKernel
;
};
};
}
// namespace OpenMM
}
// namespace OpenMM
...
...
platforms/opencl/include/OpenCLKernels.h
View file @
b21e3182
...
@@ -639,6 +639,7 @@ private:
...
@@ -639,6 +639,7 @@ private:
cl
::
Kernel
pmeSpreadChargeKernel
;
cl
::
Kernel
pmeSpreadChargeKernel
;
cl
::
Kernel
pmeFinishSpreadChargeKernel
;
cl
::
Kernel
pmeFinishSpreadChargeKernel
;
cl
::
Kernel
pmeConvolutionKernel
;
cl
::
Kernel
pmeConvolutionKernel
;
cl
::
Kernel
pmeEvalEnergyKernel
;
cl
::
Kernel
pmeInterpolateForceKernel
;
cl
::
Kernel
pmeInterpolateForceKernel
;
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
std
::
map
<
std
::
string
,
std
::
string
>
pmeDefines
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
std
::
vector
<
std
::
pair
<
int
,
int
>
>
exceptionAtoms
;
...
...
platforms/opencl/src/OpenCLContext.cpp
View file @
b21e3182
...
@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
...
@@ -106,9 +106,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// if they supplied a valid deviceIndex, we only look through that one
// if they supplied a valid deviceIndex, we only look through that one
if
(
i
!=
deviceIndex
&&
deviceIndex
>=
0
&&
deviceIndex
<
(
int
)
devices
.
size
())
if
(
i
!=
deviceIndex
&&
deviceIndex
>=
0
&&
deviceIndex
<
(
int
)
devices
.
size
())
continue
;
continue
;
if
(
platformVendor
==
"Apple"
&&
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
))
if
(
platformVendor
==
"Apple"
&&
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
||
devices
[
i
].
getInfo
<
CL_DEVICE_VENDOR
>
()
==
"AMD"
))
continue
;
// The CPU device on OS X won't work correctly.
continue
;
// The CPU device on OS X won't work correctly, and there are serious bugs using AMD GPUs.
int
maxSize
=
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_WORK_ITEM_SIZES
>
()[
0
];
int
maxSize
=
devices
[
i
].
getInfo
<
CL_DEVICE_MAX_WORK_ITEM_SIZES
>
()[
0
];
int
processingElementsPerComputeUnit
=
8
;
int
processingElementsPerComputeUnit
=
8
;
if
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
!=
CL_DEVICE_TYPE_GPU
)
{
if
(
devices
[
i
].
getInfo
<
CL_DEVICE_TYPE
>
()
!=
CL_DEVICE_TYPE_GPU
)
{
...
@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
...
@@ -170,6 +169,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines
[
"WORK_GROUP_SIZE"
]
=
intToString
(
ThreadBlockSize
);
compilationDefines
[
"WORK_GROUP_SIZE"
]
=
intToString
(
ThreadBlockSize
);
if
(
platformVendor
.
size
()
>=
5
&&
platformVendor
.
substr
(
0
,
5
)
==
"Intel"
)
if
(
platformVendor
.
size
()
>=
5
&&
platformVendor
.
substr
(
0
,
5
)
==
"Intel"
)
defaultOptimizationOptions
=
""
;
defaultOptimizationOptions
=
""
;
else
if
(
platformVendor
==
"Apple"
)
defaultOptimizationOptions
=
"-cl-mad-enable -cl-no-signed-zeros"
;
else
else
defaultOptimizationOptions
=
"-cl-fast-relaxed-math"
;
defaultOptimizationOptions
=
"-cl-fast-relaxed-math"
;
supports64BitGlobalAtomics
=
(
device
.
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_khr_int64_base_atomics"
)
!=
string
::
npos
);
supports64BitGlobalAtomics
=
(
device
.
getInfo
<
CL_DEVICE_EXTENSIONS
>
().
find
(
"cl_khr_int64_base_atomics"
)
!=
string
::
npos
);
...
@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
...
@@ -241,8 +242,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
}
}
else
else
simdWidth
=
1
;
simdWidth
=
1
;
if
(
platformVendor
==
"Apple"
&&
vendor
==
"AMD"
)
compilationDefines
[
"MAC_AMD_WORKAROUND"
]
=
""
;
if
(
supports64BitGlobalAtomics
)
if
(
supports64BitGlobalAtomics
)
compilationDefines
[
"SUPPORTS_64_BIT_ATOMICS"
]
=
""
;
compilationDefines
[
"SUPPORTS_64_BIT_ATOMICS"
]
=
""
;
if
(
supportsDoublePrecision
)
if
(
supportsDoublePrecision
)
...
...
platforms/opencl/src/OpenCLFFT3D.cpp
View file @
b21e3182
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2009-201
2
Stanford University and the Authors. *
* Portions copyright (c) 2009-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -35,25 +35,109 @@
...
@@ -35,25 +35,109 @@
using
namespace
OpenMM
;
using
namespace
OpenMM
;
using
namespace
std
;
using
namespace
std
;
OpenCLFFT3D
::
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
)
:
context
(
context
),
xsize
(
xsize
),
ysize
(
ysize
),
zsize
(
zsize
)
{
OpenCLFFT3D
::
OpenCLFFT3D
(
OpenCLContext
&
context
,
int
xsize
,
int
ysize
,
int
zsize
,
bool
realToComplex
)
:
zkernel
=
createKernel
(
xsize
,
ysize
,
zsize
,
zthreads
);
context
(
context
),
xsize
(
xsize
),
ysize
(
ysize
),
zsize
(
zsize
)
{
xkernel
=
createKernel
(
ysize
,
zsize
,
xsize
,
xthreads
);
packRealAsComplex
=
false
;
ykernel
=
createKernel
(
zsize
,
xsize
,
ysize
,
ythreads
);
int
packedXSize
=
xsize
;
int
packedYSize
=
ysize
;
int
packedZSize
=
zsize
;
if
(
realToComplex
)
{
// If any axis size is even, we can pack the real values into a complex grid that is only half as large.
// Look for an appropriate axis.
packRealAsComplex
=
true
;
int
packedAxis
,
bufferSize
;
if
(
xsize
%
2
==
0
)
{
packedAxis
=
0
;
packedXSize
/=
2
;
bufferSize
=
packedXSize
;
}
else
if
(
ysize
%
2
==
0
)
{
packedAxis
=
1
;
packedYSize
/=
2
;
bufferSize
=
packedYSize
;
}
else
if
(
zsize
%
2
==
0
)
{
packedAxis
=
2
;
packedZSize
/=
2
;
bufferSize
=
packedZSize
;
}
else
packRealAsComplex
=
false
;
if
(
packRealAsComplex
)
{
// Build the kernels for packing and unpacking the data.
map
<
string
,
string
>
defines
;
defines
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
defines
[
"YSIZE"
]
=
context
.
intToString
(
ysize
);
defines
[
"ZSIZE"
]
=
context
.
intToString
(
zsize
);
defines
[
"PACKED_AXIS"
]
=
context
.
intToString
(
packedAxis
);
defines
[
"PACKED_XSIZE"
]
=
context
.
intToString
(
packedXSize
);
defines
[
"PACKED_YSIZE"
]
=
context
.
intToString
(
packedYSize
);
defines
[
"PACKED_ZSIZE"
]
=
context
.
intToString
(
packedZSize
);
defines
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
cl
::
Program
program
=
context
.
createProgram
(
OpenCLKernelSources
::
fftR2C
,
defines
);
packForwardKernel
=
cl
::
Kernel
(
program
,
"packForwardData"
);
unpackForwardKernel
=
cl
::
Kernel
(
program
,
"unpackForwardData"
);
unpackForwardKernel
.
setArg
(
2
,
bufferSize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
)),
NULL
);
packBackwardKernel
=
cl
::
Kernel
(
program
,
"packBackwardData"
);
packBackwardKernel
.
setArg
(
2
,
bufferSize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
)),
NULL
);
unpackBackwardKernel
=
cl
::
Kernel
(
program
,
"unpackBackwardData"
);
}
}
bool
inputIsReal
=
(
realToComplex
&&
!
packRealAsComplex
);
zkernel
=
createKernel
(
packedXSize
,
packedYSize
,
packedZSize
,
zthreads
,
0
,
true
,
inputIsReal
);
xkernel
=
createKernel
(
packedYSize
,
packedZSize
,
packedXSize
,
xthreads
,
1
,
true
,
inputIsReal
);
ykernel
=
createKernel
(
packedZSize
,
packedXSize
,
packedYSize
,
ythreads
,
2
,
true
,
inputIsReal
);
invzkernel
=
createKernel
(
packedXSize
,
packedYSize
,
packedZSize
,
zthreads
,
0
,
false
,
inputIsReal
);
invxkernel
=
createKernel
(
packedYSize
,
packedZSize
,
packedXSize
,
xthreads
,
1
,
false
,
inputIsReal
);
invykernel
=
createKernel
(
packedZSize
,
packedXSize
,
packedYSize
,
ythreads
,
2
,
false
,
inputIsReal
);
}
}
void
OpenCLFFT3D
::
execFFT
(
OpenCLArray
&
in
,
OpenCLArray
&
out
,
bool
forward
)
{
void
OpenCLFFT3D
::
execFFT
(
OpenCLArray
&
in
,
OpenCLArray
&
out
,
bool
forward
)
{
zkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
cl
::
Kernel
kernel1
=
(
forward
?
zkernel
:
invzkernel
);
zkernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
cl
::
Kernel
kernel2
=
(
forward
?
xkernel
:
invxkernel
);
zkernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
cl
::
Kernel
kernel3
=
(
forward
?
ykernel
:
invykernel
);
context
.
executeKernel
(
zkernel
,
xsize
*
ysize
*
zsize
,
zthreads
);
if
(
packRealAsComplex
)
{
xkernel
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
cl
::
Kernel
packKernel
=
(
forward
?
packForwardKernel
:
packBackwardKernel
);
xkernel
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
cl
::
Kernel
unpackKernel
=
(
forward
?
unpackForwardKernel
:
unpackBackwardKernel
);
xkernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
int
gridSize
=
xsize
*
ysize
*
zsize
/
2
;
context
.
executeKernel
(
xkernel
,
xsize
*
ysize
*
zsize
,
xthreads
);
ykernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
// Pack the data into a half sized grid.
ykernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
ykernel
.
setArg
<
cl_int
>
(
2
,
forward
?
1
:
-
1
);
packKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
context
.
executeKernel
(
ykernel
,
xsize
*
ysize
*
zsize
,
ythreads
);
packKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
packKernel
,
gridSize
);
// Perform the FFT.
kernel1
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel1
,
gridSize
,
zthreads
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel2
,
gridSize
,
xthreads
);
kernel3
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
kernel3
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel3
,
gridSize
,
ythreads
);
// Unpack the data.
unpackKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
unpackKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
unpackKernel
,
gridSize
);
}
else
{
kernel1
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
kernel1
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel1
,
xsize
*
ysize
*
zsize
,
zthreads
);
kernel2
.
setArg
<
cl
::
Buffer
>
(
0
,
out
.
getDeviceBuffer
());
kernel2
.
setArg
<
cl
::
Buffer
>
(
1
,
in
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel2
,
xsize
*
ysize
*
zsize
,
xthreads
);
kernel3
.
setArg
<
cl
::
Buffer
>
(
0
,
in
.
getDeviceBuffer
());
kernel3
.
setArg
<
cl
::
Buffer
>
(
1
,
out
.
getDeviceBuffer
());
context
.
executeKernel
(
kernel3
,
xsize
*
ysize
*
zsize
,
ythreads
);
}
}
}
int
OpenCLFFT3D
::
findLegalDimension
(
int
minimum
)
{
int
OpenCLFFT3D
::
findLegalDimension
(
int
minimum
)
{
...
@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
...
@@ -73,8 +157,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
}
}
}
}
cl
::
Kernel
OpenCLFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
)
{
cl
::
Kernel
OpenCLFFT3D
::
createKernel
(
int
xsize
,
int
ysize
,
int
zsize
,
int
&
threads
,
int
axis
,
bool
forward
,
bool
inputIsReal
)
{
int
maxThreads
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
int
maxThreads
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
while
(
maxThreads
>
128
&&
maxThreads
-
64
>=
zsize
)
maxThreads
-=
64
;
bool
isCPU
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
;
bool
isCPU
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
;
while
(
true
)
{
while
(
true
)
{
bool
loopRequired
=
(
zsize
>
maxThreads
||
isCPU
);
bool
loopRequired
=
(
zsize
>
maxThreads
||
isCPU
);
...
@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
...
@@ -137,10 +223,10 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 b2 = "
<<
context
.
doubleToString
((
2
*
cos
(
2
*
M_PI
/
7
)
-
cos
(
4
*
M_PI
/
7
)
-
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d0-d4);
\n
"
;
source
<<
"real2 b2 = "
<<
context
.
doubleToString
((
2
*
cos
(
2
*
M_PI
/
7
)
-
cos
(
4
*
M_PI
/
7
)
-
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d0-d4);
\n
"
;
source
<<
"real2 b3 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
-
2
*
cos
(
4
*
M_PI
/
7
)
+
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d4-d2);
\n
"
;
source
<<
"real2 b3 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
-
2
*
cos
(
4
*
M_PI
/
7
)
+
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d4-d2);
\n
"
;
source
<<
"real2 b4 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
+
cos
(
4
*
M_PI
/
7
)
-
2
*
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d2-d0);
\n
"
;
source
<<
"real2 b4 = "
<<
context
.
doubleToString
((
cos
(
2
*
M_PI
/
7
)
+
cos
(
4
*
M_PI
/
7
)
-
2
*
cos
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d2-d0);
\n
"
;
source
<<
"real2 b5 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d7+d1);
\n
"
;
source
<<
"real2 b5 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d7+d1);
\n
"
;
source
<<
"real2 b6 = -
sign
*"
<<
context
.
doubleToString
((
2
*
sin
(
2
*
M_PI
/
7
)
-
sin
(
4
*
M_PI
/
7
)
+
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d1-d5);
\n
"
;
source
<<
"real2 b6 = -
(SIGN)
*"
<<
context
.
doubleToString
((
2
*
sin
(
2
*
M_PI
/
7
)
-
sin
(
4
*
M_PI
/
7
)
+
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d1-d5);
\n
"
;
source
<<
"real2 b7 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
-
2
*
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d5-d3);
\n
"
;
source
<<
"real2 b7 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
-
2
*
sin
(
4
*
M_PI
/
7
)
-
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d5-d3);
\n
"
;
source
<<
"real2 b8 = -
sign
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
+
2
*
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d3-d1);
\n
"
;
source
<<
"real2 b8 = -
(SIGN)
*"
<<
context
.
doubleToString
((
sin
(
2
*
M_PI
/
7
)
+
sin
(
4
*
M_PI
/
7
)
+
2
*
sin
(
6
*
M_PI
/
7
))
/
3
)
<<
"*(d3-d1);
\n
"
;
source
<<
"real2 t0 = b0+b1;
\n
"
;
source
<<
"real2 t0 = b0+b1;
\n
"
;
source
<<
"real2 t1 = b2+b3;
\n
"
;
source
<<
"real2 t1 = b2+b3;
\n
"
;
source
<<
"real2 t2 = b4-b3;
\n
"
;
source
<<
"real2 t2 = b4-b3;
\n
"
;
...
@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
...
@@ -178,8 +264,8 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 d7 = d6+d5;
\n
"
;
source
<<
"real2 d7 = d6+d5;
\n
"
;
source
<<
"real2 d8 = d6-d5;
\n
"
;
source
<<
"real2 d8 = d6-d5;
\n
"
;
string
coeff
=
context
.
doubleToString
(
sin
(
0.2
*
M_PI
)
/
sin
(
0.4
*
M_PI
));
string
coeff
=
context
.
doubleToString
(
sin
(
0.2
*
M_PI
)
/
sin
(
0.4
*
M_PI
));
source
<<
"real2 d9 =
sign
*(real2) (d2.y+"
<<
coeff
<<
"*d3.y, -d2.x-"
<<
coeff
<<
"*d3.x);
\n
"
;
source
<<
"real2 d9 =
(SIGN)
*(real2) (d2.y+"
<<
coeff
<<
"*d3.y, -d2.x-"
<<
coeff
<<
"*d3.x);
\n
"
;
source
<<
"real2 d10 =
sign
*(real2) ("
<<
coeff
<<
"*d2.y-d3.y, d3.x-"
<<
coeff
<<
"*d2.x);
\n
"
;
source
<<
"real2 d10 =
(SIGN)
*(real2) ("
<<
coeff
<<
"*d2.y-d3.y, d3.x-"
<<
coeff
<<
"*d2.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+4*j*"
<<
m
<<
"] = c0+d4;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+4*j*"
<<
m
<<
"] = c0+d4;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
5
*
L
)
<<
"], d7+d9);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
5
*
L
)
<<
"], d7+d9);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d8+d10);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(4*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
5
*
L
)
<<
"], d8+d10);
\n
"
;
...
@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
...
@@ -194,7 +280,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 d0 = c0+c2;
\n
"
;
source
<<
"real2 d0 = c0+c2;
\n
"
;
source
<<
"real2 d1 = c0-c2;
\n
"
;
source
<<
"real2 d1 = c0-c2;
\n
"
;
source
<<
"real2 d2 = c1+c3;
\n
"
;
source
<<
"real2 d2 = c1+c3;
\n
"
;
source
<<
"real2 d3 =
sign
*(real2) (c1.y-c3.y, c3.x-c1.x);
\n
"
;
source
<<
"real2 d3 =
(SIGN)
*(real2) (c1.y-c3.y, c3.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+3*j*"
<<
m
<<
"] = d0+d2;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+3*j*"
<<
m
<<
"] = d0+d2;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
4
*
L
)
<<
"], d1+d3);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
4
*
L
)
<<
"], d1+d3);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
4
*
L
)
<<
"], d0-d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(3*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
4
*
L
)
<<
"], d0-d2);
\n
"
;
...
@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
...
@@ -206,7 +292,7 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 c2 = data"
<<
input
<<
"[base+"
<<
(
2
*
L
*
m
)
<<
"];
\n
"
;
source
<<
"real2 d0 = c1+c2;
\n
"
;
source
<<
"real2 d0 = c1+c2;
\n
"
;
source
<<
"real2 d1 = c0-0.5f*d0;
\n
"
;
source
<<
"real2 d1 = c0-0.5f*d0;
\n
"
;
source
<<
"real2 d2 =
sign
*"
<<
context
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(real2) (c1.y-c2.y, c2.x-c1.x);
\n
"
;
source
<<
"real2 d2 =
(SIGN)
*"
<<
context
.
doubleToString
(
sin
(
M_PI
/
3.0
))
<<
"*(real2) (c1.y-c2.y, c2.x-c1.x);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+2*j*"
<<
m
<<
"] = c0+d0;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+2*j*"
<<
m
<<
"] = c0+d0;
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
3
*
L
)
<<
"], d1+d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+1)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
zsize
<<
"/"
<<
(
3
*
L
)
<<
"], d1+d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
3
*
L
)
<<
"], d1-d2);
\n
"
;
source
<<
"data"
<<
output
<<
"[base+(2*j+2)*"
<<
m
<<
"] = multiplyComplex(w[j*"
<<
(
2
*
zsize
)
<<
"/"
<<
(
3
*
L
)
<<
"], d1-d2);
\n
"
;
...
@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
...
@@ -226,13 +312,27 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
// Create the kernel.
// Create the kernel.
bool
outputIsReal
=
(
inputIsReal
&&
axis
==
2
&&
!
forward
);
bool
outputIsPacked
=
(
inputIsReal
&&
axis
==
2
&&
forward
);
string
outputSuffix
=
(
outputIsReal
?
".x"
:
""
);
if
(
loopRequired
)
{
if
(
loopRequired
)
{
if
(
outputIsPacked
)
source
<<
"if (x < XSIZE/2+1)
\n
"
;
source
<<
"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
\n
"
;
source
<<
"for (int z = get_local_id(0); z < ZSIZE; z += get_local_size(0))
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[z];
\n
"
;
if
(
outputIsPacked
)
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+z*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[z]"
<<
outputSuffix
<<
";
\n
"
;
else
source
<<
"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[z]"
<<
outputSuffix
<<
";
\n
"
;
}
else
{
if
(
outputIsPacked
)
{
source
<<
"if (index < XSIZE*YSIZE && x < XSIZE/2+1)
\n
"
;
source
<<
"out[y*(ZSIZE*(XSIZE/2+1))+(get_local_id(0)%ZSIZE)*(XSIZE/2+1)+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)]"
<<
outputSuffix
<<
";
\n
"
;
}
}
else
{
else
{
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"if (index < XSIZE*YSIZE)
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)];
\n
"
;
source
<<
"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"
<<
(
stage
%
2
)
<<
"[get_local_id(0)]"
<<
outputSuffix
<<
";
\n
"
;
}
}
}
map
<
string
,
string
>
replacements
;
map
<
string
,
string
>
replacements
;
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
replacements
[
"XSIZE"
]
=
context
.
intToString
(
xsize
);
...
@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
...
@@ -242,6 +342,12 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
replacements
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
replacements
[
"M_PI"
]
=
context
.
doubleToString
(
M_PI
);
replacements
[
"COMPUTE_FFT"
]
=
source
.
str
();
replacements
[
"COMPUTE_FFT"
]
=
source
.
str
();
replacements
[
"LOOP_REQUIRED"
]
=
(
loopRequired
?
"1"
:
"0"
);
replacements
[
"LOOP_REQUIRED"
]
=
(
loopRequired
?
"1"
:
"0"
);
replacements
[
"SIGN"
]
=
(
forward
?
"1"
:
"-1"
);
replacements
[
"INPUT_TYPE"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"real"
:
"real2"
);
replacements
[
"OUTPUT_TYPE"
]
=
(
outputIsReal
?
"real"
:
"real2"
);
replacements
[
"INPUT_IS_REAL"
]
=
(
inputIsReal
&&
axis
==
0
&&
forward
?
"1"
:
"0"
);
replacements
[
"INPUT_IS_PACKED"
]
=
(
inputIsReal
&&
axis
==
0
&&
!
forward
?
"1"
:
"0"
);
replacements
[
"OUTPUT_IS_PACKED"
]
=
(
outputIsPacked
?
"1"
:
"0"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
fft
,
replacements
));
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
fft
,
replacements
));
cl
::
Kernel
kernel
(
program
,
"execFFT"
);
cl
::
Kernel
kernel
(
program
,
"execFFT"
);
threads
=
(
isCPU
?
1
:
blocksPerGroup
*
zsize
);
threads
=
(
isCPU
?
1
:
blocksPerGroup
*
zsize
);
...
@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
...
@@ -253,9 +359,9 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threa
continue
;
continue
;
}
}
int
bufferSize
=
blocksPerGroup
*
zsize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
));
int
bufferSize
=
blocksPerGroup
*
zsize
*
(
context
.
getUseDoublePrecision
()
?
sizeof
(
mm_double2
)
:
sizeof
(
mm_float2
));
kernel
.
setArg
(
2
,
bufferSize
,
NULL
);
kernel
.
setArg
(
3
,
bufferSize
,
NULL
);
kernel
.
setArg
(
3
,
bufferSize
,
NULL
);
kernel
.
setArg
(
4
,
bufferSize
,
NULL
);
kernel
.
setArg
(
4
,
bufferSize
,
NULL
);
kernel
.
setArg
(
5
,
bufferSize
,
NULL
);
return
kernel
;
return
kernel
;
}
}
}
}
platforms/opencl/src/OpenCLKernels.cpp
View file @
b21e3182
...
@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
...
@@ -1628,6 +1628,7 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeDefines
[
"GRID_SIZE_Y"
]
=
cl
.
intToString
(
gridSizeY
);
pmeDefines
[
"GRID_SIZE_Y"
]
=
cl
.
intToString
(
gridSizeY
);
pmeDefines
[
"GRID_SIZE_Z"
]
=
cl
.
intToString
(
gridSizeZ
);
pmeDefines
[
"GRID_SIZE_Z"
]
=
cl
.
intToString
(
gridSizeZ
);
pmeDefines
[
"EPSILON_FACTOR"
]
=
cl
.
doubleToString
(
sqrt
(
ONE_4PI_EPS0
));
pmeDefines
[
"EPSILON_FACTOR"
]
=
cl
.
doubleToString
(
sqrt
(
ONE_4PI_EPS0
));
pmeDefines
[
"M_PI"
]
=
cl
.
doubleToString
(
M_PI
);
bool
deviceIsCpu
=
(
cl
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
bool
deviceIsCpu
=
(
cl
.
getDevice
().
getInfo
<
CL_DEVICE_TYPE
>
()
==
CL_DEVICE_TYPE_CPU
);
if
(
deviceIsCpu
)
if
(
deviceIsCpu
)
pmeDefines
[
"DEVICE_IS_CPU"
]
=
"1"
;
pmeDefines
[
"DEVICE_IS_CPU"
]
=
"1"
;
...
@@ -1652,8 +1653,11 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
...
@@ -1652,8 +1653,11 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
int
elementSize
=
(
cl
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
int
elementSize
=
(
cl
.
getUseDoublePrecision
()
?
sizeof
(
double
)
:
sizeof
(
float
));
pmeGrid
=
new
OpenCLArray
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
2
*
elementSize
,
"pmeGrid"
);
pmeGrid
=
new
OpenCLArray
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
2
*
elementSize
,
"pmeGrid"
);
cl
.
addAutoclearBuffer
(
*
pmeGrid
);
pmeGrid2
=
new
OpenCLArray
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
2
*
elementSize
,
"pmeGrid2"
);
pmeGrid2
=
new
OpenCLArray
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
,
2
*
elementSize
,
"pmeGrid2"
);
if
(
cl
.
getSupports64BitGlobalAtomics
())
cl
.
addAutoclearBuffer
(
*
pmeGrid2
);
else
cl
.
addAutoclearBuffer
(
*
pmeGrid
);
pmeBsplineModuliX
=
new
OpenCLArray
(
cl
,
gridSizeX
,
elementSize
,
"pmeBsplineModuliX"
);
pmeBsplineModuliX
=
new
OpenCLArray
(
cl
,
gridSizeX
,
elementSize
,
"pmeBsplineModuliX"
);
pmeBsplineModuliY
=
new
OpenCLArray
(
cl
,
gridSizeY
,
elementSize
,
"pmeBsplineModuliY"
);
pmeBsplineModuliY
=
new
OpenCLArray
(
cl
,
gridSizeY
,
elementSize
,
"pmeBsplineModuliY"
);
pmeBsplineModuliZ
=
new
OpenCLArray
(
cl
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
pmeBsplineModuliZ
=
new
OpenCLArray
(
cl
,
gridSizeZ
,
elementSize
,
"pmeBsplineModuliZ"
);
...
@@ -1661,9 +1665,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
...
@@ -1661,9 +1665,12 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeAtomRange
=
OpenCLArray
::
create
<
cl_int
>
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomRange
=
OpenCLArray
::
create
<
cl_int
>
(
cl
,
gridSizeX
*
gridSizeY
*
gridSizeZ
+
1
,
"pmeAtomRange"
);
pmeAtomGridIndex
=
OpenCLArray
::
create
<
mm_int2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
pmeAtomGridIndex
=
OpenCLArray
::
create
<
mm_int2
>
(
cl
,
numParticles
,
"pmeAtomGridIndex"
);
sort
=
new
OpenCLSort
(
cl
,
new
SortTrait
(),
cl
.
getNumAtoms
());
sort
=
new
OpenCLSort
(
cl
,
new
SortTrait
(),
cl
.
getNumAtoms
());
fft
=
new
OpenCLFFT3D
(
cl
,
gridSizeX
,
gridSizeY
,
gridSizeZ
);
fft
=
new
OpenCLFFT3D
(
cl
,
gridSizeX
,
gridSizeY
,
gridSizeZ
,
true
);
string
vendor
=
cl
.
getDevice
().
getInfo
<
CL_DEVICE_VENDOR
>
();
string
vendor
=
cl
.
getDevice
().
getInfo
<
CL_DEVICE_VENDOR
>
();
usePmeQueue
=
(
vendor
.
size
()
>=
6
&&
vendor
.
substr
(
0
,
6
)
==
"NVIDIA"
);
bool
isNvidia
=
(
vendor
.
size
()
>=
6
&&
vendor
.
substr
(
0
,
6
)
==
"NVIDIA"
);
if
(
isNvidia
)
pmeDefines
[
"USE_ALTERNATE_MEMORY_ACCESS_PATTERN"
]
=
"1"
;
usePmeQueue
=
isNvidia
;
if
(
usePmeQueue
)
{
if
(
usePmeQueue
)
{
pmeQueue
=
cl
::
CommandQueue
(
cl
.
getContext
(),
cl
.
getDevice
());
pmeQueue
=
cl
::
CommandQueue
(
cl
.
getContext
(),
cl
.
getDevice
());
int
recipForceGroup
=
force
.
getReciprocalSpaceForceGroup
();
int
recipForceGroup
=
force
.
getReciprocalSpaceForceGroup
();
...
@@ -1800,6 +1807,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1800,6 +1807,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeZIndexKernel
=
cl
::
Kernel
(
program
,
"recordZIndex"
);
pmeZIndexKernel
=
cl
::
Kernel
(
program
,
"recordZIndex"
);
pmeSpreadChargeKernel
=
cl
::
Kernel
(
program
,
"gridSpreadCharge"
);
pmeSpreadChargeKernel
=
cl
::
Kernel
(
program
,
"gridSpreadCharge"
);
pmeConvolutionKernel
=
cl
::
Kernel
(
program
,
"reciprocalConvolution"
);
pmeConvolutionKernel
=
cl
::
Kernel
(
program
,
"reciprocalConvolution"
);
pmeEvalEnergyKernel
=
cl
::
Kernel
(
program
,
"gridEvaluateEnergy"
);
pmeInterpolateForceKernel
=
cl
::
Kernel
(
program
,
"gridInterpolateForce"
);
pmeInterpolateForceKernel
=
cl
::
Kernel
(
program
,
"gridInterpolateForce"
);
int
elementSize
=
(
cl
.
getUseDoublePrecision
()
?
sizeof
(
mm_double4
)
:
sizeof
(
mm_float4
));
int
elementSize
=
(
cl
.
getUseDoublePrecision
()
?
sizeof
(
mm_double4
)
:
sizeof
(
mm_float4
));
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeUpdateBsplinesKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
...
@@ -1814,20 +1822,28 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1814,20 +1822,28 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeAtomRange
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeAtomRange
->
getDeviceBuffer
());
if
(
cl
.
getSupports64BitGlobalAtomics
())
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeGrid2
->
getDeviceBuffer
());
else
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeGrid
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeGrid
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineTheta
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid2
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid2
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeBsplineModuliX
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineModuliX
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineModuliY
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineModuliY
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineModuliZ
->
getDeviceBuffer
());
pmeConvolutionKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineModuliZ
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid2
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getEnergyBuffer
().
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeBsplineModuliX
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
pmeBsplineModuliY
->
getDeviceBuffer
());
pmeEvalEnergyKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
pmeBsplineModuliZ
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
cl
.
getPosq
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getForceBuffers
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
cl
.
getForceBuffers
().
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeGrid
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
pmeGrid
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
pmeAtomGridIndex
->
getDeviceBuffer
());
pmeInterpolateForceKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
pmeAtomGridIndex
->
getDeviceBuffer
());
if
(
cl
.
getSupports64BitGlobalAtomics
())
{
if
(
cl
.
getSupports64BitGlobalAtomics
())
{
pmeFinishSpreadChargeKernel
=
cl
::
Kernel
(
program
,
"finishSpreadCharge"
);
pmeFinishSpreadChargeKernel
=
cl
::
Kernel
(
program
,
"finishSpreadCharge"
);
pmeFinishSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid
->
getDeviceBuffer
());
pmeFinishSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
pmeGrid2
->
getDeviceBuffer
());
pmeFinishSpreadChargeKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
pmeGrid
->
getDeviceBuffer
());
}
}
}
}
}
}
...
@@ -1851,7 +1867,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1851,7 +1867,7 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
cl
.
executeKernel
(
ewaldForcesKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
ewaldForcesKernel
,
cl
.
getNumAtoms
());
}
}
if
(
pmeGrid
!=
NULL
&&
includeReciprocal
)
{
if
(
pmeGrid
!=
NULL
&&
includeReciprocal
)
{
if
(
usePmeQueue
)
if
(
usePmeQueue
&&
!
includeEnergy
)
cl
.
setQueue
(
pmeQueue
);
cl
.
setQueue
(
pmeQueue
);
// Invert the periodic box vectors.
// Invert the periodic box vectors.
...
@@ -1926,19 +1942,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
...
@@ -1926,19 +1942,24 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
}
}
fft
->
execFFT
(
*
pmeGrid
,
*
pmeGrid2
,
true
);
fft
->
execFFT
(
*
pmeGrid
,
*
pmeGrid2
,
true
);
mm_double4
boxSize
=
cl
.
getPeriodicBoxSizeDouble
();
mm_double4
boxSize
=
cl
.
getPeriodicBoxSizeDouble
();
double
scaleFactor
=
1.0
/
(
M_PI
*
boxSize
.
x
*
boxSize
.
y
*
boxSize
.
z
);
if
(
cl
.
getUseDoublePrecision
())
{
if
(
cl
.
getUseDoublePrecision
())
{
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
5
,
recipBoxVectors
[
0
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
4
,
recipBoxVectors
[
0
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
6
,
recipBoxVectors
[
1
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
5
,
recipBoxVectors
[
1
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
7
,
recipBoxVectors
[
2
]);
pmeConvolutionKernel
.
setArg
<
mm_double4
>
(
6
,
recipBoxVectors
[
2
]);
pmeConvolutionKernel
.
setArg
<
cl_double
>
(
8
,
scaleFactor
);
pmeEvalEnergyKernel
.
setArg
<
mm_double4
>
(
5
,
recipBoxVectors
[
0
]);
pmeEvalEnergyKernel
.
setArg
<
mm_double4
>
(
6
,
recipBoxVectors
[
1
]);
pmeEvalEnergyKernel
.
setArg
<
mm_double4
>
(
7
,
recipBoxVectors
[
2
]);
}
}
else
{
else
{
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
5
,
recipBoxVectorsFloat
[
0
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
4
,
recipBoxVectorsFloat
[
0
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
6
,
recipBoxVectorsFloat
[
1
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
5
,
recipBoxVectorsFloat
[
1
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
7
,
recipBoxVectorsFloat
[
2
]);
pmeConvolutionKernel
.
setArg
<
mm_float4
>
(
6
,
recipBoxVectorsFloat
[
2
]);
pmeConvolutionKernel
.
setArg
<
cl_float
>
(
8
,
(
float
)
scaleFactor
);
pmeEvalEnergyKernel
.
setArg
<
mm_float4
>
(
5
,
recipBoxVectorsFloat
[
0
]);
}
pmeEvalEnergyKernel
.
setArg
<
mm_float4
>
(
6
,
recipBoxVectorsFloat
[
1
]);
pmeEvalEnergyKernel
.
setArg
<
mm_float4
>
(
7
,
recipBoxVectorsFloat
[
2
]);
}
if
(
includeEnergy
)
cl
.
executeKernel
(
pmeEvalEnergyKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
pmeConvolutionKernel
,
cl
.
getNumAtoms
());
cl
.
executeKernel
(
pmeConvolutionKernel
,
cl
.
getNumAtoms
());
fft
->
execFFT
(
*
pmeGrid2
,
*
pmeGrid
,
false
);
fft
->
execFFT
(
*
pmeGrid2
,
*
pmeGrid
,
false
);
setPeriodicBoxSizeArg
(
cl
,
pmeInterpolateForceKernel
,
3
);
setPeriodicBoxSizeArg
(
cl
,
pmeInterpolateForceKernel
,
3
);
...
...
platforms/opencl/src/OpenCLPlatform.cpp
View file @
b21e3182
...
@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {
...
@@ -109,7 +109,7 @@ bool OpenCLPlatform::supportsDoublePrecision() const {
bool
OpenCLPlatform
::
isPlatformSupported
()
{
bool
OpenCLPlatform
::
isPlatformSupported
()
{
// Return false for OpenCL implementations that are known
// Return false for OpenCL implementations that are known
// to be buggy (Apple OSX
since 10.7.5)
// to be buggy (Apple OS
X
prior to 10.10).
#ifdef __APPLE__
#ifdef __APPLE__
char
str
[
256
];
char
str
[
256
];
...
@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
...
@@ -122,12 +122,10 @@ bool OpenCLPlatform::isPlatformSupported() {
if
(
sscanf
(
str
,
"%d.%d.%d"
,
&
major
,
&
minor
,
&
micro
)
!=
3
)
if
(
sscanf
(
str
,
"%d.%d.%d"
,
&
major
,
&
minor
,
&
micro
)
!=
3
)
return
false
;
return
false
;
if
((
major
>
11
)
||
(
major
==
11
&&
minor
>
4
)
||
(
major
==
11
&&
minor
==
4
&&
micro
>=
2
))
if
(
major
<
14
||
(
major
==
14
&&
minor
<
3
))
// 11.4.2 is the darwin release corresponding to OSX 10.7.5, which is the
// 14.3.0 is the darwin release corresponding to OS X 10.10.3. Versions prior to that
// point at which a number of serious bugs were introduced into the
// contained a number of serious bugs in the Apple OpenCL libraries.
// Apple OpenCL libraries, resulting in catistrophically incorrect MD simulations
// (See https://github.com/SimTk/openmm/issues/395 for example.)
// (see https://github.com/SimTk/openmm/issues/395 for example). Once a fix is released,
// this version check should be updated.
return
false
;
return
false
;
#endif
#endif
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
b21e3182
...
@@ -6,7 +6,7 @@
...
@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* *
* Portions copyright (c) 2010-201
3
Stanford University and the Authors. *
* Portions copyright (c) 2010-201
5
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Authors: Peter Eastman *
* Contributors: *
* Contributors: *
* *
* *
...
@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
...
@@ -42,7 +42,6 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
replacements
[
"MIN_KEY"
]
=
trait
->
getMinKey
();
replacements
[
"MIN_KEY"
]
=
trait
->
getMinKey
();
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_KEY"
]
=
trait
->
getMaxKey
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
replacements
[
"MAX_VALUE"
]
=
trait
->
getMaxValue
();
replacements
[
"VALUE_IS_INT2"
]
=
(
trait
->
getDataType
()
==
std
::
string
(
"int2"
)
?
"1"
:
"0"
);
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
cl
::
Program
program
=
context
.
createProgram
(
context
.
replaceStrings
(
OpenCLKernelSources
::
sort
,
replacements
));
shortListKernel
=
cl
::
Kernel
(
program
,
"sortShortList"
);
shortListKernel
=
cl
::
Kernel
(
program
,
"sortShortList"
);
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
computeRangeKernel
=
cl
::
Kernel
(
program
,
"computeRange"
);
...
@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
...
@@ -59,7 +58,11 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned
int
maxRangeSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeRangeKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxRangeSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeRangeKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxPositionsSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeBucketPositionsKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxPositionsSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeBucketPositionsKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxShortListSize
=
shortListKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
());
unsigned
int
maxShortListSize
=
shortListKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
());
isShortList
=
(
length
<=
maxLocalBuffer
&&
length
<
maxShortListSize
);
// On Qualcomm's OpenCL, it's essential to check against maxShortListSize. Otherwise you get a crash.
// But AMD's OpenCL returns an inappropriately small value for it that is much shorter than the actual
// maximum, so including the check hurts performance. For the moment I'm going to just comment it out.
// If we officially support Qualcomm in the future, we'll need to do something better.
isShortList
=
(
length
<=
maxLocalBuffer
/* && length < maxShortListSize*/
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
;
;
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
);
...
...
platforms/opencl/src/kernels/customManyParticle.cl
View file @
b21e3182
...
@@ -227,7 +227,9 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
...
@@ -227,7 +227,9 @@ __kernel void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, rea
int
start
=
block2*TILE_SIZE
;
int
start
=
block2*TILE_SIZE
;
int
included[TILE_SIZE]
;
int
included[TILE_SIZE]
;
int
numIncluded
=
0
;
int
numIncluded
=
0
;
SYNC_WARPS
;
positionCache[get_local_id
(
0
)
]
=
posq[start+indexInWarp]
;
positionCache[get_local_id
(
0
)
]
=
posq[start+indexInWarp]
;
SYNC_WARPS
;
if
(
atom1
<
NUM_ATOMS
)
{
if
(
atom1
<
NUM_ATOMS
)
{
for
(
int
j
=
0
; j < 32; j++) {
for
(
int
j
=
0
; j < 32; j++) {
int
atom2
=
start+j
;
int
atom2
=
start+j
;
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment