Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
a72e364f
Commit
a72e364f
authored
Jul 10, 2014
by
peastman
Browse files
Merge pull request #516 from peastman/qc
Lots of changes to support Android
parents
0c00acd2
d7569df2
Changes
18
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
994 additions
and
541 deletions
+994
-541
CMakeLists.txt
CMakeLists.txt
+35
-15
openmmapi/include/openmm/internal/hardware.h
openmmapi/include/openmm/internal/hardware.h
+41
-31
openmmapi/include/openmm/internal/vectorize.h
openmmapi/include/openmm/internal/vectorize.h
+7
-252
openmmapi/include/openmm/internal/vectorize_neon.h
openmmapi/include/openmm/internal/vectorize_neon.h
+334
-0
openmmapi/include/openmm/internal/vectorize_sse.h
openmmapi/include/openmm/internal/vectorize_sse.h
+286
-0
openmmapi/src/OSRngSeed.cpp
openmmapi/src/OSRngSeed.cpp
+8
-6
platforms/cpu/sharedTarget/CMakeLists.txt
platforms/cpu/sharedTarget/CMakeLists.txt
+11
-7
platforms/cpu/src/CpuNeighborList.cpp
platforms/cpu/src/CpuNeighborList.cpp
+0
-1
platforms/cpu/src/CpuNonbondedForceVec4.cpp
platforms/cpu/src/CpuNonbondedForceVec4.cpp
+2
-2
platforms/cpu/src/CpuPlatform.cpp
platforms/cpu/src/CpuPlatform.cpp
+15
-9
platforms/opencl/include/OpenCLNonbondedUtilities.h
platforms/opencl/include/OpenCLNonbondedUtilities.h
+2
-1
platforms/opencl/src/OpenCLFFT3D.cpp
platforms/opencl/src/OpenCLFFT3D.cpp
+176
-165
platforms/opencl/src/OpenCLKernels.cpp
platforms/opencl/src/OpenCLKernels.cpp
+2
-2
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
+48
-35
platforms/opencl/src/OpenCLPlatform.cpp
platforms/opencl/src/OpenCLPlatform.cpp
+2
-4
platforms/opencl/src/OpenCLSort.cpp
platforms/opencl/src/OpenCLSort.cpp
+6
-3
plugins/cpupme/CMakeLists.txt
plugins/cpupme/CMakeLists.txt
+5
-1
plugins/cpupme/src/CpuPmeKernels.cpp
plugins/cpupme/src/CpuPmeKernels.cpp
+14
-7
No files found.
CMakeLists.txt
View file @
a72e364f
...
...
@@ -61,6 +61,11 @@ ELSE(WIN32)
ENDIF
(
NOT OPENMM_INSTALL_PREFIX
)
ENDIF
(
WIN32
)
# Include CPU-Features for Android
IF
(
ANDROID
)
INCLUDE_DIRECTORIES
(
${
ANDROID_NDK
}
/sources/cpufeatures
)
ENDIF
(
ANDROID
)
# It seems that on linux and mac, everything is trying to be installed in /usr/local/openmm
# But if every install target is prefixed with /openmm/, on Windows the install files
# end up in C:/Program Files/OpenMM/openmm/ which is ugly.
...
...
@@ -87,9 +92,11 @@ IF(WIN32)
SET
(
PTHREADS_LIB pthreadVC2
)
SET
(
PTHREADS_LIB_STATIC pthreadVC2_static_mt
)
ELSE
(
WIN32
)
SET
(
PTHREADS_LIB pthread
)
# in linux, even in static builds we link against the dynamic object (since its tied to libc versions)
SET
(
PTHREADS_LIB_STATIC pthread
)
IF
(
NOT ANDROID
)
SET
(
PTHREADS_LIB pthread
)
# in linux, even in static builds we link against the dynamic object (since its tied to libc versions)
SET
(
PTHREADS_LIB_STATIC pthread
)
ENDIF
(
NOT ANDROID
)
ENDIF
(
WIN32
)
# The build system will set ARCH64 for 64 bit builds, which require
...
...
@@ -121,11 +128,11 @@ IF (APPLE)
SET
(
CMAKE_INSTALL_NAME_DIR
"@rpath"
)
SET
(
EXTRA_COMPILE_FLAGS
"-msse2 -stdlib=libc++"
)
ELSE
(
APPLE
)
IF
(
MSVC
)
IF
(
MSVC
OR ANDROID
)
SET
(
EXTRA_COMPILE_FLAGS
)
ELSE
(
MSVC
)
ELSE
(
MSVC
OR ANDROID
)
SET
(
EXTRA_COMPILE_FLAGS
"-msse2"
)
ENDIF
(
MSVC
)
ENDIF
(
MSVC
OR ANDROID
)
ENDIF
(
APPLE
)
IF
(
UNIX AND NOT CMAKE_BUILD_TYPE
)
...
...
@@ -137,8 +144,13 @@ IF (NOT CMAKE_CXX_FLAGS_DEBUG)
ENDIF
(
NOT CMAKE_CXX_FLAGS_DEBUG
)
IF
(
NOT CMAKE_CXX_FLAGS_RELEASE
)
SET
(
CMAKE_CXX_FLAGS_RELEASE
"-O3 -DNDEBUG"
CACHE STRING
"To use when CMAKE_BUILD_TYPE=Release"
FORCE
)
IF
(
ANDROID
)
SET
(
CMAKE_CXX_FLAGS_RELEASE
"-mfloat-abi=softfp -march=armv7-a -mfpu=neon -funsafe-math-optimizations -O3 -DNDEBUG"
CACHE STRING
"To use when CMAKE_BUILD_TYPE=Release"
FORCE
)
ELSE
(
ANDROID
)
SET
(
CMAKE_CXX_FLAGS_RELEASE
"-O3 -DNDEBUG"
CACHE STRING
"To use when CMAKE_BUILD_TYPE=Release"
FORCE
)
ENDIF
(
ANDROID
)
ENDIF
(
NOT CMAKE_CXX_FLAGS_RELEASE
)
...
...
@@ -252,7 +264,11 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
## OpenMM was previously installed there.
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/
${
subdir
}
/include
)
ENDFOREACH
(
subdir
)
SET_SOURCE_FILES_PROPERTIES
(
${
CMAKE_SOURCE_DIR
}
/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS
"-DHAVE_SSE2=1"
)
IF
(
ANDROID
)
SET_SOURCE_FILES_PROPERTIES
(
${
CMAKE_SOURCE_DIR
}
/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS
"-UHAVE_SSE2"
)
ELSE
(
ANDROID
)
SET_SOURCE_FILES_PROPERTIES
(
${
CMAKE_SOURCE_DIR
}
/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS
"-DHAVE_SSE2=1"
)
ENDIF
(
ANDROID
)
# If API wrappers are being generated, and add them to the build.
SET
(
OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS ON CACHE BOOL
"Build wrappers for C and Fortran"
)
...
...
@@ -287,13 +303,17 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
# On Linux need to link to libdl
FIND_LIBRARY
(
DL_LIBRARY dl
)
IF
(
DL_LIBRARY
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
DL_LIBRARY
}
${
PTHREADS_LIB
}
)
IF
(
OPENMM_BUILD_STATIC_LIB
)
TARGET_LINK_LIBRARIES
(
${
STATIC_TARGET
}
${
DL_LIBRARY
}
${
PTHREADS_LIB
}
)
ENDIF
(
OPENMM_BUILD_STATIC_LIB
)
MARK_AS_ADVANCED
(
DL_LIBRARY
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
DL_LIBRARY
}
${
PTHREADS_LIB
}
)
IF
(
OPENMM_BUILD_STATIC_LIB
)
TARGET_LINK_LIBRARIES
(
${
STATIC_TARGET
}
${
DL_LIBRARY
}
${
PTHREADS_LIB
}
)
ENDIF
(
OPENMM_BUILD_STATIC_LIB
)
MARK_AS_ADVANCED
(
DL_LIBRARY
)
ELSE
(
DL_LIBRARY
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
PTHREADS_LIB
}
)
IF
(
ANDROID
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
PTHREADS_LIB
}
cpufeatures
)
ELSE
(
ANDROID
)
TARGET_LINK_LIBRARIES
(
${
SHARED_TARGET
}
${
PTHREADS_LIB
}
)
ENDIF
(
ANDROID
)
ENDIF
(
DL_LIBRARY
)
ADD_SUBDIRECTORY
(
platforms/reference/tests
)
...
...
openmmapi/include/openmm/internal/hardware.h
View file @
a72e364f
...
...
@@ -47,8 +47,12 @@
#define NOMINMAX
#include <windows.h>
#else
#include <dlfcn.h>
#include <unistd.h>
#ifdef __ANDROID__
#include <cpu-features.h>
#else
#include <dlfcn.h>
#include <unistd.h>
#endif
#endif
#endif
...
...
@@ -70,11 +74,15 @@ static int getNumProcessors() {
ncpu
=
1
;
return
ncpu
;
#else
long
nProcessorsOnline
=
sysconf
(
_SC_NPROCESSORS_ONLN
);
if
(
nProcessorsOnline
==
-
1
)
return
1
;
else
return
(
int
)
nProcessorsOnline
;
#ifdef __ANDROID__
return
android_getCpuCount
();
#else
long
nProcessorsOnline
=
sysconf
(
_SC_NPROCESSORS_ONLN
);
if
(
nProcessorsOnline
==
-
1
)
return
1
;
else
return
(
int
)
nProcessorsOnline
;
#endif
#endif
#endif
}
...
...
@@ -85,30 +93,32 @@ static int getNumProcessors() {
#ifdef _WIN32
#define cpuid __cpuid
#else
static
void
cpuid
(
int
cpuInfo
[
4
],
int
infoType
){
#ifdef __LP64__
__asm__
__volatile__
(
"cpuid"
:
"=a"
(
cpuInfo
[
0
]),
"=b"
(
cpuInfo
[
1
]),
"=c"
(
cpuInfo
[
2
]),
"=d"
(
cpuInfo
[
3
])
:
"a"
(
infoType
)
);
#else
__asm__
__volatile__
(
"pushl %%ebx
\n
"
"cpuid
\n
"
"movl %%ebx, %1
\n
"
"popl %%ebx
\n
"
:
"=a"
(
cpuInfo
[
0
]),
"=r"
(
cpuInfo
[
1
]),
"=c"
(
cpuInfo
[
2
]),
"=d"
(
cpuInfo
[
3
])
:
"a"
(
infoType
)
);
#endif
}
#ifndef __ANDROID__
static
void
cpuid
(
int
cpuInfo
[
4
],
int
infoType
){
#ifdef __LP64__
__asm__
__volatile__
(
"cpuid"
:
"=a"
(
cpuInfo
[
0
]),
"=b"
(
cpuInfo
[
1
]),
"=c"
(
cpuInfo
[
2
]),
"=d"
(
cpuInfo
[
3
])
:
"a"
(
infoType
)
);
#else
__asm__
__volatile__
(
"pushl %%ebx
\n
"
"cpuid
\n
"
"movl %%ebx, %1
\n
"
"popl %%ebx
\n
"
:
"=a"
(
cpuInfo
[
0
]),
"=r"
(
cpuInfo
[
1
]),
"=c"
(
cpuInfo
[
2
]),
"=d"
(
cpuInfo
[
3
])
:
"a"
(
infoType
)
);
#endif
}
#endif
#endif
#endif // OPENMM_HARDWARE_H_
openmmapi/include/openmm/internal/vectorize.h
View file @
a72e364f
...
...
@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 201
3
Stanford University and the Authors. *
* Portions copyright (c) 201
4
Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
...
...
@@ -31,256 +31,11 @@
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <smmintrin.h>
// This file defines classes and functions to simplify vectorizing code with SSE.
class
ivec4
;
/**
* A four element vector of floats.
*/
class
fvec4
{
public:
__m128
val
;
fvec4
()
{}
fvec4
(
float
v
)
:
val
(
_mm_set1_ps
(
v
))
{}
fvec4
(
float
v1
,
float
v2
,
float
v3
,
float
v4
)
:
val
(
_mm_set_ps
(
v4
,
v3
,
v2
,
v1
))
{}
fvec4
(
__m128
v
)
:
val
(
v
)
{}
fvec4
(
const
float
*
v
)
:
val
(
_mm_loadu_ps
(
v
))
{}
operator
__m128
()
const
{
return
val
;
}
float
operator
[](
int
i
)
const
{
float
result
[
4
];
store
(
result
);
return
result
[
i
];
}
void
store
(
float
*
v
)
const
{
_mm_storeu_ps
(
v
,
val
);
}
fvec4
operator
+
(
const
fvec4
&
other
)
const
{
return
_mm_add_ps
(
val
,
other
);
}
fvec4
operator
-
(
const
fvec4
&
other
)
const
{
return
_mm_sub_ps
(
val
,
other
);
}
fvec4
operator
*
(
const
fvec4
&
other
)
const
{
return
_mm_mul_ps
(
val
,
other
);
}
fvec4
operator
/
(
const
fvec4
&
other
)
const
{
return
_mm_div_ps
(
val
,
other
);
}
void
operator
+=
(
const
fvec4
&
other
)
{
val
=
_mm_add_ps
(
val
,
other
);
}
void
operator
-=
(
const
fvec4
&
other
)
{
val
=
_mm_sub_ps
(
val
,
other
);
}
void
operator
*=
(
const
fvec4
&
other
)
{
val
=
_mm_mul_ps
(
val
,
other
);
}
void
operator
/=
(
const
fvec4
&
other
)
{
val
=
_mm_div_ps
(
val
,
other
);
}
fvec4
operator
-
()
const
{
return
_mm_sub_ps
(
_mm_set1_ps
(
0.0
f
),
val
);
}
fvec4
operator
&
(
const
fvec4
&
other
)
const
{
return
_mm_and_ps
(
val
,
other
);
}
fvec4
operator
|
(
const
fvec4
&
other
)
const
{
return
_mm_or_ps
(
val
,
other
);
}
fvec4
operator
==
(
const
fvec4
&
other
)
const
{
return
_mm_cmpeq_ps
(
val
,
other
);
}
fvec4
operator
!=
(
const
fvec4
&
other
)
const
{
return
_mm_cmpneq_ps
(
val
,
other
);
}
fvec4
operator
>
(
const
fvec4
&
other
)
const
{
return
_mm_cmpgt_ps
(
val
,
other
);
}
fvec4
operator
<
(
const
fvec4
&
other
)
const
{
return
_mm_cmplt_ps
(
val
,
other
);
}
fvec4
operator
>=
(
const
fvec4
&
other
)
const
{
return
_mm_cmpge_ps
(
val
,
other
);
}
fvec4
operator
<=
(
const
fvec4
&
other
)
const
{
return
_mm_cmple_ps
(
val
,
other
);
}
operator
ivec4
()
const
;
};
/**
* A four element vector of ints.
*/
class
ivec4
{
public:
__m128i
val
;
ivec4
()
{}
ivec4
(
int
v
)
:
val
(
_mm_set1_epi32
(
v
))
{}
ivec4
(
int
v1
,
int
v2
,
int
v3
,
int
v4
)
:
val
(
_mm_set_epi32
(
v4
,
v3
,
v2
,
v1
))
{}
ivec4
(
__m128i
v
)
:
val
(
v
)
{}
ivec4
(
const
int
*
v
)
:
val
(
_mm_loadu_si128
((
const
__m128i
*
)
v
))
{}
operator
__m128i
()
const
{
return
val
;
}
int
operator
[](
int
i
)
const
{
int
result
[
4
];
store
(
result
);
return
result
[
i
];
}
void
store
(
int
*
v
)
const
{
_mm_storeu_si128
((
__m128i
*
)
v
,
val
);
}
ivec4
operator
+
(
const
ivec4
&
other
)
const
{
return
_mm_add_epi32
(
val
,
other
);
}
ivec4
operator
-
(
const
ivec4
&
other
)
const
{
return
_mm_sub_epi32
(
val
,
other
);
}
ivec4
operator
*
(
const
ivec4
&
other
)
const
{
return
_mm_mul_epi32
(
val
,
other
);
}
void
operator
+=
(
const
ivec4
&
other
)
{
val
=
_mm_add_epi32
(
val
,
other
);
}
void
operator
-=
(
const
ivec4
&
other
)
{
val
=
_mm_sub_epi32
(
val
,
other
);
}
void
operator
*=
(
const
ivec4
&
other
)
{
val
=
_mm_mul_epi32
(
val
,
other
);
}
ivec4
operator
-
()
const
{
return
_mm_sub_epi32
(
_mm_set1_epi32
(
0
),
val
);
}
ivec4
operator
&
(
const
ivec4
&
other
)
const
{
return
_mm_and_si128
(
val
,
other
);
}
ivec4
operator
|
(
const
ivec4
&
other
)
const
{
return
_mm_or_si128
(
val
,
other
);
}
ivec4
operator
==
(
const
ivec4
&
other
)
const
{
return
_mm_cmpeq_epi32
(
val
,
other
);
}
ivec4
operator
!=
(
const
ivec4
&
other
)
const
{
return
_mm_xor_si128
(
*
this
==
other
,
_mm_set1_epi32
(
0xFFFFFFFF
));
}
ivec4
operator
>
(
const
ivec4
&
other
)
const
{
return
_mm_cmpgt_epi32
(
val
,
other
);
}
ivec4
operator
<
(
const
ivec4
&
other
)
const
{
return
_mm_cmplt_epi32
(
val
,
other
);
}
ivec4
operator
>=
(
const
ivec4
&
other
)
const
{
return
_mm_xor_si128
(
_mm_cmplt_epi32
(
val
,
other
),
_mm_set1_epi32
(
0xFFFFFFFF
));
}
ivec4
operator
<=
(
const
ivec4
&
other
)
const
{
return
_mm_xor_si128
(
_mm_cmpgt_epi32
(
val
,
other
),
_mm_set1_epi32
(
0xFFFFFFFF
));
}
operator
fvec4
()
const
;
};
// Conversion operators.
inline
fvec4
::
operator
ivec4
()
const
{
return
_mm_cvttps_epi32
(
val
);
}
inline
ivec4
::
operator
fvec4
()
const
{
return
_mm_cvtepi32_ps
(
val
);
}
// Functions that operate on fvec4s.
static
inline
fvec4
floor
(
const
fvec4
&
v
)
{
return
fvec4
(
_mm_floor_ps
(
v
.
val
));
}
static
inline
fvec4
ceil
(
const
fvec4
&
v
)
{
return
fvec4
(
_mm_ceil_ps
(
v
.
val
));
}
static
inline
fvec4
round
(
const
fvec4
&
v
)
{
return
fvec4
(
_mm_round_ps
(
v
.
val
,
_MM_FROUND_TO_NEAREST_INT
));
}
static
inline
fvec4
min
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
_mm_min_ps
(
v1
.
val
,
v2
.
val
));
}
static
inline
fvec4
max
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
_mm_max_ps
(
v1
.
val
,
v2
.
val
));
}
static
inline
fvec4
abs
(
const
fvec4
&
v
)
{
static
const
__m128
mask
=
_mm_castsi128_ps
(
_mm_set1_epi32
(
0x7FFFFFFF
));
return
fvec4
(
_mm_and_ps
(
v
.
val
,
mask
));
}
static
inline
fvec4
sqrt
(
const
fvec4
&
v
)
{
return
fvec4
(
_mm_sqrt_ps
(
v
.
val
));
}
static
inline
float
dot3
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0x71
));
}
static
inline
float
dot4
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0xF1
));
}
static
inline
void
transpose
(
fvec4
&
v1
,
fvec4
&
v2
,
fvec4
&
v3
,
fvec4
&
v4
)
{
_MM_TRANSPOSE4_PS
(
v1
,
v2
,
v3
,
v4
);
}
// Functions that operate on ivec4s.
static
inline
ivec4
min
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
return
ivec4
(
_mm_min_epi32
(
v1
.
val
,
v2
.
val
));
}
static
inline
ivec4
max
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
return
ivec4
(
_mm_max_epi32
(
v1
.
val
,
v2
.
val
));
}
static
inline
ivec4
abs
(
const
ivec4
&
v
)
{
return
ivec4
(
_mm_abs_epi32
(
v
.
val
));
}
static
inline
bool
any
(
const
ivec4
&
v
)
{
return
!
_mm_test_all_zeros
(
v
,
_mm_set1_epi32
(
0xFFFFFFFF
));
}
// Mathematical operators involving a scalar and a vector.
static
inline
fvec4
operator
+
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
+
v2
;
}
static
inline
fvec4
operator
-
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
-
v2
;
}
static
inline
fvec4
operator
*
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
*
v2
;
}
static
inline
fvec4
operator
/
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
/
v2
;
}
// Operations for blending fvec4s based on an ivec4.
static
inline
fvec4
blend
(
const
fvec4
&
v1
,
const
fvec4
&
v2
,
const
ivec4
&
mask
)
{
return
fvec4
(
_mm_blendv_ps
(
v1
.
val
,
v2
.
val
,
_mm_castsi128_ps
(
mask
.
val
)));
}
#if defined(__ANDROID__)
#include "vectorize_neon.h"
#else
#include "vectorize_sse.h"
#endif
#endif
/*OPENMM_VECTORIZE_H_*/
openmmapi/include/openmm/internal/vectorize_neon.h
0 → 100644
View file @
a72e364f
#ifndef OPENMM_VECTORIZE_NEON_H_
#define OPENMM_VECTORIZE_NEON_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013-2014 Stanford University and the Authors. *
* Authors: Mateus Lima, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <cpu-features.h>
#include <arm_neon.h>
#include <cmath>
typedef
int
int32_t
;
// This file defines classes and functions to simplify vectorizing code with NEON.
class
ivec4
;
/**
* A four element vector of floats.
*/
class
fvec4
{
public:
float32x4_t
val
;
fvec4
()
{}
fvec4
(
float
v
)
:
val
(
vdupq_n_f32
(
v
))
{}
fvec4
(
float
v1
,
float
v2
,
float
v3
,
float
v4
)
{
float
v
[]
=
{
v1
,
v2
,
v3
,
v4
};
val
=
vld1q_f32
(
v
);
}
fvec4
(
float32x4_t
v
)
:
val
(
v
)
{}
fvec4
(
const
float
*
v
)
:
val
(
vld1q_f32
(
v
))
{}
operator
float32x4_t
()
const
{
return
val
;
}
float
operator
[](
int
i
)
const
{
switch
(
i
)
{
case
0
:
return
vgetq_lane_f32
(
val
,
0
);
case
1
:
return
vgetq_lane_f32
(
val
,
1
);
case
2
:
return
vgetq_lane_f32
(
val
,
2
);
case
3
:
return
vgetq_lane_f32
(
val
,
3
);
}
return
0.0
f
;
}
void
store
(
float
*
v
)
const
{
vst1q_f32
(
v
,
val
);
}
fvec4
operator
+
(
const
fvec4
&
other
)
const
{
return
vaddq_f32
(
val
,
other
);
}
fvec4
operator
-
(
const
fvec4
&
other
)
const
{
return
vsubq_f32
(
val
,
other
);
}
fvec4
operator
*
(
const
fvec4
&
other
)
const
{
return
vmulq_f32
(
val
,
other
);
}
fvec4
operator
/
(
const
fvec4
&
other
)
const
{
// NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
float32x4_t
reciprocal
=
vrecpeq_f32
(
other
);
reciprocal
=
vmulq_f32
(
vrecpsq_f32
(
other
,
reciprocal
),
reciprocal
);
reciprocal
=
vmulq_f32
(
vrecpsq_f32
(
other
,
reciprocal
),
reciprocal
);
fvec4
result
=
vmulq_f32
(
val
,
reciprocal
);
return
result
;
}
void
operator
+=
(
const
fvec4
&
other
)
{
val
=
vaddq_f32
(
val
,
other
);
}
void
operator
-=
(
const
fvec4
&
other
)
{
val
=
vsubq_f32
(
val
,
other
);
}
void
operator
*=
(
const
fvec4
&
other
)
{
val
=
vmulq_f32
(
val
,
other
);
}
void
operator
/=
(
const
fvec4
&
other
)
{
val
=
*
this
/
other
;
}
fvec4
operator
-
()
const
{
return
vnegq_f32
(
val
);
}
fvec4
operator
&
(
const
fvec4
&
other
)
const
{
return
vreinterpretq_f32_u32
(
vandq_u32
(
vreinterpretq_u32_f32
(
val
),
vreinterpretq_u32_f32
(
other
)));
}
fvec4
operator
|
(
const
fvec4
&
other
)
const
{
return
vreinterpretq_f32_u32
(
vorrq_u32
(
vreinterpretq_u32_f32
(
val
),
vreinterpretq_u32_f32
(
other
)));
}
fvec4
operator
==
(
const
fvec4
&
other
)
const
{
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vceqq_f32
(
val
,
other
)));
}
fvec4
operator
!=
(
const
fvec4
&
other
)
const
{
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vmvnq_u32
(
vceqq_f32
(
val
,
other
))));
// not(equals(val, other))
}
fvec4
operator
>
(
const
fvec4
&
other
)
const
{
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vcgtq_f32
(
val
,
other
)));
}
fvec4
operator
<
(
const
fvec4
&
other
)
const
{
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vcltq_f32
(
val
,
other
)));
}
fvec4
operator
>=
(
const
fvec4
&
other
)
const
{
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vcgeq_f32
(
val
,
other
)));
}
fvec4
operator
<=
(
const
fvec4
&
other
)
const
{
return
vcvtq_f32_s32
(
vreinterpretq_s32_u32
(
vcleq_f32
(
val
,
other
)));
}
operator
ivec4
()
const
;
};
/**
* A four element vector of ints.
*/
class
ivec4
{
public:
int32x4_t
val
;
ivec4
()
{}
ivec4
(
int
v
)
:
val
(
vdupq_n_s32
(
v
))
{}
ivec4
(
int
v1
,
int
v2
,
int
v3
,
int
v4
)
{
int
v
[]
=
{
v1
,
v2
,
v3
,
v4
};
val
=
vld1q_s32
(
v
);
}
ivec4
(
int32x4_t
v
)
:
val
(
v
)
{}
ivec4
(
const
int
*
v
)
:
val
(
vld1q_s32
(
v
))
{}
operator
int32x4_t
()
const
{
return
val
;
}
int
operator
[](
int
i
)
const
{
switch
(
i
)
{
case
0
:
return
vgetq_lane_s32
(
val
,
0
);
case
1
:
return
vgetq_lane_s32
(
val
,
1
);
case
2
:
return
vgetq_lane_s32
(
val
,
2
);
case
3
:
return
vgetq_lane_s32
(
val
,
3
);
}
return
0
;
}
void
store
(
int
*
v
)
const
{
vst1q_s32
(
v
,
val
);
}
ivec4
operator
+
(
const
ivec4
&
other
)
const
{
return
vaddq_s32
(
val
,
other
);
}
ivec4
operator
-
(
const
ivec4
&
other
)
const
{
return
vsubq_s32
(
val
,
other
);
}
ivec4
operator
*
(
const
ivec4
&
other
)
const
{
return
vmulq_s32
(
val
,
other
);
}
void
operator
+=
(
const
ivec4
&
other
)
{
val
=
vaddq_s32
(
val
,
other
);
}
void
operator
-=
(
const
ivec4
&
other
)
{
val
=
vsubq_s32
(
val
,
other
);
}
void
operator
*=
(
const
ivec4
&
other
)
{
val
=
vmulq_s32
(
val
,
other
);
}
ivec4
operator
-
()
const
{
return
vnegq_s32
(
val
);
}
ivec4
operator
&
(
const
ivec4
&
other
)
const
{
return
vandq_s32
(
val
,
other
);
}
ivec4
operator
|
(
const
ivec4
&
other
)
const
{
return
vorrq_s32
(
val
,
other
);
}
ivec4
operator
==
(
const
ivec4
&
other
)
const
{
return
vreinterpretq_s32_u32
(
vceqq_s32
(
val
,
other
));
}
ivec4
operator
!=
(
const
ivec4
&
other
)
const
{
return
vreinterpretq_s32_u32
(
vmvnq_u32
(
vceqq_s32
(
val
,
other
)));
// not(equal(val, other))
}
ivec4
operator
>
(
const
ivec4
&
other
)
const
{
return
vreinterpretq_s32_u32
(
vcgtq_s32
(
val
,
other
));
}
ivec4
operator
<
(
const
ivec4
&
other
)
const
{
return
vreinterpretq_s32_u32
(
vcltq_s32
(
val
,
other
));
}
ivec4
operator
>=
(
const
ivec4
&
other
)
const
{
return
vreinterpretq_s32_u32
(
vcgeq_s32
(
val
,
other
));
}
ivec4
operator
<=
(
const
ivec4
&
other
)
const
{
return
vreinterpretq_s32_u32
(
vcleq_s32
(
val
,
other
));
}
operator
fvec4
()
const
;
};
// Conversion operators.
inline
fvec4
::
operator
ivec4
()
const
{
return
ivec4
(
vcvtq_s32_f32
(
val
));
}
inline
ivec4
::
operator
fvec4
()
const
{
return
fvec4
(
vcvtq_f32_s32
(
val
));
}
// Functions that operate on fvec4s.
static
inline
fvec4
min
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
return
vminq_f32
(
v1
,
v2
);
}
static
inline
fvec4
max
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
return
vmaxq_f32
(
v1
,
v2
);
}
static
inline
fvec4
abs
(
const
fvec4
&
v
)
{
return
vabsq_f32
(
v
);
}
static
inline
fvec4
sqrt
(
const
fvec4
&
v
)
{
float32x4_t
recipSqrt
=
vrsqrteq_f32
(
v
);
recipSqrt
=
vmulq_f32
(
recipSqrt
,
vrsqrtsq_f32
(
vmulq_f32
(
recipSqrt
,
v
),
recipSqrt
));
recipSqrt
=
vmulq_f32
(
recipSqrt
,
vrsqrtsq_f32
(
vmulq_f32
(
recipSqrt
,
v
),
recipSqrt
));
return
vmulq_f32
(
v
,
recipSqrt
);
}
static
inline
float
dot3
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
fvec4
result
=
v1
*
v2
;
return
vgetq_lane_f32
(
result
,
0
)
+
vgetq_lane_f32
(
result
,
1
)
+
vgetq_lane_f32
(
result
,
2
);
}
static
inline
float
dot4
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
fvec4
result
=
v1
*
v2
;
return
vgetq_lane_f32
(
result
,
0
)
+
vgetq_lane_f32
(
result
,
1
)
+
vgetq_lane_f32
(
result
,
2
)
+
vgetq_lane_f32
(
result
,
3
);
}
static
inline
void
transpose
(
fvec4
&
v1
,
fvec4
&
v2
,
fvec4
&
v3
,
fvec4
&
v4
)
{
float32x4x2_t
t1
=
vuzpq_f32
(
v1
,
v3
);
float32x4x2_t
t2
=
vuzpq_f32
(
v2
,
v4
);
float32x4x2_t
t3
=
vtrnq_f32
(
t1
.
val
[
0
],
t2
.
val
[
0
]);
float32x4x2_t
t4
=
vtrnq_f32
(
t1
.
val
[
1
],
t2
.
val
[
1
]);
v1
=
t3
.
val
[
0
];
v2
=
t4
.
val
[
0
];
v3
=
t3
.
val
[
1
];
v4
=
t4
.
val
[
1
];
}
// Functions that operate on ivec4s.
static
inline
ivec4
min
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
return
vminq_s32
(
v1
,
v2
);
}
static
inline
ivec4
max
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
return
vmaxq_s32
(
v1
,
v2
);
}
static
inline
ivec4
abs
(
const
ivec4
&
v
)
{
return
vabdq_s32
(
v
,
ivec4
(
0
));
}
static
inline
bool
any
(
const
ivec4
&
v
)
{
return
(
vgetq_lane_s32
(
v
,
0
)
!=
0
||
vgetq_lane_s32
(
v
,
1
)
!=
0
||
vgetq_lane_s32
(
v
,
2
)
!=
0
||
vgetq_lane_s32
(
v
,
3
)
!=
0
);
}
// Mathematical operators involving a scalar and a vector.
static
inline
fvec4
operator
+
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
+
v2
;
}
static
inline
fvec4
operator
-
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
-
v2
;
}
static
inline
fvec4
operator
*
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
*
v2
;
}
static
inline
fvec4
operator
/
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
/
v2
;
}
// Operations for blending fvec4s based on an ivec4.
static
inline
fvec4
blend
(
const
fvec4
&
v1
,
const
fvec4
&
v2
,
const
ivec4
&
mask
)
{
return
vbslq_f32
(
vreinterpretq_u32_s32
(
mask
),
v2
,
v1
);
}
// These are at the end since they involve other functions defined above.
static
inline
fvec4
round
(
const
fvec4
&
v
)
{
fvec4
shift
(
0x1
.0
p23f
);
fvec4
absResult
=
(
abs
(
v
)
+
shift
)
-
shift
;
return
blend
(
v
,
absResult
,
ivec4
(
0x7FFFFFFF
));
}
static
inline
fvec4
floor
(
const
fvec4
&
v
)
{
fvec4
rounded
=
round
(
v
);
return
rounded
+
blend
(
0.0
f
,
-
1.0
f
,
rounded
>
v
);
}
static
inline
fvec4
ceil
(
const
fvec4
&
v
)
{
fvec4
rounded
=
round
(
v
);
return
rounded
+
blend
(
0.0
f
,
1.0
f
,
rounded
<
v
);
}
#endif
/*OPENMM_VECTORIZE_NEON_H_*/
openmmapi/include/openmm/internal/vectorize_sse.h
0 → 100644
View file @
a72e364f
#ifndef OPENMM_VECTORIZE_SSE_H_
#define OPENMM_VECTORIZE_SSE_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <smmintrin.h>
// This file defines classes and functions to simplify vectorizing code with SSE.
class
ivec4
;
/**
* A four element vector of floats.
*/
class
fvec4
{
public:
__m128
val
;
fvec4
()
{}
fvec4
(
float
v
)
:
val
(
_mm_set1_ps
(
v
))
{}
fvec4
(
float
v1
,
float
v2
,
float
v3
,
float
v4
)
:
val
(
_mm_set_ps
(
v4
,
v3
,
v2
,
v1
))
{}
fvec4
(
__m128
v
)
:
val
(
v
)
{}
fvec4
(
const
float
*
v
)
:
val
(
_mm_loadu_ps
(
v
))
{}
operator
__m128
()
const
{
return
val
;
}
float
operator
[](
int
i
)
const
{
float
result
[
4
];
store
(
result
);
return
result
[
i
];
}
void
store
(
float
*
v
)
const
{
_mm_storeu_ps
(
v
,
val
);
}
fvec4
operator
+
(
const
fvec4
&
other
)
const
{
return
_mm_add_ps
(
val
,
other
);
}
fvec4
operator
-
(
const
fvec4
&
other
)
const
{
return
_mm_sub_ps
(
val
,
other
);
}
fvec4
operator
*
(
const
fvec4
&
other
)
const
{
return
_mm_mul_ps
(
val
,
other
);
}
fvec4
operator
/
(
const
fvec4
&
other
)
const
{
return
_mm_div_ps
(
val
,
other
);
}
void
operator
+=
(
const
fvec4
&
other
)
{
val
=
_mm_add_ps
(
val
,
other
);
}
void
operator
-=
(
const
fvec4
&
other
)
{
val
=
_mm_sub_ps
(
val
,
other
);
}
void
operator
*=
(
const
fvec4
&
other
)
{
val
=
_mm_mul_ps
(
val
,
other
);
}
void
operator
/=
(
const
fvec4
&
other
)
{
val
=
_mm_div_ps
(
val
,
other
);
}
fvec4
operator
-
()
const
{
return
_mm_sub_ps
(
_mm_set1_ps
(
0.0
f
),
val
);
}
fvec4
operator
&
(
const
fvec4
&
other
)
const
{
return
_mm_and_ps
(
val
,
other
);
}
fvec4
operator
|
(
const
fvec4
&
other
)
const
{
return
_mm_or_ps
(
val
,
other
);
}
fvec4
operator
==
(
const
fvec4
&
other
)
const
{
return
_mm_cmpeq_ps
(
val
,
other
);
}
fvec4
operator
!=
(
const
fvec4
&
other
)
const
{
return
_mm_cmpneq_ps
(
val
,
other
);
}
fvec4
operator
>
(
const
fvec4
&
other
)
const
{
return
_mm_cmpgt_ps
(
val
,
other
);
}
fvec4
operator
<
(
const
fvec4
&
other
)
const
{
return
_mm_cmplt_ps
(
val
,
other
);
}
fvec4
operator
>=
(
const
fvec4
&
other
)
const
{
return
_mm_cmpge_ps
(
val
,
other
);
}
fvec4
operator
<=
(
const
fvec4
&
other
)
const
{
return
_mm_cmple_ps
(
val
,
other
);
}
operator
ivec4
()
const
;
};
/**
* A four element vector of ints.
*/
class
ivec4
{
public:
__m128i
val
;
ivec4
()
{}
ivec4
(
int
v
)
:
val
(
_mm_set1_epi32
(
v
))
{}
ivec4
(
int
v1
,
int
v2
,
int
v3
,
int
v4
)
:
val
(
_mm_set_epi32
(
v4
,
v3
,
v2
,
v1
))
{}
ivec4
(
__m128i
v
)
:
val
(
v
)
{}
ivec4
(
const
int
*
v
)
:
val
(
_mm_loadu_si128
((
const
__m128i
*
)
v
))
{}
operator
__m128i
()
const
{
return
val
;
}
int
operator
[](
int
i
)
const
{
int
result
[
4
];
store
(
result
);
return
result
[
i
];
}
void
store
(
int
*
v
)
const
{
_mm_storeu_si128
((
__m128i
*
)
v
,
val
);
}
ivec4
operator
+
(
const
ivec4
&
other
)
const
{
return
_mm_add_epi32
(
val
,
other
);
}
ivec4
operator
-
(
const
ivec4
&
other
)
const
{
return
_mm_sub_epi32
(
val
,
other
);
}
ivec4
operator
*
(
const
ivec4
&
other
)
const
{
return
_mm_mul_epi32
(
val
,
other
);
}
void
operator
+=
(
const
ivec4
&
other
)
{
val
=
_mm_add_epi32
(
val
,
other
);
}
void
operator
-=
(
const
ivec4
&
other
)
{
val
=
_mm_sub_epi32
(
val
,
other
);
}
void
operator
*=
(
const
ivec4
&
other
)
{
val
=
_mm_mul_epi32
(
val
,
other
);
}
ivec4
operator
-
()
const
{
return
_mm_sub_epi32
(
_mm_set1_epi32
(
0
),
val
);
}
ivec4
operator
&
(
const
ivec4
&
other
)
const
{
return
_mm_and_si128
(
val
,
other
);
}
ivec4
operator
|
(
const
ivec4
&
other
)
const
{
return
_mm_or_si128
(
val
,
other
);
}
ivec4
operator
==
(
const
ivec4
&
other
)
const
{
return
_mm_cmpeq_epi32
(
val
,
other
);
}
ivec4
operator
!=
(
const
ivec4
&
other
)
const
{
return
_mm_xor_si128
(
*
this
==
other
,
_mm_set1_epi32
(
0xFFFFFFFF
));
}
ivec4
operator
>
(
const
ivec4
&
other
)
const
{
return
_mm_cmpgt_epi32
(
val
,
other
);
}
ivec4
operator
<
(
const
ivec4
&
other
)
const
{
return
_mm_cmplt_epi32
(
val
,
other
);
}
ivec4
operator
>=
(
const
ivec4
&
other
)
const
{
return
_mm_xor_si128
(
_mm_cmplt_epi32
(
val
,
other
),
_mm_set1_epi32
(
0xFFFFFFFF
));
}
ivec4
operator
<=
(
const
ivec4
&
other
)
const
{
return
_mm_xor_si128
(
_mm_cmpgt_epi32
(
val
,
other
),
_mm_set1_epi32
(
0xFFFFFFFF
));
}
operator
fvec4
()
const
;
};
// Conversion operators.
inline
fvec4
::
operator
ivec4
()
const
{
return
_mm_cvttps_epi32
(
val
);
}
inline
ivec4
::
operator
fvec4
()
const
{
return
_mm_cvtepi32_ps
(
val
);
}
// Functions that operate on fvec4s.
static
inline
fvec4
floor
(
const
fvec4
&
v
)
{
return
fvec4
(
_mm_floor_ps
(
v
.
val
));
}
static
inline
fvec4
ceil
(
const
fvec4
&
v
)
{
return
fvec4
(
_mm_ceil_ps
(
v
.
val
));
}
static
inline
fvec4
round
(
const
fvec4
&
v
)
{
return
fvec4
(
_mm_round_ps
(
v
.
val
,
_MM_FROUND_TO_NEAREST_INT
));
}
static
inline
fvec4
min
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
_mm_min_ps
(
v1
.
val
,
v2
.
val
));
}
static
inline
fvec4
max
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
_mm_max_ps
(
v1
.
val
,
v2
.
val
));
}
static
inline
fvec4
abs
(
const
fvec4
&
v
)
{
static
const
__m128
mask
=
_mm_castsi128_ps
(
_mm_set1_epi32
(
0x7FFFFFFF
));
return
fvec4
(
_mm_and_ps
(
v
.
val
,
mask
));
}
static
inline
fvec4
sqrt
(
const
fvec4
&
v
)
{
return
fvec4
(
_mm_sqrt_ps
(
v
.
val
));
}
static
inline
float
dot3
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0x71
));
}
static
inline
float
dot4
(
const
fvec4
&
v1
,
const
fvec4
&
v2
)
{
return
_mm_cvtss_f32
(
_mm_dp_ps
(
v1
,
v2
,
0xF1
));
}
static
inline
void
transpose
(
fvec4
&
v1
,
fvec4
&
v2
,
fvec4
&
v3
,
fvec4
&
v4
)
{
_MM_TRANSPOSE4_PS
(
v1
,
v2
,
v3
,
v4
);
}
// Functions that operate on ivec4s.
static
inline
ivec4
min
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
return
ivec4
(
_mm_min_epi32
(
v1
.
val
,
v2
.
val
));
}
static
inline
ivec4
max
(
const
ivec4
&
v1
,
const
ivec4
&
v2
)
{
return
ivec4
(
_mm_max_epi32
(
v1
.
val
,
v2
.
val
));
}
static
inline
ivec4
abs
(
const
ivec4
&
v
)
{
return
ivec4
(
_mm_abs_epi32
(
v
.
val
));
}
static
inline
bool
any
(
const
ivec4
&
v
)
{
return
!
_mm_test_all_zeros
(
v
,
_mm_set1_epi32
(
0xFFFFFFFF
));
}
// Mathematical operators involving a scalar and a vector.
static
inline
fvec4
operator
+
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
+
v2
;
}
static
inline
fvec4
operator
-
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
-
v2
;
}
static
inline
fvec4
operator
*
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
*
v2
;
}
static
inline
fvec4
operator
/
(
float
v1
,
const
fvec4
&
v2
)
{
return
fvec4
(
v1
)
/
v2
;
}
// Operations for blending fvec4s based on an ivec4.
static
inline
fvec4
blend
(
const
fvec4
&
v1
,
const
fvec4
&
v2
,
const
ivec4
&
mask
)
{
return
fvec4
(
_mm_blendv_ps
(
v1
.
val
,
v2
.
val
,
_mm_castsi128_ps
(
mask
.
val
)));
}
#endif
/*OPENMM_VECTORIZE_SSE_H_*/
openmmapi/src/OSRngSeed.cpp
View file @
a72e364f
...
...
@@ -29,7 +29,6 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdexcept>
#if defined(_WIN32) || defined(__CYGWIN__)
#include <windows.h>
static
HCRYPTPROV
hCryptProv
=
0
;
...
...
@@ -38,28 +37,31 @@ static HCRYPTPROV hCryptProv = 0;
#include <fcntl.h>
#include <unistd.h>
#endif
#include "openmm/OpenMMException.h"
#include "openmm/internal/OSRngSeed.h"
using
OpenMM
::
OpenMMException
;
int
osrngseed
(
void
)
{
int
value
;
#if defined(_WIN32) || defined(__CYGWIN__)
if
(
!::
CryptAcquireContextW
(
&
hCryptProv
,
0
,
0
,
PROV_RSA_FULL
,
CRYPT_VERIFYCONTEXT
|
CRYPT_SILENT
))
{
throw
std
::
runtime_error
(
"Failed to initialize Windows random API (CryptoGen)"
);
throw
OpenMMException
(
"Failed to initialize Windows random API (CryptoGen)"
);
}
if
(
!
CryptGenRandom
(
hCryptProv
,
sizeof
(
int
),
(
BYTE
*
)
&
value
))
{
::
CryptReleaseContext
(
hCryptProv
,
0
);
throw
std
::
runtime_error
(
"Failed to get random numbers"
);
throw
OpenMMException
(
"Failed to get random numbers"
);
}
if
(
!::
CryptReleaseContext
(
hCryptProv
,
0
))
{
throw
std
::
runtime_error
(
"Failed to release Windows random API context"
);
throw
OpenMMException
(
"Failed to release Windows random API context"
);
}
#else
int
m_fd
=
open
(
"/dev/urandom"
,
O_RDONLY
);
if
(
m_fd
==
-
1
)
{
throw
std
::
runtime_error
(
"Failed to open /dev/urandom"
);
throw
OpenMMException
(
"Failed to open /dev/urandom"
);
}
if
(
read
(
m_fd
,
&
value
,
sizeof
(
int
))
!=
sizeof
(
int
))
{
throw
std
::
runtime_error
(
"Failed to read bytes from /dev/urandom"
);
throw
OpenMMException
(
"Failed to read bytes from /dev/urandom"
);
}
close
(
m_fd
);
#endif
...
...
platforms/cpu/sharedTarget/CMakeLists.txt
View file @
a72e364f
FOREACH
(
file
${
SOURCE_FILES
}
)
IF
(
file MATCHES
".*Vec8.*"
)
IF
(
MSVC
)
IF
(
MSVC
)
SET_SOURCE_FILES_PROPERTIES
(
${
file
}
PROPERTIES COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
/arch:AVX /D__AVX__"
)
ELSE
(
MSVC
)
SET_SOURCE_FILES_PROPERTIES
(
${
file
}
PROPERTIES COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-msse4.1 -mavx"
)
ENDIF
(
MSVC
)
ELSE
(
MSVC
)
IF
(
NOT ANDROID
)
SET_SOURCE_FILES_PROPERTIES
(
${
file
}
PROPERTIES COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-msse4.1 -mavx"
)
ENDIF
(
NOT ANDROID
)
ENDIF
(
MSVC
)
ELSE
(
file MATCHES
".*Vec8.*"
)
IF
(
NOT MSVC
)
SET_SOURCE_FILES_PROPERTIES
(
${
file
}
PROPERTIES COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-msse4.1"
)
ENDIF
(
NOT MSVC
)
IF
(
NOT MSVC
)
IF
(
NOT ANDROID
)
SET_SOURCE_FILES_PROPERTIES
(
${
file
}
PROPERTIES COMPILE_FLAGS
"
${
EXTRA_COMPILE_FLAGS
}
-msse4.1"
)
ENDIF
(
NOT ANDROID
)
ENDIF
(
NOT MSVC
)
ENDIF
(
file MATCHES
".*Vec8.*"
)
ENDFOREACH
(
file
)
ADD_LIBRARY
(
${
SHARED_TARGET
}
SHARED
${
SOURCE_FILES
}
${
SOURCE_INCLUDE_FILES
}
${
API_ABS_INCLUDE_FILES
}
)
...
...
platforms/cpu/src/CpuNeighborList.cpp
View file @
a72e364f
...
...
@@ -37,7 +37,6 @@
#include <set>
#include <map>
#include <cmath>
#include <smmintrin.h>
using
namespace
std
;
...
...
platforms/cpu/src/CpuNonbondedForceVec4.cpp
View file @
a72e364f
...
...
@@ -103,7 +103,7 @@ void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, dou
dEdR
=
epsSig6
*
(
12.0
f
*
sig6
-
6.0
f
);
energy
=
epsSig6
*
(
sig6
-
1.0
f
);
if
(
useSwitch
)
{
fvec4
t
=
(
r
>
switchingDistance
)
&
(
(
r
-
switchingDistance
)
*
invSwitchingInterval
);
fvec4
t
=
blend
(
0.0
f
,
(
r
-
switchingDistance
)
*
invSwitchingInterval
,
r
>
switchingDistance
);
fvec4
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec4
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
*
invSwitchingInterval
;
dEdR
=
switchValue
*
dEdR
-
energy
*
switchDeriv
*
r
;
...
...
@@ -214,7 +214,7 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces
dEdR
=
epsSig6
*
(
12.0
f
*
sig6
-
6.0
f
);
energy
=
epsSig6
*
(
sig6
-
1.0
f
);
if
(
useSwitch
)
{
fvec4
t
=
(
r
>
switchingDistance
)
&
(
(
r
-
switchingDistance
)
*
invSwitchingInterval
);
fvec4
t
=
blend
(
0.0
f
,
(
r
-
switchingDistance
)
*
invSwitchingInterval
,
r
>
switchingDistance
);
fvec4
switchValue
=
1
+
t
*
t
*
t
*
(
-
10.0
f
+
t
*
(
15.0
f
-
t
*
6.0
f
));
fvec4
switchDeriv
=
t
*
t
*
(
-
30.0
f
+
t
*
(
60.0
f
-
t
*
30.0
f
))
*
invSwitchingInterval
;
dEdR
=
switchValue
*
dEdR
-
energy
*
switchDeriv
*
r
;
...
...
platforms/cpu/src/CpuPlatform.cpp
View file @
a72e364f
...
...
@@ -36,6 +36,7 @@
#include "ReferenceConstraints.h"
#include "openmm/internal/hardware.h"
#include <sstream>
#include <stdlib.h>
using
namespace
OpenMM
;
using
namespace
std
;
...
...
@@ -93,15 +94,20 @@ bool CpuPlatform::supportsDoublePrecision() const {
}
bool
CpuPlatform
::
isProcessorSupported
()
{
// Make sure the CPU supports SSE 4.1.
int
cpuInfo
[
4
];
cpuid
(
cpuInfo
,
0
);
if
(
cpuInfo
[
0
]
>=
1
)
{
cpuid
(
cpuInfo
,
1
);
return
((
cpuInfo
[
2
]
&
((
int
)
1
<<
19
))
!=
0
);
}
return
false
;
// Make sure the CPU supports SSE 4.1 or NEON.
#ifdef __ANDROID__
uint64_t
features
=
android_getCpuFeatures
();
return
(
features
&
ANDROID_CPU_ARM_FEATURE_NEON
)
!=
0
;
#else
int
cpuInfo
[
4
];
cpuid
(
cpuInfo
,
0
);
if
(
cpuInfo
[
0
]
>=
1
)
{
cpuid
(
cpuInfo
,
1
);
return
((
cpuInfo
[
2
]
&
((
int
)
1
<<
19
))
!=
0
);
}
return
false
;
#endif
}
void
CpuPlatform
::
contextCreated
(
ContextImpl
&
context
,
const
map
<
string
,
string
>&
properties
)
const
{
...
...
platforms/opencl/include/OpenCLNonbondedUtilities.h
View file @
a72e364f
...
...
@@ -284,7 +284,8 @@ private:
std
::
map
<
std
::
string
,
std
::
string
>
kernelDefines
;
double
cutoff
;
bool
useCutoff
,
usePeriodic
,
deviceIsCpu
,
anyExclusions
,
usePadding
;
int
numForceBuffers
,
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
numForceThreadBlocks
,
forceThreadBlockSize
,
nonbondedForceGroup
;
int
numForceBuffers
,
startTileIndex
,
numTiles
,
startBlockIndex
,
numBlocks
,
numForceThreadBlocks
;
int
forceThreadBlockSize
,
interactingBlocksThreadBlockSize
,
nonbondedForceGroup
;
};
/**
...
...
platforms/opencl/src/OpenCLFFT3D.cpp
View file @
a72e364f
This diff is collapsed.
Click to expand it.
platforms/opencl/src/OpenCLKernels.cpp
View file @
a72e364f
...
...
@@ -4820,7 +4820,7 @@ void OpenCLIntegrateVariableVerletStepKernel::initialize(const System& system, c
kernel1
=
cl
::
Kernel
(
program
,
"integrateVerletPart1"
);
kernel2
=
cl
::
Kernel
(
program
,
"integrateVerletPart2"
);
selectSizeKernel
=
cl
::
Kernel
(
program
,
"selectVerletStepSize"
);
blockSize
=
min
(
min
(
256
,
system
.
getNumParticles
()),
(
int
)
cl
.
getDevice
().
getInfo
<
CL_DEVICE_MAX
_WORK_GROUP_SIZE
>
());
blockSize
=
min
(
min
(
256
,
system
.
getNumParticles
()),
(
int
)
selectSizeKernel
.
getWorkGroupInfo
<
CL_KERNEL
_WORK_GROUP_SIZE
>
(
cl
.
getDevice
()
));
}
double
OpenCLIntegrateVariableVerletStepKernel
::
execute
(
ContextImpl
&
context
,
const
VariableVerletIntegrator
&
integrator
,
double
maxTime
)
{
...
...
@@ -4930,7 +4930,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
params
=
new
OpenCLArray
(
cl
,
3
,
cl
.
getUseDoublePrecision
()
||
cl
.
getUseMixedPrecision
()
?
sizeof
(
cl_double
)
:
sizeof
(
cl_float
),
"langevinParams"
);
blockSize
=
min
(
256
,
system
.
getNumParticles
());
blockSize
=
max
(
blockSize
,
params
->
getSize
());
blockSize
=
min
(
blockSize
,
(
int
)
cl
.
getDevice
().
getInfo
<
CL_DEVICE_MAX
_WORK_GROUP_SIZE
>
());
blockSize
=
min
(
blockSize
,
(
int
)
selectSizeKernel
.
getWorkGroupInfo
<
CL_KERNEL
_WORK_GROUP_SIZE
>
(
cl
.
getDevice
()
));
}
double
OpenCLIntegrateVariableLangevinStepKernel
::
execute
(
ContextImpl
&
context
,
const
VariableLangevinIntegrator
&
integrator
,
double
maxTime
)
{
...
...
platforms/opencl/src/OpenCLNonbondedUtilities.cpp
View file @
a72e364f
...
...
@@ -317,42 +317,55 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
for
(
int
i
=
0
;
i
<
(
int
)
exclusionBlocksForBlock
.
size
();
i
++
)
maxExclusions
=
(
maxExclusions
>
exclusionBlocksForBlock
[
i
].
size
()
?
maxExclusions
:
exclusionBlocksForBlock
[
i
].
size
());
defines
[
"MAX_EXCLUSIONS"
]
=
context
.
intToString
(
maxExclusions
);
defines
[
"GROUP_SIZE"
]
=
(
deviceIsCpu
?
"32"
:
"128"
);
defines
[
"BUFFER_GROUPS"
]
=
(
deviceIsCpu
?
"4"
:
"2"
);
string
file
=
(
deviceIsCpu
?
OpenCLKernelSources
::
findInteractingBlocks_cpu
:
OpenCLKernelSources
::
findInteractingBlocks
);
cl
::
Program
interactingBlocksProgram
=
context
.
createProgram
(
file
,
defines
);
findBlockBoundsKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findBlockBounds"
);
findBlockBoundsKernel
.
setArg
<
cl_int
>
(
0
,
context
.
getNumAtoms
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
context
.
getPosq
().
getDeviceBuffer
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
blockCenter
->
getDeviceBuffer
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
blockBoundingBox
->
getDeviceBuffer
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
rebuildNeighborList
->
getDeviceBuffer
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
sortedBlocks
->
getDeviceBuffer
());
sortBoxDataKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"sortBoxData"
);
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
sortedBlocks
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
blockCenter
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
blockBoundingBox
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
sortedBlockCenter
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
sortedBlockBoundingBox
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
context
.
getPosq
().
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
oldPositions
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionCount
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
rebuildNeighborList
->
getDeviceBuffer
());
findInteractingBlocksKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findBlocksWithInteractions"
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
interactionCount
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingAtoms
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
context
.
getPosq
().
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
6
,
interactingTiles
->
getSize
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
7
,
startBlockIndex
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
8
,
numBlocks
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
9
,
sortedBlocks
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
10
,
sortedBlockCenter
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
11
,
sortedBlockBoundingBox
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
12
,
exclusionIndices
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
13
,
exclusionRowIndices
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
14
,
oldPositions
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
15
,
rebuildNeighborList
->
getDeviceBuffer
());
int
groupSize
=
(
deviceIsCpu
?
32
:
128
);
while
(
true
)
{
defines
[
"GROUP_SIZE"
]
=
context
.
intToString
(
groupSize
);
cl
::
Program
interactingBlocksProgram
=
context
.
createProgram
(
file
,
defines
);
findBlockBoundsKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findBlockBounds"
);
findBlockBoundsKernel
.
setArg
<
cl_int
>
(
0
,
context
.
getNumAtoms
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
context
.
getPosq
().
getDeviceBuffer
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
blockCenter
->
getDeviceBuffer
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
blockBoundingBox
->
getDeviceBuffer
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
rebuildNeighborList
->
getDeviceBuffer
());
findBlockBoundsKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
sortedBlocks
->
getDeviceBuffer
());
sortBoxDataKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"sortBoxData"
);
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
0
,
sortedBlocks
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
1
,
blockCenter
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
blockBoundingBox
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
sortedBlockCenter
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
sortedBlockBoundingBox
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
context
.
getPosq
().
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
6
,
oldPositions
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
7
,
interactionCount
->
getDeviceBuffer
());
sortBoxDataKernel
.
setArg
<
cl
::
Buffer
>
(
8
,
rebuildNeighborList
->
getDeviceBuffer
());
findInteractingBlocksKernel
=
cl
::
Kernel
(
interactingBlocksProgram
,
"findBlocksWithInteractions"
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
2
,
interactionCount
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
3
,
interactingTiles
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
4
,
interactingAtoms
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
5
,
context
.
getPosq
().
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
6
,
interactingTiles
->
getSize
());
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
7
,
startBlockIndex
);
findInteractingBlocksKernel
.
setArg
<
cl_uint
>
(
8
,
numBlocks
);
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
9
,
sortedBlocks
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
10
,
sortedBlockCenter
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
11
,
sortedBlockBoundingBox
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
12
,
exclusionIndices
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
13
,
exclusionRowIndices
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
14
,
oldPositions
->
getDeviceBuffer
());
findInteractingBlocksKernel
.
setArg
<
cl
::
Buffer
>
(
15
,
rebuildNeighborList
->
getDeviceBuffer
());
if
(
findInteractingBlocksKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
())
<
groupSize
)
{
// The device can't handle this block size, so reduce it.
groupSize
-=
32
;
if
(
groupSize
<
32
)
throw
OpenMMException
(
"Failed to create findInteractingBlocks kernel"
);
continue
;
}
break
;
}
interactingBlocksThreadBlockSize
=
(
deviceIsCpu
?
1
:
groupSize
);
}
}
...
...
@@ -389,7 +402,7 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
context
.
executeKernel
(
sortBoxDataKernel
,
context
.
getNumAtoms
());
setPeriodicBoxSizeArg
(
context
,
findInteractingBlocksKernel
,
0
);
setInvPeriodicBoxSizeArg
(
context
,
findInteractingBlocksKernel
,
1
);
context
.
executeKernel
(
findInteractingBlocksKernel
,
context
.
getNumAtoms
(),
deviceIsCpu
?
1
:
128
);
context
.
executeKernel
(
findInteractingBlocksKernel
,
context
.
getNumAtoms
(),
interactingBlocksThreadBlockSize
);
}
void
OpenCLNonbondedUtilities
::
computeInteractions
()
{
...
...
platforms/opencl/src/OpenCLPlatform.cpp
View file @
a72e364f
...
...
@@ -32,6 +32,7 @@
#include "openmm/Context.h"
#include "openmm/System.h"
#include <algorithm>
#include <cctype>
#include <sstream>
#ifdef __APPLE__
#include "sys/sysctl.h"
...
...
@@ -39,10 +40,7 @@
using
namespace
OpenMM
;
using
std
::
map
;
using
std
::
string
;
using
std
::
stringstream
;
using
std
::
vector
;
using
namespace
std
;
#ifdef OPENMM_OPENCL_BUILDING_STATIC_LIBRARY
extern
"C"
void
registerOpenCLPlatform
()
{
...
...
platforms/opencl/src/OpenCLSort.cpp
View file @
a72e364f
...
...
@@ -56,10 +56,13 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned
int
maxGroupSize
=
std
::
min
(
256
,
(
int
)
context
.
getDevice
().
getInfo
<
CL_DEVICE_MAX_WORK_GROUP_SIZE
>
());
int
maxSharedMem
=
context
.
getDevice
().
getInfo
<
CL_DEVICE_LOCAL_MEM_SIZE
>
();
unsigned
int
maxLocalBuffer
=
(
unsigned
int
)
((
maxSharedMem
/
trait
->
getDataSize
())
/
2
);
isShortList
=
(
length
<=
maxLocalBuffer
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxGroupSize
;
rangeKernelSize
*=
2
)
unsigned
int
maxRangeSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeRangeKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxPositionsSize
=
std
::
min
(
maxGroupSize
,
(
unsigned
int
)
computeBucketPositionsKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
()));
unsigned
int
maxShortListSize
=
shortListKernel
.
getWorkGroupInfo
<
CL_KERNEL_WORK_GROUP_SIZE
>
(
context
.
getDevice
());
isShortList
=
(
length
<=
maxLocalBuffer
&&
length
<
maxShortListSize
);
for
(
rangeKernelSize
=
1
;
rangeKernelSize
*
2
<=
maxRangeSize
;
rangeKernelSize
*=
2
)
;
positionsKernelSize
=
rangeKernelSize
;
positionsKernelSize
=
std
::
min
(
rangeKernelSize
,
maxPositionsSize
)
;
sortKernelSize
=
(
isShortList
?
rangeKernelSize
:
rangeKernelSize
/
2
);
if
(
rangeKernelSize
>
length
)
rangeKernelSize
=
length
;
...
...
plugins/cpupme/CMakeLists.txt
View file @
a72e364f
...
...
@@ -57,7 +57,11 @@ ENDFOREACH(subdir)
INCLUDE_DIRECTORIES
(
BEFORE
${
CMAKE_CURRENT_SOURCE_DIR
}
/src
)
IF
(
NOT MSVC
)
SET_SOURCE_FILES_PROPERTIES
(
${
SOURCE_FILES
}
PROPERTIES COMPILE_FLAGS
"-msse4.1"
)
IF
(
ANDROID
)
SET_SOURCE_FILES_PROPERTIES
(
${
SOURCE_FILES
}
PROPERTIES COMPILE_FLAGS
""
)
ELSE
(
ANDROID
)
SET_SOURCE_FILES_PROPERTIES
(
${
SOURCE_FILES
}
PROPERTIES COMPILE_FLAGS
"-msse4.1"
)
ENDIF
(
ANDROID
)
ENDIF
(
NOT MSVC
)
# Include FFTW related files.
...
...
plugins/cpupme/src/CpuPmeKernels.cpp
View file @
a72e364f
...
...
@@ -569,13 +569,20 @@ double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
}
bool
CpuCalcPmeReciprocalForceKernel
::
isProcessorSupported
()
{
int
cpuInfo
[
4
];
cpuid
(
cpuInfo
,
0
);
if
(
cpuInfo
[
0
]
>=
1
)
{
cpuid
(
cpuInfo
,
1
);
return
((
cpuInfo
[
2
]
&
((
int
)
1
<<
19
))
!=
0
);
// Require SSE 4.1
}
return
false
;
// Make sure the CPU supports SSE 4.1 or NEON.
#ifdef __ANDROID__
uint64_t
features
=
android_getCpuFeatures
();
return
(
features
&
ANDROID_CPU_ARM_FEATURE_NEON
)
!=
0
;
#else
int
cpuInfo
[
4
];
cpuid
(
cpuInfo
,
0
);
if
(
cpuInfo
[
0
]
>=
1
)
{
cpuid
(
cpuInfo
,
1
);
return
((
cpuInfo
[
2
]
&
((
int
)
1
<<
19
))
!=
0
);
}
return
false
;
#endif
}
int
CpuCalcPmeReciprocalForceKernel
::
findFFTDimension
(
int
minimum
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment