Commit a72e364f authored by peastman's avatar peastman
Browse files

Merge pull request #516 from peastman/qc

Lots of changes to support Android
parents 0c00acd2 d7569df2
......@@ -61,6 +61,11 @@ ELSE(WIN32)
ENDIF(NOT OPENMM_INSTALL_PREFIX)
ENDIF(WIN32)
# Include CPU-Features for Android
IF (ANDROID)
INCLUDE_DIRECTORIES(${ANDROID_NDK}/sources/cpufeatures)
ENDIF (ANDROID)
# It seems that on linux and mac, everything is trying to be installed in /usr/local/openmm
# But if every install target is prefixed with /openmm/, on Windows the install files
# end up in C:/Program Files/OpenMM/openmm/ which is ugly.
......@@ -87,9 +92,11 @@ IF(WIN32)
SET(PTHREADS_LIB pthreadVC2)
SET(PTHREADS_LIB_STATIC pthreadVC2_static_mt)
ELSE(WIN32)
SET(PTHREADS_LIB pthread)
# in linux, even in static builds we link against the dynamic object (since its tied to libc versions)
SET(PTHREADS_LIB_STATIC pthread)
IF (NOT ANDROID)
SET(PTHREADS_LIB pthread)
# in linux, even in static builds we link against the dynamic object (since its tied to libc versions)
SET(PTHREADS_LIB_STATIC pthread)
ENDIF (NOT ANDROID)
ENDIF(WIN32)
# The build system will set ARCH64 for 64 bit builds, which require
......@@ -121,11 +128,11 @@ IF (APPLE)
SET (CMAKE_INSTALL_NAME_DIR "@rpath")
SET(EXTRA_COMPILE_FLAGS "-msse2 -stdlib=libc++")
ELSE (APPLE)
IF (MSVC)
IF (MSVC OR ANDROID)
SET(EXTRA_COMPILE_FLAGS)
ELSE (MSVC)
ELSE (MSVC OR ANDROID)
SET(EXTRA_COMPILE_FLAGS "-msse2")
ENDIF (MSVC)
ENDIF (MSVC OR ANDROID)
ENDIF (APPLE)
IF(UNIX AND NOT CMAKE_BUILD_TYPE)
......@@ -137,8 +144,13 @@ IF (NOT CMAKE_CXX_FLAGS_DEBUG)
ENDIF (NOT CMAKE_CXX_FLAGS_DEBUG)
IF (NOT CMAKE_CXX_FLAGS_RELEASE)
SET(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING
"To use when CMAKE_BUILD_TYPE=Release" FORCE)
IF (ANDROID)
SET(CMAKE_CXX_FLAGS_RELEASE "-mfloat-abi=softfp -march=armv7-a -mfpu=neon -funsafe-math-optimizations -O3 -DNDEBUG" CACHE STRING
"To use when CMAKE_BUILD_TYPE=Release" FORCE)
ELSE (ANDROID)
SET(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING
"To use when CMAKE_BUILD_TYPE=Release" FORCE)
ENDIF (ANDROID)
ENDIF (NOT CMAKE_CXX_FLAGS_RELEASE)
......@@ -252,7 +264,11 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
## OpenMM was previously installed there.
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
ENDFOREACH(subdir)
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS "-DHAVE_SSE2=1")
IF (ANDROID)
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS "-UHAVE_SSE2")
ELSE (ANDROID)
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS "-DHAVE_SSE2=1")
ENDIF(ANDROID)
# If API wrappers are being generated, and add them to the build.
SET(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS ON CACHE BOOL "Build wrappers for C and Fortran")
......@@ -287,13 +303,17 @@ ENDIF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
# On Linux need to link to libdl
FIND_LIBRARY(DL_LIBRARY dl)
IF(DL_LIBRARY)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
IF(OPENMM_BUILD_STATIC_LIB)
TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
ENDIF(OPENMM_BUILD_STATIC_LIB)
MARK_AS_ADVANCED(DL_LIBRARY)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
IF(OPENMM_BUILD_STATIC_LIB)
TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${DL_LIBRARY} ${PTHREADS_LIB})
ENDIF(OPENMM_BUILD_STATIC_LIB)
MARK_AS_ADVANCED(DL_LIBRARY)
ELSE(DL_LIBRARY)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${PTHREADS_LIB})
IF (ANDROID)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${PTHREADS_LIB} cpufeatures)
ELSE (ANDROID)
TARGET_LINK_LIBRARIES(${SHARED_TARGET} ${PTHREADS_LIB})
ENDIF (ANDROID)
ENDIF(DL_LIBRARY)
ADD_SUBDIRECTORY(platforms/reference/tests)
......
......@@ -47,8 +47,12 @@
#define NOMINMAX
#include <windows.h>
#else
#include <dlfcn.h>
#include <unistd.h>
#ifdef __ANDROID__
#include <cpu-features.h>
#else
#include <dlfcn.h>
#include <unistd.h>
#endif
#endif
#endif
......@@ -70,11 +74,15 @@ static int getNumProcessors() {
ncpu = 1;
return ncpu;
#else
long nProcessorsOnline = sysconf(_SC_NPROCESSORS_ONLN);
if (nProcessorsOnline == -1)
return 1;
else
return (int) nProcessorsOnline;
#ifdef __ANDROID__
return android_getCpuCount();
#else
long nProcessorsOnline = sysconf(_SC_NPROCESSORS_ONLN);
if (nProcessorsOnline == -1)
return 1;
else
return (int) nProcessorsOnline;
#endif
#endif
#endif
}
......@@ -85,30 +93,32 @@ static int getNumProcessors() {
#ifdef _WIN32
#define cpuid __cpuid
#else
static void cpuid(int cpuInfo[4], int infoType){
#ifdef __LP64__
__asm__ __volatile__ (
"cpuid":
"=a" (cpuInfo[0]),
"=b" (cpuInfo[1]),
"=c" (cpuInfo[2]),
"=d" (cpuInfo[3]) :
"a" (infoType)
);
#else
__asm__ __volatile__ (
"pushl %%ebx\n"
"cpuid\n"
"movl %%ebx, %1\n"
"popl %%ebx\n" :
"=a" (cpuInfo[0]),
"=r" (cpuInfo[1]),
"=c" (cpuInfo[2]),
"=d" (cpuInfo[3]) :
"a" (infoType)
);
#endif
}
#ifndef __ANDROID__
static void cpuid(int cpuInfo[4], int infoType){
#ifdef __LP64__
__asm__ __volatile__ (
"cpuid":
"=a" (cpuInfo[0]),
"=b" (cpuInfo[1]),
"=c" (cpuInfo[2]),
"=d" (cpuInfo[3]) :
"a" (infoType)
);
#else
__asm__ __volatile__ (
"pushl %%ebx\n"
"cpuid\n"
"movl %%ebx, %1\n"
"popl %%ebx\n" :
"=a" (cpuInfo[0]),
"=r" (cpuInfo[1]),
"=c" (cpuInfo[2]),
"=d" (cpuInfo[3]) :
"a" (infoType)
);
#endif
}
#endif
#endif
#endif // OPENMM_HARDWARE_H_
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Portions copyright (c) 2014 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -31,256 +31,11 @@
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <smmintrin.h>
// This file defines classes and functions to simplify vectorizing code with SSE.
class ivec4;
/**
* A four element vector of floats.
*/
class fvec4 {
public:
__m128 val;
fvec4() {}
fvec4(float v) : val(_mm_set1_ps(v)) {}
fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
fvec4(__m128 v) : val(v) {}
fvec4(const float* v) : val(_mm_loadu_ps(v)) {}
operator __m128() const {
return val;
}
float operator[](int i) const {
float result[4];
store(result);
return result[i];
}
void store(float* v) const {
_mm_storeu_ps(v, val);
}
fvec4 operator+(const fvec4& other) const {
return _mm_add_ps(val, other);
}
fvec4 operator-(const fvec4& other) const {
return _mm_sub_ps(val, other);
}
fvec4 operator*(const fvec4& other) const {
return _mm_mul_ps(val, other);
}
fvec4 operator/(const fvec4& other) const {
return _mm_div_ps(val, other);
}
void operator+=(const fvec4& other) {
val = _mm_add_ps(val, other);
}
void operator-=(const fvec4& other) {
val = _mm_sub_ps(val, other);
}
void operator*=(const fvec4& other) {
val = _mm_mul_ps(val, other);
}
void operator/=(const fvec4& other) {
val = _mm_div_ps(val, other);
}
fvec4 operator-() const {
return _mm_sub_ps(_mm_set1_ps(0.0f), val);
}
fvec4 operator&(const fvec4& other) const {
return _mm_and_ps(val, other);
}
fvec4 operator|(const fvec4& other) const {
return _mm_or_ps(val, other);
}
fvec4 operator==(const fvec4& other) const {
return _mm_cmpeq_ps(val, other);
}
fvec4 operator!=(const fvec4& other) const {
return _mm_cmpneq_ps(val, other);
}
fvec4 operator>(const fvec4& other) const {
return _mm_cmpgt_ps(val, other);
}
fvec4 operator<(const fvec4& other) const {
return _mm_cmplt_ps(val, other);
}
fvec4 operator>=(const fvec4& other) const {
return _mm_cmpge_ps(val, other);
}
fvec4 operator<=(const fvec4& other) const {
return _mm_cmple_ps(val, other);
}
operator ivec4() const;
};
/**
* A four element vector of ints.
*/
class ivec4 {
public:
__m128i val;
ivec4() {}
ivec4(int v) : val(_mm_set1_epi32(v)) {}
ivec4(int v1, int v2, int v3, int v4) : val(_mm_set_epi32(v4, v3, v2, v1)) {}
ivec4(__m128i v) : val(v) {}
ivec4(const int* v) : val(_mm_loadu_si128((const __m128i*) v)) {}
operator __m128i() const {
return val;
}
int operator[](int i) const {
int result[4];
store(result);
return result[i];
}
void store(int* v) const {
_mm_storeu_si128((__m128i*) v, val);
}
ivec4 operator+(const ivec4& other) const {
return _mm_add_epi32(val, other);
}
ivec4 operator-(const ivec4& other) const {
return _mm_sub_epi32(val, other);
}
ivec4 operator*(const ivec4& other) const {
return _mm_mul_epi32(val, other);
}
void operator+=(const ivec4& other) {
val = _mm_add_epi32(val, other);
}
void operator-=(const ivec4& other) {
val = _mm_sub_epi32(val, other);
}
void operator*=(const ivec4& other) {
val = _mm_mul_epi32(val, other);
}
ivec4 operator-() const {
return _mm_sub_epi32(_mm_set1_epi32(0), val);
}
ivec4 operator&(const ivec4& other) const {
return _mm_and_si128(val, other);
}
ivec4 operator|(const ivec4& other) const {
return _mm_or_si128(val, other);
}
ivec4 operator==(const ivec4& other) const {
return _mm_cmpeq_epi32(val, other);
}
ivec4 operator!=(const ivec4& other) const {
return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF));
}
ivec4 operator>(const ivec4& other) const {
return _mm_cmpgt_epi32(val, other);
}
ivec4 operator<(const ivec4& other) const {
return _mm_cmplt_epi32(val, other);
}
ivec4 operator>=(const ivec4& other) const {
return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
}
ivec4 operator<=(const ivec4& other) const {
return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
}
operator fvec4() const;
};
// Conversion operators.
inline fvec4::operator ivec4() const {
return _mm_cvttps_epi32(val);
}
inline ivec4::operator fvec4() const {
return _mm_cvtepi32_ps(val);
}
// Functions that operate on fvec4s.
static inline fvec4 floor(const fvec4& v) {
return fvec4(_mm_floor_ps(v.val));
}
static inline fvec4 ceil(const fvec4& v) {
return fvec4(_mm_ceil_ps(v.val));
}
static inline fvec4 round(const fvec4& v) {
return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
}
static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
return fvec4(_mm_min_ps(v1.val, v2.val));
}
static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
return fvec4(_mm_max_ps(v1.val, v2.val));
}
static inline fvec4 abs(const fvec4& v) {
static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
return fvec4(_mm_and_ps(v.val, mask));
}
static inline fvec4 sqrt(const fvec4& v) {
return fvec4(_mm_sqrt_ps(v.val));
}
static inline float dot3(const fvec4& v1, const fvec4& v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
}
static inline float dot4(const fvec4& v1, const fvec4& v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
}
static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
_MM_TRANSPOSE4_PS(v1, v2, v3, v4);
}
// Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
return ivec4(_mm_min_epi32(v1.val, v2.val));
}
static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
return ivec4(_mm_max_epi32(v1.val, v2.val));
}
static inline ivec4 abs(const ivec4& v) {
return ivec4(_mm_abs_epi32(v.val));
}
static inline bool any(const ivec4& v) {
return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF));
}
// Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, const fvec4& v2) {
return fvec4(v1)+v2;
}
static inline fvec4 operator-(float v1, const fvec4& v2) {
return fvec4(v1)-v2;
}
static inline fvec4 operator*(float v1, const fvec4& v2) {
return fvec4(v1)*v2;
}
static inline fvec4 operator/(float v1, const fvec4& v2) {
return fvec4(v1)/v2;
}
// Operations for blending fvec4s based on an ivec4.
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val)));
}
#if defined(__ANDROID__)
#include "vectorize_neon.h"
#else
#include "vectorize_sse.h"
#endif
#endif /*OPENMM_VECTORIZE_H_*/
#ifndef OPENMM_VECTORIZE_NEON_H_
#define OPENMM_VECTORIZE_NEON_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013-2014 Stanford University and the Authors. *
* Authors: Mateus Lima, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <cpu-features.h>
#include <arm_neon.h>
#include <cmath>
typedef int int32_t;
// This file defines classes and functions to simplify vectorizing code with NEON.
class ivec4;
/**
* A four element vector of floats.
*/
class fvec4 {
public:
float32x4_t val;
fvec4() {}
fvec4(float v) : val(vdupq_n_f32(v)) {}
fvec4(float v1, float v2, float v3, float v4) {
float v[] = {v1, v2, v3, v4};
val = vld1q_f32(v);
}
fvec4(float32x4_t v) : val(v) {}
fvec4(const float* v) : val(vld1q_f32(v)) {}
operator float32x4_t() const {
return val;
}
float operator[](int i) const {
switch (i) {
case 0:
return vgetq_lane_f32(val, 0);
case 1:
return vgetq_lane_f32(val, 1);
case 2:
return vgetq_lane_f32(val, 2);
case 3:
return vgetq_lane_f32(val, 3);
}
return 0.0f;
}
void store(float* v) const {
vst1q_f32(v, val);
}
fvec4 operator+(const fvec4& other) const {
return vaddq_f32(val, other);
}
fvec4 operator-(const fvec4& other) const {
return vsubq_f32(val, other);
}
fvec4 operator*(const fvec4& other) const {
return vmulq_f32(val, other);
}
fvec4 operator/(const fvec4& other) const {
// NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
float32x4_t reciprocal = vrecpeq_f32(other);
reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
fvec4 result = vmulq_f32(val,reciprocal);
return result;
}
void operator+=(const fvec4& other) {
val = vaddq_f32(val, other);
}
void operator-=(const fvec4& other) {
val = vsubq_f32(val, other);
}
void operator*=(const fvec4& other) {
val = vmulq_f32(val, other);
}
void operator/=(const fvec4& other) {
val = *this/other;
}
fvec4 operator-() const {
return vnegq_f32(val);
}
fvec4 operator&(const fvec4& other) const {
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
}
fvec4 operator|(const fvec4& other) const {
return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
}
fvec4 operator==(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vceqq_f32(val, other)));
}
fvec4 operator!=(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(val, other)))); // not(equals(val, other))
}
fvec4 operator>(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgtq_f32(val, other)));
}
fvec4 operator<(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcltq_f32(val, other)));
}
fvec4 operator>=(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgeq_f32(val, other)));
}
fvec4 operator<=(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other)));
}
operator ivec4() const;
};
/**
* A four element vector of ints.
*/
class ivec4 {
public:
int32x4_t val;
ivec4() {}
ivec4(int v) : val(vdupq_n_s32(v)) {}
ivec4(int v1, int v2, int v3, int v4) {
int v[] = {v1, v2, v3, v4};
val = vld1q_s32(v);
}
ivec4(int32x4_t v) : val(v) {}
ivec4(const int* v) : val(vld1q_s32(v)) {}
operator int32x4_t() const {
return val;
}
int operator[](int i) const {
switch (i) {
case 0:
return vgetq_lane_s32(val, 0);
case 1:
return vgetq_lane_s32(val, 1);
case 2:
return vgetq_lane_s32(val, 2);
case 3:
return vgetq_lane_s32(val, 3);
}
return 0;
}
void store(int* v) const {
vst1q_s32(v, val);
}
ivec4 operator+(const ivec4& other) const {
return vaddq_s32(val, other);
}
ivec4 operator-(const ivec4& other) const {
return vsubq_s32(val, other);
}
ivec4 operator*(const ivec4& other) const {
return vmulq_s32(val, other);
}
void operator+=(const ivec4& other) {
val = vaddq_s32(val, other);
}
void operator-=(const ivec4& other) {
val = vsubq_s32(val, other);
}
void operator*=(const ivec4& other) {
val = vmulq_s32(val, other);
}
ivec4 operator-() const {
return vnegq_s32(val);
}
ivec4 operator&(const ivec4& other) const {
return vandq_s32(val, other);
}
ivec4 operator|(const ivec4& other) const {
return vorrq_s32(val, other);
}
ivec4 operator==(const ivec4& other) const {
return vreinterpretq_s32_u32(vceqq_s32(val, other));
}
ivec4 operator!=(const ivec4& other) const {
return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(val, other))); // not(equal(val, other))
}
ivec4 operator>(const ivec4& other) const {
return vreinterpretq_s32_u32(vcgtq_s32(val, other));
}
ivec4 operator<(const ivec4& other) const {
return vreinterpretq_s32_u32(vcltq_s32(val, other));
}
ivec4 operator>=(const ivec4& other) const {
return vreinterpretq_s32_u32(vcgeq_s32(val, other));
}
ivec4 operator<=(const ivec4& other) const {
return vreinterpretq_s32_u32(vcleq_s32(val, other));
}
operator fvec4() const;
};
// Conversion operators.
inline fvec4::operator ivec4() const {
return ivec4(vcvtq_s32_f32(val));
}
inline ivec4::operator fvec4() const {
return fvec4(vcvtq_f32_s32(val));
}
// Functions that operate on fvec4s.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
return vminq_f32(v1, v2);
}
static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
return vmaxq_f32(v1, v2);
}
static inline fvec4 abs(const fvec4& v) {
return vabsq_f32(v);
}
static inline fvec4 sqrt(const fvec4& v) {
float32x4_t recipSqrt = vrsqrteq_f32(v);
recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
return vmulq_f32(v, recipSqrt);
}
static inline float dot3(const fvec4& v1, const fvec4& v2) {
fvec4 result = v1*v2;
return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2);
}
static inline float dot4(const fvec4& v1, const fvec4& v2) {
fvec4 result = v1*v2;
return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
}
static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
float32x4x2_t t1 = vuzpq_f32(v1, v3);
float32x4x2_t t2 = vuzpq_f32(v2, v4);
float32x4x2_t t3 = vtrnq_f32(t1.val[0], t2.val[0]);
float32x4x2_t t4 = vtrnq_f32(t1.val[1], t2.val[1]);
v1 = t3.val[0];
v2 = t4.val[0];
v3 = t3.val[1];
v4 = t4.val[1];
}
// Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
return vminq_s32(v1, v2);
}
static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
return vmaxq_s32(v1, v2);
}
static inline ivec4 abs(const ivec4& v) {
return vabdq_s32(v, ivec4(0));
}
static inline bool any(const ivec4& v) {
return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0);
}
// Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, const fvec4& v2) {
return fvec4(v1)+v2;
}
static inline fvec4 operator-(float v1, const fvec4& v2) {
return fvec4(v1)-v2;
}
static inline fvec4 operator*(float v1, const fvec4& v2) {
return fvec4(v1)*v2;
}
static inline fvec4 operator/(float v1, const fvec4& v2) {
return fvec4(v1)/v2;
}
// Operations for blending fvec4s based on an ivec4.
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1);
}
// These are at the end since they involve other functions defined above.
static inline fvec4 round(const fvec4& v) {
fvec4 shift(0x1.0p23f);
fvec4 absResult = (abs(v)+shift)-shift;
return blend(v, absResult, ivec4(0x7FFFFFFF));
}
static inline fvec4 floor(const fvec4& v) {
fvec4 rounded = round(v);
return rounded + blend(0.0f, -1.0f, rounded>v);
}
static inline fvec4 ceil(const fvec4& v) {
fvec4 rounded = round(v);
return rounded + blend(0.0f, 1.0f, rounded<v);
}
#endif /*OPENMM_VECTORIZE_NEON_H_*/
#ifndef OPENMM_VECTORIZE_SSE_H_
#define OPENMM_VECTORIZE_SSE_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <smmintrin.h>
// This file defines classes and functions to simplify vectorizing code with SSE.
class ivec4;
/**
* A four element vector of floats.
*/
class fvec4 {
public:
__m128 val;
fvec4() {}
fvec4(float v) : val(_mm_set1_ps(v)) {}
fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
fvec4(__m128 v) : val(v) {}
fvec4(const float* v) : val(_mm_loadu_ps(v)) {}
operator __m128() const {
return val;
}
float operator[](int i) const {
float result[4];
store(result);
return result[i];
}
void store(float* v) const {
_mm_storeu_ps(v, val);
}
fvec4 operator+(const fvec4& other) const {
return _mm_add_ps(val, other);
}
fvec4 operator-(const fvec4& other) const {
return _mm_sub_ps(val, other);
}
fvec4 operator*(const fvec4& other) const {
return _mm_mul_ps(val, other);
}
fvec4 operator/(const fvec4& other) const {
return _mm_div_ps(val, other);
}
void operator+=(const fvec4& other) {
val = _mm_add_ps(val, other);
}
void operator-=(const fvec4& other) {
val = _mm_sub_ps(val, other);
}
void operator*=(const fvec4& other) {
val = _mm_mul_ps(val, other);
}
void operator/=(const fvec4& other) {
val = _mm_div_ps(val, other);
}
fvec4 operator-() const {
return _mm_sub_ps(_mm_set1_ps(0.0f), val);
}
fvec4 operator&(const fvec4& other) const {
return _mm_and_ps(val, other);
}
fvec4 operator|(const fvec4& other) const {
return _mm_or_ps(val, other);
}
fvec4 operator==(const fvec4& other) const {
return _mm_cmpeq_ps(val, other);
}
fvec4 operator!=(const fvec4& other) const {
return _mm_cmpneq_ps(val, other);
}
fvec4 operator>(const fvec4& other) const {
return _mm_cmpgt_ps(val, other);
}
fvec4 operator<(const fvec4& other) const {
return _mm_cmplt_ps(val, other);
}
fvec4 operator>=(const fvec4& other) const {
return _mm_cmpge_ps(val, other);
}
fvec4 operator<=(const fvec4& other) const {
return _mm_cmple_ps(val, other);
}
operator ivec4() const;
};
/**
* A four element vector of ints.
*/
class ivec4 {
public:
__m128i val;
ivec4() {}
ivec4(int v) : val(_mm_set1_epi32(v)) {}
ivec4(int v1, int v2, int v3, int v4) : val(_mm_set_epi32(v4, v3, v2, v1)) {}
ivec4(__m128i v) : val(v) {}
ivec4(const int* v) : val(_mm_loadu_si128((const __m128i*) v)) {}
operator __m128i() const {
return val;
}
int operator[](int i) const {
int result[4];
store(result);
return result[i];
}
void store(int* v) const {
_mm_storeu_si128((__m128i*) v, val);
}
ivec4 operator+(const ivec4& other) const {
return _mm_add_epi32(val, other);
}
ivec4 operator-(const ivec4& other) const {
return _mm_sub_epi32(val, other);
}
ivec4 operator*(const ivec4& other) const {
return _mm_mul_epi32(val, other);
}
void operator+=(const ivec4& other) {
val = _mm_add_epi32(val, other);
}
void operator-=(const ivec4& other) {
val = _mm_sub_epi32(val, other);
}
void operator*=(const ivec4& other) {
val = _mm_mul_epi32(val, other);
}
ivec4 operator-() const {
return _mm_sub_epi32(_mm_set1_epi32(0), val);
}
ivec4 operator&(const ivec4& other) const {
return _mm_and_si128(val, other);
}
ivec4 operator|(const ivec4& other) const {
return _mm_or_si128(val, other);
}
ivec4 operator==(const ivec4& other) const {
return _mm_cmpeq_epi32(val, other);
}
ivec4 operator!=(const ivec4& other) const {
return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF));
}
ivec4 operator>(const ivec4& other) const {
return _mm_cmpgt_epi32(val, other);
}
ivec4 operator<(const ivec4& other) const {
return _mm_cmplt_epi32(val, other);
}
ivec4 operator>=(const ivec4& other) const {
return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
}
ivec4 operator<=(const ivec4& other) const {
return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
}
operator fvec4() const;
};
// Conversion operators.
inline fvec4::operator ivec4() const {
return _mm_cvttps_epi32(val);
}
inline ivec4::operator fvec4() const {
return _mm_cvtepi32_ps(val);
}
// Functions that operate on fvec4s.
static inline fvec4 floor(const fvec4& v) {
return fvec4(_mm_floor_ps(v.val));
}
static inline fvec4 ceil(const fvec4& v) {
return fvec4(_mm_ceil_ps(v.val));
}
static inline fvec4 round(const fvec4& v) {
return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
}
static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
return fvec4(_mm_min_ps(v1.val, v2.val));
}
static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
return fvec4(_mm_max_ps(v1.val, v2.val));
}
static inline fvec4 abs(const fvec4& v) {
static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
return fvec4(_mm_and_ps(v.val, mask));
}
static inline fvec4 sqrt(const fvec4& v) {
return fvec4(_mm_sqrt_ps(v.val));
}
static inline float dot3(const fvec4& v1, const fvec4& v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
}
static inline float dot4(const fvec4& v1, const fvec4& v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
}
static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
_MM_TRANSPOSE4_PS(v1, v2, v3, v4);
}
// Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
return ivec4(_mm_min_epi32(v1.val, v2.val));
}
static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
return ivec4(_mm_max_epi32(v1.val, v2.val));
}
static inline ivec4 abs(const ivec4& v) {
return ivec4(_mm_abs_epi32(v.val));
}
static inline bool any(const ivec4& v) {
return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF));
}
// Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, const fvec4& v2) {
return fvec4(v1)+v2;
}
static inline fvec4 operator-(float v1, const fvec4& v2) {
return fvec4(v1)-v2;
}
static inline fvec4 operator*(float v1, const fvec4& v2) {
return fvec4(v1)*v2;
}
static inline fvec4 operator/(float v1, const fvec4& v2) {
return fvec4(v1)/v2;
}
// Operations for blending fvec4s based on an ivec4.
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val)));
}
#endif /*OPENMM_VECTORIZE_SSE_H_*/
......@@ -29,7 +29,6 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <stdexcept>
#if defined(_WIN32) || defined(__CYGWIN__)
#include <windows.h>
static HCRYPTPROV hCryptProv = 0;
......@@ -38,28 +37,31 @@ static HCRYPTPROV hCryptProv = 0;
#include <fcntl.h>
#include <unistd.h>
#endif
#include "openmm/OpenMMException.h"
#include "openmm/internal/OSRngSeed.h"
using OpenMM::OpenMMException;
int osrngseed(void) {
int value;
#if defined(_WIN32) || defined(__CYGWIN__)
if (!::CryptAcquireContextW(&hCryptProv, 0, 0, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT | CRYPT_SILENT)) {
throw std::runtime_error("Failed to initialize Windows random API (CryptoGen)");
throw OpenMMException("Failed to initialize Windows random API (CryptoGen)");
}
if (!CryptGenRandom(hCryptProv, sizeof(int), (BYTE*) &value)) {
::CryptReleaseContext(hCryptProv, 0);
throw std::runtime_error("Failed to get random numbers");
throw OpenMMException("Failed to get random numbers");
}
if (!::CryptReleaseContext(hCryptProv, 0)) {
throw std::runtime_error("Failed to release Windows random API context");
throw OpenMMException("Failed to release Windows random API context");
}
#else
int m_fd = open("/dev/urandom", O_RDONLY);
if (m_fd == -1) {
throw std::runtime_error("Failed to open /dev/urandom");
throw OpenMMException("Failed to open /dev/urandom");
}
if (read(m_fd, &value, sizeof(int)) != sizeof(int)) {
throw std::runtime_error("Failed to read bytes from /dev/urandom");
throw OpenMMException("Failed to read bytes from /dev/urandom");
}
close(m_fd);
#endif
......
FOREACH(file ${SOURCE_FILES})
IF (file MATCHES ".*Vec8.*")
IF (MSVC)
IF (MSVC)
SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__")
ELSE (MSVC)
SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1 -mavx")
ENDIF (MSVC)
ELSE (MSVC)
IF (NOT ANDROID)
SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1 -mavx")
ENDIF (NOT ANDROID)
ENDIF (MSVC)
ELSE (file MATCHES ".*Vec8.*")
IF (NOT MSVC)
SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
ENDIF (NOT MSVC)
IF (NOT MSVC)
IF (NOT ANDROID)
SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
ENDIF (NOT ANDROID)
ENDIF (NOT MSVC)
ENDIF (file MATCHES ".*Vec8.*")
ENDFOREACH(file)
ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
......
......@@ -37,7 +37,6 @@
#include <set>
#include <map>
#include <cmath>
#include <smmintrin.h>
using namespace std;
......
......@@ -103,7 +103,7 @@ void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, dou
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
......@@ -214,7 +214,7 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces
dEdR = epsSig6*(12.0f*sig6 - 6.0f);
energy = epsSig6*(sig6-1.0f);
if (useSwitch) {
fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
dEdR = switchValue*dEdR - energy*switchDeriv*r;
......
......@@ -36,6 +36,7 @@
#include "ReferenceConstraints.h"
#include "openmm/internal/hardware.h"
#include <sstream>
#include <stdlib.h>
using namespace OpenMM;
using namespace std;
......@@ -93,15 +94,20 @@ bool CpuPlatform::supportsDoublePrecision() const {
}
bool CpuPlatform::isProcessorSupported() {
// Make sure the CPU supports SSE 4.1.
int cpuInfo[4];
cpuid(cpuInfo, 0);
if (cpuInfo[0] >= 1) {
cpuid(cpuInfo, 1);
return ((cpuInfo[2] & ((int) 1 << 19)) != 0);
}
return false;
// Make sure the CPU supports SSE 4.1 or NEON.
#ifdef __ANDROID__
uint64_t features = android_getCpuFeatures();
return (features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
#else
int cpuInfo[4];
cpuid(cpuInfo, 0);
if (cpuInfo[0] >= 1) {
cpuid(cpuInfo, 1);
return ((cpuInfo[2] & ((int) 1 << 19)) != 0);
}
return false;
#endif
}
void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
......
......@@ -284,7 +284,8 @@ private:
std::map<std::string, std::string> kernelDefines;
double cutoff;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding;
int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup;
int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, numForceThreadBlocks;
int forceThreadBlockSize, interactingBlocksThreadBlockSize, nonbondedForceGroup;
};
/**
......
This diff is collapsed.
......@@ -4820,7 +4820,7 @@ void OpenCLIntegrateVariableVerletStepKernel::initialize(const System& system, c
kernel1 = cl::Kernel(program, "integrateVerletPart1");
kernel2 = cl::Kernel(program, "integrateVerletPart2");
selectSizeKernel = cl::Kernel(program, "selectVerletStepSize");
blockSize = min(min(256, system.getNumParticles()), (int) cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
blockSize = min(min(256, system.getNumParticles()), (int) selectSizeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
}
double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
......@@ -4930,7 +4930,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
blockSize = min(256, system.getNumParticles());
blockSize = max(blockSize, params->getSize());
blockSize = min(blockSize, (int) cl.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
blockSize = min(blockSize, (int) selectSizeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
}
double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {
......
......@@ -317,42 +317,55 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["GROUP_SIZE"] = (deviceIsCpu ? "32" : "128");
defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
cl::Program interactingBlocksProgram = context.createProgram(file, defines);
findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms());
findBlockBoundsKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(6, rebuildNeighborList->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(7, sortedBlocks->getDeviceBuffer());
sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData");
sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(2, blockBoundingBox->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(3, sortedBlockCenter->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(4, sortedBlockBoundingBox->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList->getDeviceBuffer());
findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractingBlocksKernel.setArg<cl::Buffer>(2, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(3, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(4, interactingAtoms->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(6, interactingTiles->getSize());
findInteractingBlocksKernel.setArg<cl_uint>(7, startBlockIndex);
findInteractingBlocksKernel.setArg<cl_uint>(8, numBlocks);
findInteractingBlocksKernel.setArg<cl::Buffer>(9, sortedBlocks->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(10, sortedBlockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(11, sortedBlockBoundingBox->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(12, exclusionIndices->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(13, exclusionRowIndices->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(14, oldPositions->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(15, rebuildNeighborList->getDeviceBuffer());
int groupSize = (deviceIsCpu ? 32 : 128);
while (true) {
defines["GROUP_SIZE"] = context.intToString(groupSize);
cl::Program interactingBlocksProgram = context.createProgram(file, defines);
findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
findBlockBoundsKernel.setArg<cl_int>(0, context.getNumAtoms());
findBlockBoundsKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(6, rebuildNeighborList->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(7, sortedBlocks->getDeviceBuffer());
sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData");
sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(2, blockBoundingBox->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(3, sortedBlockCenter->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(4, sortedBlockBoundingBox->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList->getDeviceBuffer());
findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractingBlocksKernel.setArg<cl::Buffer>(2, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(3, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(4, interactingAtoms->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(6, interactingTiles->getSize());
findInteractingBlocksKernel.setArg<cl_uint>(7, startBlockIndex);
findInteractingBlocksKernel.setArg<cl_uint>(8, numBlocks);
findInteractingBlocksKernel.setArg<cl::Buffer>(9, sortedBlocks->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(10, sortedBlockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(11, sortedBlockBoundingBox->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(12, exclusionIndices->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(13, exclusionRowIndices->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(14, oldPositions->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(15, rebuildNeighborList->getDeviceBuffer());
if (findInteractingBlocksKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()) < groupSize) {
// The device can't handle this block size, so reduce it.
groupSize -= 32;
if (groupSize < 32)
throw OpenMMException("Failed to create findInteractingBlocks kernel");
continue;
}
break;
}
interactingBlocksThreadBlockSize = (deviceIsCpu ? 1 : groupSize);
}
}
......@@ -389,7 +402,7 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
context.executeKernel(sortBoxDataKernel, context.getNumAtoms());
setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 0);
setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1);
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : 128);
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
}
void OpenCLNonbondedUtilities::computeInteractions() {
......
......@@ -32,6 +32,7 @@
#include "openmm/Context.h"
#include "openmm/System.h"
#include <algorithm>
#include <cctype>
#include <sstream>
#ifdef __APPLE__
#include "sys/sysctl.h"
......@@ -39,10 +40,7 @@
using namespace OpenMM;
using std::map;
using std::string;
using std::stringstream;
using std::vector;
using namespace std;
#ifdef OPENMM_OPENCL_BUILDING_STATIC_LIBRARY
extern "C" void registerOpenCLPlatform() {
......
......@@ -56,10 +56,13 @@ OpenCLSort::OpenCLSort(OpenCLContext& context, SortTrait* trait, unsigned int le
unsigned int maxGroupSize = std::min(256, (int) context.getDevice().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>());
int maxSharedMem = context.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
isShortList = (length <= maxLocalBuffer);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxGroupSize; rangeKernelSize *= 2)
unsigned int maxRangeSize = std::min(maxGroupSize, (unsigned int) computeRangeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
unsigned int maxPositionsSize = std::min(maxGroupSize, (unsigned int) computeBucketPositionsKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice()));
unsigned int maxShortListSize = shortListKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
isShortList = (length <= maxLocalBuffer && length < maxShortListSize);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxRangeSize; rangeKernelSize *= 2)
;
positionsKernelSize = rangeKernelSize;
positionsKernelSize = std::min(rangeKernelSize, maxPositionsSize);
sortKernelSize = (isShortList ? rangeKernelSize : rangeKernelSize/2);
if (rangeKernelSize > length)
rangeKernelSize = length;
......
......@@ -57,7 +57,11 @@ ENDFOREACH(subdir)
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
IF (NOT MSVC)
SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "-msse4.1")
IF (ANDROID)
SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "")
ELSE (ANDROID)
SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "-msse4.1")
ENDIF (ANDROID)
ENDIF (NOT MSVC)
# Include FFTW related files.
......
......@@ -569,13 +569,20 @@ double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
}
bool CpuCalcPmeReciprocalForceKernel::isProcessorSupported() {
int cpuInfo[4];
cpuid(cpuInfo, 0);
if (cpuInfo[0] >= 1) {
cpuid(cpuInfo, 1);
return ((cpuInfo[2] & ((int) 1 << 19)) != 0); // Require SSE 4.1
}
return false;
// Make sure the CPU supports SSE 4.1 or NEON.
#ifdef __ANDROID__
uint64_t features = android_getCpuFeatures();
return (features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
#else
int cpuInfo[4];
cpuid(cpuInfo, 0);
if (cpuInfo[0] >= 1) {
cpuid(cpuInfo, 1);
return ((cpuInfo[2] & ((int) 1 << 19)) != 0);
}
return false;
#endif
}
int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment