Preliminary support for ARM/Android.

454caac9 · peastman · 51828eaa · 454caac9 · 454caac9 · 454caac9
Commit 454caac9 authored May 14, 2014 by peastman
12 changed files
--- a/openmmapi/include/openmm/internal/hardware.h
+++ b/openmmapi/include/openmm/internal/hardware.h
@@ -46,10 +46,14 @@
   #ifdef WIN32
      #define NOMINMAX
      #include <windows.h>
+   #else
+      #ifdef __ANDROID__
+        #include <cpu-features.h>
      #else
        #include <dlfcn.h>
        #include <unistd.h>
      #endif
+   #endif
 #endif
 static int getNumProcessors() {
@@ -70,11 +74,15 @@ static int getNumProcessors() {
        ncpu = 1;
    return ncpu;
 #else
+    #ifdef __ANDROID__
+        return android_getCpuCount();
+    #else
      long nProcessorsOnline = sysconf(_SC_NPROCESSORS_ONLN);
      if (nProcessorsOnline == -1)
          return 1;
      else
          return (int) nProcessorsOnline;
+    #endif
 #endif
 #endif
 }
@@ -85,8 +93,9 @@ static int getNumProcessors() {
 #ifdef _WIN32
 #define cpuid __cpuid
 #else
-static void cpuid(int cpuInfo[4], int infoType){
+#ifndef __ANDROID__
-#ifdef __LP64__
+    static void cpuid(int cpuInfo[4], int infoType){
+    #ifdef __LP64__
        __asm__ __volatile__ (
            "cpuid":
            "=a" (cpuInfo[0]),
@@ -95,7 +104,7 @@ static void cpuid(int cpuInfo[4], int infoType){
            "=d" (cpuInfo[3]) :
            "a" (infoType)
        );
-#else
+    #else
        __asm__ __volatile__ (
            "pushl %%ebx\n"
            "cpuid\n"
@@ -107,8 +116,9 @@ static void cpuid(int cpuInfo[4], int infoType){
            "=d" (cpuInfo[3]) :
            "a" (infoType)
        );
-#endif
+    #endif
-}
+    }
+    #endif
 #endif
 #endif // OPENMM_HARDWARE_H_
--- a/openmmapi/include/openmm/internal/vectorize.h
+++ b/openmmapi/include/openmm/internal/vectorize.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Portions copyright (c) 2014 Stanford University and the Authors.           *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -32,255 +32,10 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */
-#include <smmintrin.h>
+#if defined(__ANDROID__)
+    #include "vectorize_neon.h"
+#else
-// This file defines classes and functions to simplify vectorizing code with SSE.
+    #include "vectorize_sse.h"
+#endif
-class ivec4;
-/**
- * A four element vector of floats.
- */
-class fvec4 {
-public:
-    __m128 val;
-    fvec4() {}
-    fvec4(float v) : val(_mm_set1_ps(v)) {}
-    fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
-    fvec4(__m128 v) : val(v) {}
-    fvec4(const float* v) : val(_mm_loadu_ps(v)) {}
-    operator __m128() const {
-        return val;
-    }
-    float operator[](int i) const {
-        float result[4];
-        store(result);
-        return result[i];
-    }
-    void store(float* v) const {
-        _mm_storeu_ps(v, val);
-    }
-    fvec4 operator+(const fvec4& other) const {
-        return _mm_add_ps(val, other);
-    }
-    fvec4 operator-(const fvec4& other) const {
-        return _mm_sub_ps(val, other);
-    }
-    fvec4 operator*(const fvec4& other) const {
-        return _mm_mul_ps(val, other);
-    }
-    fvec4 operator/(const fvec4& other) const {
-        return _mm_div_ps(val, other);
-    }
-    void operator+=(const fvec4& other) {
-        val = _mm_add_ps(val, other);
-    }
-    void operator-=(const fvec4& other) {
-        val = _mm_sub_ps(val, other);
-    }
-    void operator*=(const fvec4& other) {
-        val = _mm_mul_ps(val, other);
-    }
-    void operator/=(const fvec4& other) {
-        val = _mm_div_ps(val, other);
-    }
-    fvec4 operator-() const {
-        return _mm_sub_ps(_mm_set1_ps(0.0f), val);
-    }
-    fvec4 operator&(const fvec4& other) const {
-        return _mm_and_ps(val, other);
-    }
-    fvec4 operator|(const fvec4& other) const {
-        return _mm_or_ps(val, other);
-    }
-    fvec4 operator==(const fvec4& other) const {
-        return _mm_cmpeq_ps(val, other);
-    }
-    fvec4 operator!=(const fvec4& other) const {
-        return _mm_cmpneq_ps(val, other);
-    }
-    fvec4 operator>(const fvec4& other) const {
-        return _mm_cmpgt_ps(val, other);
-    }
-    fvec4 operator<(const fvec4& other) const {
-        return _mm_cmplt_ps(val, other);
-    }
-    fvec4 operator>=(const fvec4& other) const {
-        return _mm_cmpge_ps(val, other);
-    }
-    fvec4 operator<=(const fvec4& other) const {
-        return _mm_cmple_ps(val, other);
-    }
-    operator ivec4() const;
-};
-/**
- * A four element vector of ints.
- */
-class ivec4 {
-public:
-    __m128i val;
-    ivec4() {}
-    ivec4(int v) : val(_mm_set1_epi32(v)) {}
-    ivec4(int v1, int v2, int v3, int v4) : val(_mm_set_epi32(v4, v3, v2, v1)) {}
-    ivec4(__m128i v) : val(v) {}
-    ivec4(const int* v) : val(_mm_loadu_si128((const __m128i*) v)) {}
-    operator __m128i() const {
-        return val;
-    }
-    int operator[](int i) const {
-        int result[4];
-        store(result);
-        return result[i];
-    }
-    void store(int* v) const {
-        _mm_storeu_si128((__m128i*) v, val);
-    }
-    ivec4 operator+(const ivec4& other) const {
-        return _mm_add_epi32(val, other);
-    }
-    ivec4 operator-(const ivec4& other) const {
-        return _mm_sub_epi32(val, other);
-    }
-    ivec4 operator*(const ivec4& other) const {
-        return _mm_mul_epi32(val, other);
-    }
-    void operator+=(const ivec4& other) {
-        val = _mm_add_epi32(val, other);
-    }
-    void operator-=(const ivec4& other) {
-        val = _mm_sub_epi32(val, other);
-    }
-    void operator*=(const ivec4& other) {
-        val = _mm_mul_epi32(val, other);
-    }
-    ivec4 operator-() const {
-        return _mm_sub_epi32(_mm_set1_epi32(0), val);
-    }
-    ivec4 operator&(const ivec4& other) const {
-        return _mm_and_si128(val, other);
-    }
-    ivec4 operator|(const ivec4& other) const {
-        return _mm_or_si128(val, other);
-    }
-    ivec4 operator==(const ivec4& other) const {
-        return _mm_cmpeq_epi32(val, other);
-    }
-    ivec4 operator!=(const ivec4& other) const {
-        return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF));
-    }
-    ivec4 operator>(const ivec4& other) const {
-        return _mm_cmpgt_epi32(val, other);
-    }
-    ivec4 operator<(const ivec4& other) const {
-        return _mm_cmplt_epi32(val, other);
-    }
-    ivec4 operator>=(const ivec4& other) const {
-        return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
-    }
-    ivec4 operator<=(const ivec4& other) const {
-        return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
-    }
-    operator fvec4() const;
-};
-// Conversion operators.
-inline fvec4::operator ivec4() const {
-    return _mm_cvttps_epi32(val);
-}
-inline ivec4::operator fvec4() const {
-    return _mm_cvtepi32_ps(val);
-}
-// Functions that operate on fvec4s.
-static inline fvec4 floor(const fvec4& v) {
-    return fvec4(_mm_floor_ps(v.val));
-}
-static inline fvec4 ceil(const fvec4& v) {
-    return fvec4(_mm_ceil_ps(v.val));
-}
-static inline fvec4 round(const fvec4& v) {
-    return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
-}
-static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
-    return fvec4(_mm_min_ps(v1.val, v2.val));
-}
-static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
-    return fvec4(_mm_max_ps(v1.val, v2.val));
-}
-static inline fvec4 abs(const fvec4& v) {
-    static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
-    return fvec4(_mm_and_ps(v.val, mask));
-}
-static inline fvec4 sqrt(const fvec4& v) {
-    return fvec4(_mm_sqrt_ps(v.val));
-}
-static inline float dot3(const fvec4& v1, const fvec4& v2) {
-    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
-}
-static inline float dot4(const fvec4& v1, const fvec4& v2) {
-    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
-}
-static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
-    _MM_TRANSPOSE4_PS(v1, v2, v3, v4);
-}
-// Functions that operate on ivec4s.
-static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
-    return ivec4(_mm_min_epi32(v1.val, v2.val));
-}
-static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
-    return ivec4(_mm_max_epi32(v1.val, v2.val));
-}
-static inline ivec4 abs(const ivec4& v) {
-    return ivec4(_mm_abs_epi32(v.val));
-}
-static inline bool any(const ivec4& v) {
-    return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF));
-}
-// Mathematical operators involving a scalar and a vector.
-static inline fvec4 operator+(float v1, const fvec4& v2) {
-    return fvec4(v1)+v2;
-}
-static inline fvec4 operator-(float v1, const fvec4& v2) {
-    return fvec4(v1)-v2;
-}
-static inline fvec4 operator*(float v1, const fvec4& v2) {
-    return fvec4(v1)*v2;
-}
-static inline fvec4 operator/(float v1, const fvec4& v2) {
-    return fvec4(v1)/v2;
-}
-// Operations for blending fvec4s based on an ivec4.
-static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
-    return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val)));
-}
 #endif /*OPENMM_VECTORIZE_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_neon.h
+++ b/openmmapi/include/openmm/internal/vectorize_neon.h
+#ifndef OPENMM_VECTORIZE_NEON_H_
+#define OPENMM_VECTORIZE_NEON_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013-2014 Stanford University and the Authors.      *
+ * Authors: Mateus Lima, Peter Eastman                                        *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include <cpu-features.h>
+#include <arm_neon.h>
+#include <cmath>
+typedef int int32_t;
+// This file defines classes and functions to simplify vectorizing code with NEON.
+class ivec4;
+/**
+ * A four element vector of floats.
+ */
+class fvec4 {
+public:
+    float32x4_t val;
+    fvec4() {}
+    fvec4(float v) : val(vdupq_n_f32(v)) {}
+    fvec4(float v1, float v2, float v3, float v4) {
+        float v[] = {v1, v2, v3, v4};
+        val = vld1q_f32(v);
+    }
+    fvec4(float32x4_t v) : val(v) {}
+    fvec4(const float* v) : val(vld1q_f32(v)) {}
+    operator float32x4_t() const {
+        return val;
+    }
+    float operator[](int i) const {
+        float result[4];
+        store(result);
+        return result[i];
+    }
+    void store(float* v) const {
+        vst1q_f32(v, val);
+    }
+    fvec4 operator+(const fvec4& other) const { // Tested OK
+        return vaddq_f32(val, other);
+    }
+    fvec4 operator-(const fvec4& other) const { // Tested OK
+        return vsubq_f32(val, other);
+    }
+    fvec4 operator*(const fvec4& other) const { // Tested OK
+        return vmulq_f32(val, other);
+    }
+    fvec4 operator/(const fvec4& other) const { // Tested OK
+        // NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
+        float32x4_t reciprocal = vrecpeq_f32(other);
+        reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
+        reciprocal = vmulq_f32(vrecpsq_f32(other, reciprocal), reciprocal);
+        fvec4 result = vmulq_f32(val,reciprocal);
+        return result;
+    }
+    void operator+=(const fvec4& other) {
+        val = vaddq_f32(val, other);
+    }
+    void operator-=(const fvec4& other) {
+        val = vsubq_f32(val, other);
+    }
+    void operator*=(const fvec4& other) {
+        val = vmulq_f32(val, other);
+    }
+    void operator/=(const fvec4& other) {
+        val = val / other.val;
+    }
+    fvec4 operator-() const {
+        return vnegq_f32(val);
+    }
+    fvec4 operator&(const fvec4& other) const {
+        return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
+    }
+    fvec4 operator|(const fvec4& other) const {
+        return vcvtq_f32_s32(vreinterpretq_s32_u32(vorrq_u32(vcvtq_u32_f32(val), vcvtq_u32_f32(other))));
+    }
+    fvec4 operator==(const fvec4& other) const {
+        return vcvtq_f32_s32(vreinterpretq_s32_u32(vceqq_f32(val, other)));
+    }
+    fvec4 operator!=(const fvec4& other) const {
+        return vcvtq_f32_s32(vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(val, other)))); // not(equals(val, other))
+    }
+    fvec4 operator>(const fvec4& other) const {
+        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgtq_f32(val, other)));
+    }
+    fvec4 operator<(const fvec4& other) const {
+        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcltq_f32(val, other)));
+    }
+    fvec4 operator>=(const fvec4& other) const {
+        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgeq_f32(val, other)));
+    }
+    fvec4 operator<=(const fvec4& other) const {
+        return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other)));
+    }
+    operator ivec4() const;
+};
+/**
+ * A four element vector of ints.
+ */
+class ivec4 {
+public:
+    int32x4_t val;
+    ivec4() {}
+    ivec4(int v) : val(vdupq_n_s32(v)) {}
+    ivec4(int v1, int v2, int v3, int v4) {
+        int v[] = {v1, v2, v3, v4};
+        val = vld1q_s32(v);
+    }
+    ivec4(int32x4_t v) : val(v) {}
+    ivec4(const int* v) : val(vld1q_s32(v)) {}
+    operator int32x4_t() const {
+        return val;
+    }
+    int operator[](int i) const {
+        int result[4];
+        store(result);
+        return result[i];
+    }
+    void store(int* v) const {
+        vst1q_s32(v, val);
+    }
+    ivec4 operator+(const ivec4& other) const {
+        return vaddq_s32(val, other);
+    }
+    ivec4 operator-(const ivec4& other) const {
+        return vsubq_s32(val, other);
+    }
+    ivec4 operator*(const ivec4& other) const {
+        return vmulq_s32(val, other);
+    }
+    void operator+=(const ivec4& other) {
+        val = vaddq_s32(val, other);
+    }
+    void operator-=(const ivec4& other) {
+        val = vsubq_s32(val, other);
+    }
+    void operator*=(const ivec4& other) {
+        val = vmulq_s32(val, other);
+    }
+    ivec4 operator-() const {
+        return vnegq_s32(val);
+    }
+    ivec4 operator&(const ivec4& other) const { // Tested OK
+        return ivec4(vandq_s32(val, other));
+    }
+    ivec4 operator|(const ivec4& other) const {
+        return ivec4(vorrq_s32(val, other));
+    }
+    ivec4 operator==(const ivec4& other) const {
+        return ivec4(vreinterpretq_s32_u32(vceqq_s32(val, other)));
+    }
+    ivec4 operator!=(const ivec4& other) const { // OK
+        return ivec4(vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(val, other)))); // not(equal(val, other))
+    }
+    ivec4 operator>(const ivec4& other) const {
+        return ivec4(vreinterpretq_s32_u32(vcgtq_s32(val, other)));
+    }
+    ivec4 operator<(const ivec4& other) const {
+        return ivec4(vreinterpretq_s32_u32(vcltq_s32(val, other)));
+    }
+    ivec4 operator>=(const ivec4& other) const {
+        return ivec4(vreinterpretq_s32_u32(vcgeq_s32(val, other)));
+    }
+    ivec4 operator<=(const ivec4& other) const { // OK
+        return ivec4(vreinterpretq_s32_u32(vcleq_s32(val, other)));
+    }
+    operator fvec4() const;
+};
+// Conversion operators.
+inline fvec4::operator ivec4() const {
+    return ivec4(vcvtq_s32_f32(val));
+}
+inline ivec4::operator fvec4() const {
+    return fvec4(vcvtq_f32_s32(val));
+}
+// Functions that operate on fvec4s.
+static inline fvec4 floor(const fvec4& v) { // Tested: OK
+    fvec4 result = v + fvec4(0.5f);
+    result = (fvec4) ((ivec4) result);
+    return result;
+}
+static inline float roundToNearest(float num) {
+    return (num > 0.0f) ? std::floor(num + 0.5f) : std::ceil(num - 0.5f);
+}
+static inline fvec4 round(const fvec4& v) { // Tested: OK - Needs optimization
+    float aux[4];
+    vst1q_f32(aux, v);
+    return fvec4(roundToNearest(aux[0]), roundToNearest(aux[1]), roundToNearest(aux[2]), roundToNearest(aux[3]));
+}
+static inline fvec4 min(const fvec4& v1, const fvec4& v2) { // Tested OK
+    return fvec4(vminq_f32(v1.val, v2.val));
+}
+static inline fvec4 max(const fvec4& v1, const fvec4& v2) { // Tested OK
+    return fvec4(vmaxq_f32(v1.val, v2.val));
+}
+static inline fvec4 abs(const fvec4& v) { // Tested OK
+    return fvec4(vabdq_f32(v.val, fvec4(0.0)));
+}
+static inline fvec4 ceil(const fvec4& v) { // Tested OK
+    ivec4 intVersion = (ivec4) v;
+    fvec4 result = min((fvec4) (v > intVersion), fvec4(1.0f));
+    result += intVersion;
+    return result;
+}
+static inline fvec4 sqrt(const fvec4& v) {
+    float32x4_t recipSqrt = vrsqrteq_f32(v);
+    recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
+    recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
+    return vmulq_f32(v, recipSqrt);
+}
+static inline float dot3(const fvec4& v1, const fvec4& v2) { // Tested: OK
+    fvec4 result = v1 * v2;
+    float aux[4];
+    vst1q_f32(aux, result);
+    return aux[0] + aux[1] + aux[2]; // Ignore w component
+}
+static inline float dot4(const fvec4& v1, const fvec4& v2) { // Tested: OK
+    fvec4 result = v1 * v2;
+    float aux[4];
+    vst1q_f32(aux, result);
+    return aux[0] + aux[1] + aux[2] + aux[3];
+}
+static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) { // Tested: OK
+    float aux1[4];
+    float aux2[4];
+    float aux3[4];
+    float aux4[4];
+    vst1q_f32(aux1, v1);
+    vst1q_f32(aux2, v2);
+    vst1q_f32(aux3, v3);
+    vst1q_f32(aux4, v4);
+    v1 = fvec4(aux1[0], aux2[0], aux3[0], aux4[0]);
+    v2 = fvec4(aux1[1], aux2[1], aux3[1], aux4[1]);
+    v3 = fvec4(aux1[2], aux2[2], aux3[2], aux4[2]);
+    v4 = fvec4(aux1[3], aux2[3], aux3[3], aux4[3]);
+}
+// Functions that operate on ivec4s.
+static inline ivec4 min(const ivec4& v1, const ivec4& v2) { // Tested: not tested
+    ivec4 res = ivec4(vminq_s32(v1.val, v2.val));
+    return res;
+}
+static inline ivec4 max(const ivec4& v1, const ivec4& v2) { // Tested: not tested
+    ivec4 res = ivec4(vmaxq_s32(v1.val, v2.val));
+    return res;
+}
+static inline ivec4 abs(const ivec4& v) { // Tested: Not tested
+    ivec4 res = ivec4(vabdq_s32(v.val, ivec4(0)));
+    return res;
+}
+static inline bool any(const ivec4& v) { // Tested: OK
+    int result[4];
+    vst1q_s32(result, v);
+    return result[0] != 0 || result[1] != 0 || result[2] != 0 || result[3] != 0;
+}
+// Mathematical operators involving a scalar and a vector.
+static inline fvec4 operator+(float v1, const fvec4& v2) {
+    return fvec4(v1)+v2;
+}
+static inline fvec4 operator-(float v1, const fvec4& v2) {
+    return fvec4(v1)-v2;
+}
+static inline fvec4 operator*(float v1, const fvec4& v2) {
+    return fvec4(v1)*v2;
+}
+static inline fvec4 operator/(float v1, const fvec4& v2) {
+    return fvec4(v1)/v2;
+}
+// Operations for blending fvec4s based on an ivec4.
+static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) { //  Tested OK
+    return fvec4(vbslq_f32(vreinterpretq_u32_s32(mask.val), v2, v1));
+}
+#endif /*OPENMM_VECTORIZE_NEON_H_*/
--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
+#ifndef OPENMM_VECTORIZE_SSE_H_
+#define OPENMM_VECTORIZE_SSE_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+#include <smmintrin.h>
+// This file defines classes and functions to simplify vectorizing code with SSE.
+class ivec4;
+/**
+ * A four element vector of floats.
+ */
+class fvec4 {
+public:
+    __m128 val;
+    fvec4() {}
+    fvec4(float v) : val(_mm_set1_ps(v)) {}
+    fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
+    fvec4(__m128 v) : val(v) {}
+    fvec4(const float* v) : val(_mm_loadu_ps(v)) {}
+    operator __m128() const {
+        return val;
+    }
+    float operator[](int i) const {
+        float result[4];
+        store(result);
+        return result[i];
+    }
+    void store(float* v) const {
+        _mm_storeu_ps(v, val);
+    }
+    fvec4 operator+(const fvec4& other) const {
+        return _mm_add_ps(val, other);
+    }
+    fvec4 operator-(const fvec4& other) const {
+        return _mm_sub_ps(val, other);
+    }
+    fvec4 operator*(const fvec4& other) const {
+        return _mm_mul_ps(val, other);
+    }
+    fvec4 operator/(const fvec4& other) const {
+        return _mm_div_ps(val, other);
+    }
+    void operator+=(const fvec4& other) {
+        val = _mm_add_ps(val, other);
+    }
+    void operator-=(const fvec4& other) {
+        val = _mm_sub_ps(val, other);
+    }
+    void operator*=(const fvec4& other) {
+        val = _mm_mul_ps(val, other);
+    }
+    void operator/=(const fvec4& other) {
+        val = _mm_div_ps(val, other);
+    }
+    fvec4 operator-() const {
+        return _mm_sub_ps(_mm_set1_ps(0.0f), val);
+    }
+    fvec4 operator&(const fvec4& other) const {
+        return _mm_and_ps(val, other);
+    }
+    fvec4 operator|(const fvec4& other) const {
+        return _mm_or_ps(val, other);
+    }
+    fvec4 operator==(const fvec4& other) const {
+        return _mm_cmpeq_ps(val, other);
+    }
+    fvec4 operator!=(const fvec4& other) const {
+        return _mm_cmpneq_ps(val, other);
+    }
+    fvec4 operator>(const fvec4& other) const {
+        return _mm_cmpgt_ps(val, other);
+    }
+    fvec4 operator<(const fvec4& other) const {
+        return _mm_cmplt_ps(val, other);
+    }
+    fvec4 operator>=(const fvec4& other) const {
+        return _mm_cmpge_ps(val, other);
+    }
+    fvec4 operator<=(const fvec4& other) const {
+        return _mm_cmple_ps(val, other);
+    }
+    operator ivec4() const;
+};
+/**
+ * A four element vector of ints.
+ */
+class ivec4 {
+public:
+    __m128i val;
+    ivec4() {}
+    ivec4(int v) : val(_mm_set1_epi32(v)) {}
+    ivec4(int v1, int v2, int v3, int v4) : val(_mm_set_epi32(v4, v3, v2, v1)) {}
+    ivec4(__m128i v) : val(v) {}
+    ivec4(const int* v) : val(_mm_loadu_si128((const __m128i*) v)) {}
+    operator __m128i() const {
+        return val;
+    }
+    int operator[](int i) const {
+        int result[4];
+        store(result);
+        return result[i];
+    }
+    void store(int* v) const {
+        _mm_storeu_si128((__m128i*) v, val);
+    }
+    ivec4 operator+(const ivec4& other) const {
+        return _mm_add_epi32(val, other);
+    }
+    ivec4 operator-(const ivec4& other) const {
+        return _mm_sub_epi32(val, other);
+    }
+    ivec4 operator*(const ivec4& other) const {
+        return _mm_mul_epi32(val, other);
+    }
+    void operator+=(const ivec4& other) {
+        val = _mm_add_epi32(val, other);
+    }
+    void operator-=(const ivec4& other) {
+        val = _mm_sub_epi32(val, other);
+    }
+    void operator*=(const ivec4& other) {
+        val = _mm_mul_epi32(val, other);
+    }
+    ivec4 operator-() const {
+        return _mm_sub_epi32(_mm_set1_epi32(0), val);
+    }
+    ivec4 operator&(const ivec4& other) const {
+        return _mm_and_si128(val, other);
+    }
+    ivec4 operator|(const ivec4& other) const {
+        return _mm_or_si128(val, other);
+    }
+    ivec4 operator==(const ivec4& other) const {
+        return _mm_cmpeq_epi32(val, other);
+    }
+    ivec4 operator!=(const ivec4& other) const {
+        return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF));
+    }
+    ivec4 operator>(const ivec4& other) const {
+        return _mm_cmpgt_epi32(val, other);
+    }
+    ivec4 operator<(const ivec4& other) const {
+        return _mm_cmplt_epi32(val, other);
+    }
+    ivec4 operator>=(const ivec4& other) const {
+        return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
+    }
+    ivec4 operator<=(const ivec4& other) const {
+        return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
+    }
+    operator fvec4() const;
+};
+// Conversion operators.
+inline fvec4::operator ivec4() const {
+    return _mm_cvttps_epi32(val);
+}
+inline ivec4::operator fvec4() const {
+    return _mm_cvtepi32_ps(val);
+}
+// Functions that operate on fvec4s.
+static inline fvec4 floor(const fvec4& v) {
+    return fvec4(_mm_floor_ps(v.val));
+}
+static inline fvec4 ceil(const fvec4& v) {
+    return fvec4(_mm_ceil_ps(v.val));
+}
+static inline fvec4 round(const fvec4& v) {
+    return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
+}
+static inline fvec4 min(const fvec4& v1, const fvec4& v2) {
+    return fvec4(_mm_min_ps(v1.val, v2.val));
+}
+static inline fvec4 max(const fvec4& v1, const fvec4& v2) {
+    return fvec4(_mm_max_ps(v1.val, v2.val));
+}
+static inline fvec4 abs(const fvec4& v) {
+    static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
+    return fvec4(_mm_and_ps(v.val, mask));
+}
+static inline fvec4 sqrt(const fvec4& v) {
+    return fvec4(_mm_sqrt_ps(v.val));
+}
+static inline float dot3(const fvec4& v1, const fvec4& v2) {
+    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
+}
+static inline float dot4(const fvec4& v1, const fvec4& v2) {
+    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
+}
+static inline void transpose(fvec4& v1, fvec4& v2, fvec4& v3, fvec4& v4) {
+    _MM_TRANSPOSE4_PS(v1, v2, v3, v4);
+}
+// Functions that operate on ivec4s.
+static inline ivec4 min(const ivec4& v1, const ivec4& v2) {
+    return ivec4(_mm_min_epi32(v1.val, v2.val));
+}
+static inline ivec4 max(const ivec4& v1, const ivec4& v2) {
+    return ivec4(_mm_max_epi32(v1.val, v2.val));
+}
+static inline ivec4 abs(const ivec4& v) {
+    return ivec4(_mm_abs_epi32(v.val));
+}
+static inline bool any(const ivec4& v) {
+    return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF));
+}
+// Mathematical operators involving a scalar and a vector.
+static inline fvec4 operator+(float v1, const fvec4& v2) {
+    return fvec4(v1)+v2;
+}
+static inline fvec4 operator-(float v1, const fvec4& v2) {
+    return fvec4(v1)-v2;
+}
+static inline fvec4 operator*(float v1, const fvec4& v2) {
+    return fvec4(v1)*v2;
+}
+static inline fvec4 operator/(float v1, const fvec4& v2) {
+    return fvec4(v1)/v2;
+}
+// Operations for blending fvec4s based on an ivec4.
+static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) {
+    return fvec4(_mm_blendv_ps(v1.val, v2.val, _mm_castsi128_ps(mask.val)));
+}
+#endif /*OPENMM_VECTORIZE_SSE_H_*/
--- a/platforms/cpu/sharedTarget/CMakeLists.txt
+++ b/platforms/cpu/sharedTarget/CMakeLists.txt
@@ -3,11 +3,15 @@ FOREACH(file ${SOURCE_FILES})
        IF (MSVC)
            SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} /arch:AVX /D__AVX__")
        ELSE (MSVC)
+            IF (NOT ANDROID)
                SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1 -mavx")
+            ENDIF (NOT ANDROID)
        ENDIF (MSVC)
    ELSE (file MATCHES ".*Vec8.*")
        IF (NOT MSVC)
+            IF (NOT ANDROID)
                SET_SOURCE_FILES_PROPERTIES(${file} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -msse4.1")
+            ENDIF (NOT ANDROID)
        ENDIF (NOT MSVC)
    ENDIF (file MATCHES ".*Vec8.*")
 ENDFOREACH(file)

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -37,7 +37,6 @@
 #include <set>
 #include <map>
 #include <cmath>
-#include <smmintrin.h>
 using namespace std;

--- a/platforms/cpu/src/CpuNonbondedForceVec4.cpp
+++ b/platforms/cpu/src/CpuNonbondedForceVec4.cpp
@@ -103,7 +103,7 @@ void CpuNonbondedForceVec4::calculateBlockIxn(int blockIndex, float* forces, dou
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
-                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
                dEdR = switchValue*dEdR - energy*switchDeriv*r;
@@ -214,7 +214,7 @@ void CpuNonbondedForceVec4::calculateBlockEwaldIxn(int blockIndex, float* forces
            dEdR = epsSig6*(12.0f*sig6 - 6.0f);
            energy = epsSig6*(sig6-1.0f);
            if (useSwitch) {
-                fvec4 t = (r>switchingDistance) & ((r-switchingDistance)*invSwitchingInterval);
+                fvec4 t = blend(0.0f, (r-switchingDistance)*invSwitchingInterval, r>switchingDistance);
                fvec4 switchValue = 1+t*t*t*(-10.0f+t*(15.0f-t*6.0f));
                fvec4 switchDeriv = t*t*(-30.0f+t*(60.0f-t*30.0f))*invSwitchingInterval;
                dEdR = switchValue*dEdR - energy*switchDeriv*r;

--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -36,6 +36,7 @@
 #include "ReferenceConstraints.h"
 #include "openmm/internal/hardware.h"
 #include <sstream>
+#include <stdlib.h>
 using namespace OpenMM;
 using namespace std;
@@ -93,8 +94,12 @@ bool CpuPlatform::supportsDoublePrecision() const {
 }
 bool CpuPlatform::isProcessorSupported() {
-    // Make sure the CPU supports SSE 4.1.
+    // Make sure the CPU supports SSE 4.1 or NEON.
+    #ifdef __ANDROID__
+        uint64_t features = android_getCpuFeatures();
+        return (features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
+    #else
        int cpuInfo[4];
        cpuid(cpuInfo, 0);
        if (cpuInfo[0] >= 1) {
@@ -102,6 +107,7 @@ bool CpuPlatform::isProcessorSupported() {
            return ((cpuInfo[2] & ((int) 1 << 19)) != 0);
        }
        return false;
+    #endif
 }
 void CpuPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {

--- a/platforms/opencl/src/OpenCLKernels.cpp
+++ b/platforms/opencl/src/OpenCLKernels.cpp
@@ -4816,7 +4816,7 @@ void OpenCLIntegrateVariableVerletStepKernel::initialize(const System& system, c
    kernel1 = cl::Kernel(program, "integrateVerletPart1");
    kernel2 = cl::Kernel(program, "integrateVerletPart2");
    selectSizeKernel = cl::Kernel(program, "selectVerletStepSize");
-    blockSize = min(min(256, system.getNumParticles()), (int) selectSizeKernel.getInfo<CL_KERNEL_WORK_GROUP_SIZE>());
+    blockSize = min(min(256, system.getNumParticles()), (int) selectSizeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
 }
 double OpenCLIntegrateVariableVerletStepKernel::execute(ContextImpl& context, const VariableVerletIntegrator& integrator, double maxTime) {
@@ -4926,7 +4926,7 @@ void OpenCLIntegrateVariableLangevinStepKernel::initialize(const System& system,
    params = new OpenCLArray(cl, 3, cl.getUseDoublePrecision() || cl.getUseMixedPrecision() ? sizeof(cl_double) : sizeof(cl_float), "langevinParams");
    blockSize = min(256, system.getNumParticles());
    blockSize = max(blockSize, params->getSize());
-    blockSize = min(blockSize, (int) selectSizeKernel.getInfo<CL_KERNEL_WORK_GROUP_SIZE>());
+    blockSize = min(blockSize, (int) selectSizeKernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(cl.getDevice()));
 }
 double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, const VariableLangevinIntegrator& integrator, double maxTime) {

--- a/platforms/opencl/src/OpenCLPlatform.cpp
+++ b/platforms/opencl/src/OpenCLPlatform.cpp
@@ -32,6 +32,7 @@
 #include "openmm/Context.h"
 #include "openmm/System.h"
 #include <algorithm>
+#include <cctype>
 #include <sstream>
 #ifdef __APPLE__
 #include "sys/sysctl.h"
@@ -39,10 +40,7 @@
 using namespace OpenMM;
-using std::map;
+using namespace std;
-using std::string;
-using std::stringstream;
-using std::vector;
 #ifdef OPENMM_OPENCL_BUILDING_STATIC_LIBRARY
 extern "C" void registerOpenCLPlatform() {

--- a/plugins/cpupme/CMakeLists.txt
+++ b/plugins/cpupme/CMakeLists.txt
@@ -57,7 +57,11 @@ ENDFOREACH(subdir)
 INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/src)
 IF (NOT MSVC)
+    IF (ANDROID)
+        SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "")
+    ELSE (ANDROID)
        SET_SOURCE_FILES_PROPERTIES(${SOURCE_FILES} PROPERTIES COMPILE_FLAGS "-msse4.1")
+    ENDIF (ANDROID)
 ENDIF (NOT MSVC)
 # Include FFTW related files.

--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
@@ -569,13 +569,20 @@ double CpuCalcPmeReciprocalForceKernel::finishComputation(IO& io) {
 }
 bool CpuCalcPmeReciprocalForceKernel::isProcessorSupported() {
+    // Make sure the CPU supports SSE 4.1 or NEON.
+    #ifdef __ANDROID__
+        uint64_t features = android_getCpuFeatures();
+        return (features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
+    #else
        int cpuInfo[4];
        cpuid(cpuInfo, 0);
        if (cpuInfo[0] >= 1) {
            cpuid(cpuInfo, 1);
-        return ((cpuInfo[2] & ((int) 1 << 19)) != 0); // Require SSE 4.1
+            return ((cpuInfo[2] & ((int) 1 << 19)) != 0);
        }
        return false;
+    #endif
 }
 int CpuCalcPmeReciprocalForceKernel::findFFTDimension(int minimum) {