Merge branch 'master' into nucleic

fd473eea · Peter Eastman · 0a751b5b · 6a985cfd · fd473eea · fd473eea
Commit fd473eea authored Oct 29, 2015 by Peter Eastman
20 changed files
--- a/.travis.yml
+++ b/.travis.yml
-language: cpp
-compiler:
-  - clang
+language: python
+python:
+  - "2.7_with_system_site_packages"
+  - "3.4"
+
 sudo: false
 addons:
  apt:
@@ -10,16 +12,10 @@ addons:
      - libpcre3
      - libpcre3-dev
      - gromacs
-      - swig
      - doxygen
-      - clang-3.3
-      - llvm-3.3
      - python-numpy
      - python-scipy
-      - python-sphinx
-      - python-yaml
-      - python-pip
-      - python-virtualenv
+

 env:
  matrix:
@@ -29,11 +25,14 @@ env:
 before_install:
  - export CC=clang
  - export CXX=clang++
-  - export ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer-3.3
+
+  - wget https://anaconda.org/anaconda/swig/3.0.2/download/linux-64/swig-3.0.2-0.tar.bz2
+  - mkdir $HOME/swig
+  - tar -xjvf swig-3.0.2-0.tar.bz2 -C $HOME/swig
+  - export PATH=$HOME/swig/bin:$PATH
+  - export SWIG_LIB=$HOME/swig/share/swig/3.0.2

 script:
-  - virtualenv --system-site-packages openmm_env
-  - source openmm_env/bin/activate
  - cmake -DCMAKE_INSTALL_PREFIX=$HOME/OpenMM -DOPENMM_BUILD_STATIC_LIB=$OPENMM_BUILD_STATIC_LIB .
  - make -j2
  - make -j2 install
@@ -41,13 +40,14 @@ script:
  - # Run the testInstallation script
  - python -m simtk.testInstallation
  - # run all of the tests, making sure failures at this stage don't cause travis failures
-  - ctest -j2 -V || true
+  - ctest -j2 || true
  - # get a list of all of the failed tests into this stupid ctest format
-  - python -c 'fn = "Testing/Temporary/LastTestsFailed.log"; import os; os.path.exists(fn) or exit(0); l = [line.split(":")[0] for line in open(fn)]; triplets = zip(l, l, [","]*len(l)); print "".join(",".join(t) for t in triplets)' > FailedTests.log
+  - python -c "import os; fn = os.path.join('Testing', 'Temporary', 'LastTestsFailed.log'); os.path.exists(fn) or exit(0); failed = [line.split(':')[0] for line in open(fn)]; print(','.join(x+','+x for x in failed))" > FailedTests.log
  - # rerun all of the failed tests
-  - if [ -s FailedTests.log ]; then ctest -V -I FailedTests.log; fi;
+  - if [ -s Testing/Temporary/LastTestsFailed.log ]; then ctest -I FailedTests.log; fi;
  - # run the python tests too
  - cd python/tests  
-  - pip install nose
-  - nosetests -vv --processes=-1 --process-timeout=200
+  - # nosetests -vv --processes=-1 --process-timeout=200
+  - # nosetests -vv
+  - py.test -v *
  - cd -
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,9 @@ IF( NOT PROJECT_NAME )
 ENDIF( NOT PROJECT_NAME )

 CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
+if("${CMAKE_VERSION}" VERSION_GREATER "3.0" OR "${CMAKE_VERSION}" VERSION_EQUAL "3.0")
+    CMAKE_POLICY(SET CMP0042 OLD)
+endif()

 #SET(CMAKE_VERBOSE_MAKEFILE 1)

@@ -79,7 +82,7 @@ ENDIF(${CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT})

 # The source is organized into subdirectories, but we handle them all from
 # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
-SET(OPENMM_SOURCE_SUBDIRS . openmmapi olla libraries/jama libraries/quern libraries/lepton libraries/sfmt libraries/lbfgs libraries/hilbert libraries/csha1 platforms/reference serialization libraries/validate libraries/irrxml)
+SET(OPENMM_SOURCE_SUBDIRS . openmmapi olla libraries/jama libraries/quern libraries/lepton libraries/sfmt libraries/lbfgs libraries/hilbert libraries/csha1 platforms/reference serialization libraries/validate libraries/irrxml libraries/vecmath)
 IF(WIN32)
    SET(OPENMM_SOURCE_SUBDIRS ${OPENMM_SOURCE_SUBDIRS} libraries/pthreads)
 ELSE(WIN32)
@@ -121,6 +124,12 @@ IF (APPLE AND (NOT PNACL))
 ELSE (APPLE AND (NOT PNACL))
    IF (MSVC OR ANDROID OR PNACL)
        SET(EXTRA_COMPILE_FLAGS)
+        IF (MSVC)
+            # Use warning level 2, not whatever warning level CMake picked.
+            STRING(REGEX REPLACE "/W[0-4]" "/W2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+            # Explicitly suppress warnings 4305 and 4244.
+            SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4305 /wd4244")
+        ENDIF (MSVC)
    ELSE (MSVC OR ANDROID OR PNACL)
        SET(EXTRA_COMPILE_FLAGS "-msse2")
    ENDIF (MSVC OR ANDROID OR PNACL)
@@ -150,8 +159,8 @@ ENDIF (NOT CMAKE_CXX_FLAGS_RELEASE)
 # and make it available to the code so it can be built into the binaries.

 SET(OPENMM_LIBRARY_NAME OpenMM)
-SET(OPENMM_MAJOR_VERSION 6)
-SET(OPENMM_MINOR_VERSION 3)
+SET(OPENMM_MAJOR_VERSION 7)
+SET(OPENMM_MINOR_VERSION 0)
 SET(OPENMM_BUILD_VERSION 0)

 SET(OPENMM_COPYRIGHT_YEARS "2008-2015")
@@ -287,6 +296,9 @@ IF (CMAKE_SYSTEM_NAME MATCHES "Linux")
        SET(EXTRA_LINK_FLAGS "${EXTRA_LINK_FLAGS} -Wl,--no-as-needed -lrt")
    ENDIF (NOT ANDROID)
 ENDIF (CMAKE_SYSTEM_NAME MATCHES "Linux")
+IF (MSVC)
+    SET(EXTRA_LINK_FLAGS)
+ENDIF (MSVC)

 IF(OPENMM_BUILD_SHARED_LIB)
    ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
@@ -328,6 +340,7 @@ ELSE(DL_LIBRARY)
 ENDIF(DL_LIBRARY)

 IF(BUILD_TESTING)
+    INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/tests)
    ADD_SUBDIRECTORY(platforms/reference/tests)
 ENDIF(BUILD_TESTING)


--- a/appveyor.yml
+++ b/appveyor.yml
+os: Windows Server 2012 R2
+shallow_clone: true
+install:
+
+# Setup shell for VS2010, x64, release mode
+  - >
+    "%ProgramFiles%\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64 /release
+
+# Set path to python, git-bash tools.
+  - "set PATH=C:\\Python34-x64;C:\\Python34-x64\\Scripts;%PATH%"
+  - "set PATH=C:\\Program Files (x86)\\Git\\bin;%PATH%"
+  - pip install nose
+
+# Download FFTW3 for PME plugin
+  - C:\MinGW\msys\1.0\bin\wget -q ftp://ftp.fftw.org/pub/fftw/fftw-3.3.4-dll64.zip
+  - 7z x fftw-3.3.4-dll64.zip -oC:\fftw > null
+  - cd C:\fftw
+  - lib /def:libfftw3f-3.def
+  - cd %APPVEYOR_BUILD_FOLDER%
+  - "set PATH=C:\\fftw;%PATH%"
+
+# Download and install some OpenMM build dependencies (doxygen, swig)
+  - choco install -y doxygen.portable swig
+
+# Set CMake options
+  - ps: $env:CMAKE_FLAGS='-DOPENMM_BUILD_PME_PLUGIN=ON -DFFTW_LIBRARY=C:/fftw/libfftw3f-3.lib -DFFTW_INCLUDES=C:/fftw -DCMAKE_BUILD_TYPE=Release -DOPENMM_BUILD_EXAMPLES=OFF -DCMAKE_CXX_FLAGS_RELEASE="/MD /Od /Ob0 /D NDEBUG"'
+  - mkdir build
+  - cd build
+  - cmake -G "NMake Makefiles" %CMAKE_FLAGS% -LA ..
+
+# Run the build
+  - cmake --build . --target install
+  - cmake --build . --target PythonInstall
+
+build: false
+test_script:
+  - ctest || exit 0
+  - python -c "import os; fn = os.path.join('Testing', 'Temporary', 'LastTestsFailed.log'); os.path.exists(fn) or exit(0); failed = [line.split(':')[0] for line in open(fn)]; print(','.join(x+','+x for x in failed))" > FailedTests.log
+  - ps: >
+     If (Test-Path "Testing\\Temporary\\LastTestsFailed.log") {
+         cat Testing\\Temporary\\LastTestsFailed.log
+         cat FailedTests.log
+         ctest -I FailedTests.log
+     }
+  - cd python\tests
+  - nosetests -vv --processes=-1 --process-timeout=200
+  - cd %APPVEYOR_BUILD_FOLDER%
--- a/devtools/forcefield-scripts/processAmberForceField.py
+++ b/devtools/forcefield-scripts/processAmberForceField.py
@@ -405,8 +405,10 @@ for index, type in enumerate(types):
            sigma = (params[0]/params[1])**(1.0/6.0)
            epsilon = 4.184*params[1]*params[1]/(4*params[0])
    else:
-        sigma = 0
+        sigma = 1
        epsilon = 0
+    if sigma == 0 or epsilon == 0:
+        sigma, epsilon = 1, 0
    if q != 0 or epsilon != 0:
        print """  <Atom type="%d" charge="%s" sigma="%s" epsilon="%s"/>""" % (index, q, sigma, epsilon)
 print " </NonbondedForce>"

--- a/devtools/forcefield-scripts/processTinkerForceField.py
+++ b/devtools/forcefield-scripts/processTinkerForceField.py
@@ -897,7 +897,6 @@ if( isAmoeba ):
    torsionTorsionUnit      = 1.0
    outputString            = """ <AmoebaTorsionTorsionForce >"""
    tinkerXmlFile.write( "%s\n" % (outputString ) )
-    conversion              = 41.84/radian
    torsionTorsions         = forces['tortors']
    for (index, torsionTorsion) in enumerate(torsionTorsions):
       torInfo       = torsionTorsion[0]

--- a/docs-source/developerguide/conf.py
+++ b/docs-source/developerguide/conf.py
@@ -142,7 +142,7 @@ html_static_path = ['_static']
 #html_domain_indices = True

 # If false, no index is generated.
-#html_use_index = True
+html_use_index = False

 # If true, the index is split into individual pages for each letter.
 #html_split_index = False

--- a/docs-source/developerguide/developer.rst
+++ b/docs-source/developerguide/developer.rst
@@ -66,7 +66,7 @@ information related to a particular simulation, and define methods for
 performing calculations.

 Note that, whereas a Force is logically “part of” a System, a ForceImpl is
-logically “part of” a Context.  (See :numref:`Figure,API Relationships`\ .)  If you create many Contexts
+logically “part of” a Context.  (See :autonumref:`Figure,API Relationships`\ .)  If you create many Contexts
 for simulating the same System, there is still only one System and only one copy
 of each Force in it.  But there will be separate ForceImpls for each Context,
 and those ForceImpls store information related to their particular Contexts.

--- a/docs-source/developerguide/index.rst
+++ b/docs-source/developerguide/index.rst
@@ -2,34 +2,9 @@
 OpenMM Developer Guide
 ######################

-Portions copyright (c) 2011-2014 Stanford University and the Authors
-Contributors: Peter Eastman
+.. only:: latex

-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this document (the "Document"), to deal in the Document without restriction,
-including without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Document, and to permit
-persons to whom the Document is furnished to do so, subject to the following
-conditions:
-
-This copyright and permission notice shall be included in all copies or
-substantial portions of the Document.
-
-THE DOCUMENT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS,
-CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE DOCUMENT OR THE USE OR OTHER DEALINGS IN THE
-DOCUMENT.
-
-Acknowledgments
-
-OpenMM software and all related activities, such as this manual, are funded by
-the Simbios National Center for Biomedical Computing through the National
-Institutes of Health Roadmap for Medical Research, Grant U54 GM072970.
-Information on the National Centers can be found at
-http://nihroadmap.nih.gov/bioinformatics.
+   .. include:: license.rst

 .. toctree::
   :maxdepth: 3
@@ -37,3 +12,6 @@ http://nihroadmap.nih.gov/bioinformatics.
   
   developer

+.. only:: html
+
+   .. include:: license.rst
--- a/docs-source/developerguide/license.rst
+++ b/docs-source/developerguide/license.rst
+Portions copyright (c) 2011-2015 Stanford University and the Authors
+
+Contributors: Peter Eastman
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this document (the "Document"), to deal in the Document without restriction,
+including without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Document, and to permit
+persons to whom the Document is furnished to do so, subject to the following
+conditions:
+
+This copyright and permission notice shall be included in all copies or
+substantial portions of the Document.
+
+THE DOCUMENT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS,
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE DOCUMENT OR THE USE OR OTHER DEALINGS IN THE
+DOCUMENT.
+
+Acknowledgments
+
+OpenMM software and all related activities, such as this manual, are funded by
+the Simbios National Center for Biomedical Computing through the National
+Institutes of Health Roadmap for Medical Research, Grant U54 GM072970.
+Information on the National Centers can be found at
+http://nihroadmap.nih.gov/bioinformatics.
--- a/docs-source/usersguide/application.rst
+++ b/docs-source/usersguide/application.rst
@@ -675,11 +675,17 @@ Platforms

 When creating a :class:`Simulation`, you can optionally tell it what :class:`Platform` to use.
 OpenMM includes four platforms: :class:`Reference`, :class:`CPU`, :class:`CUDA`, and :class:`OpenCL`.  For a
-description of the differences between them, see Section :ref:`platforms`.  If you do not
-specify a :class:`Platform`, it will select one automatically.  Usually its choice will
-be reasonable, but you may want to change it.
+description of the differences between them, see Section :ref:`platforms`.  There are three ways in which
+the :class:`Platform` can be chosen:

-The following lines specify to use the :class:`CUDA` platform:
+1. By default, OpenMM will try to select the fastest available :class:`Platform`.  Usually its choice will
+be reasonable, but sometimes you may want to change it.
+
+2. Alternatively, you can set the :envvar:`OPENMM_DEFAULT_PLATFORM` environment variable to the name
+of the :class:`Platform` to use.  This overrides the default logic.
+
+3. Finally, you can explicitly specify a :class:`Platform` object in your script when you create the
+:class:`Simulation`.  The following lines specify to use the :class:`CUDA` platform:
 ::

    platform = Platform.getPlatformByName('CUDA')
@@ -1837,7 +1843,7 @@ The :code:`<ForceField>` tag contains the following children:
 * Zero or more tags defining specific forces


-The order of these tags does not matter.  They are described in details below.
+The order of these tags does not matter.  They are described in detail below.

 <AtomTypes>
 ===========
@@ -1921,9 +1927,9 @@ as in the following example:

 Each :code:`<VirtualSite>` tag indicates an atom in the residue that should
 be represented with a virtual site.  The :code:`type` attribute may equal
-:code:`"average2"`\ , :code:`"average3"`\ , or :code:`"outOfPlane"`\ , which
-correspond to the TwoParticleAverageSite, ThreeParticleAverageSite, and
-OutOfPlaneSite classes respectively.  The :code:`index` attribute gives the
+:code:`"average2"`\ , :code:`"average3"`\ , :code:`"outOfPlane"`\ , or
+:code:`"localCoords"`\ , which correspond to the TwoParticleAverageSite, ThreeParticleAverageSite,
+OutOfPlaneSite, and LocalCoordinatesSite classes respectively.  The :code:`index` attribute gives the
 index (starting from 0) of the atom to represent with a virtual site.  The atoms
 it is calculated based on are specified by :code:`atom1`\ , :code:`atom2`\ ,
 and (for virtual site classes that involve three atoms) :code:`atom3`\ .  The
@@ -1932,7 +1938,10 @@ parameters for calculating the site position.  For a TwoParticleAverageSite,
 they are :code:`weight1` and :code:`weight2`\ .  For a
 ThreeParticleAverageSite, they are :code:`weight1`\ , :code:`weight2`\ , and
 \ :code:`weight3`\ . For an OutOfPlaneSite, they are :code:`weight12`\ ,
-:code:`weight13`\ , and :code:`weightCross`\ .
+:code:`weight13`\ , and :code:`weightCross`\ . For a LocalCoordinatesSite, they
+are :code:`wo1`\ , :code:`wo2`\ , :code:`wo3`\ , :code:`wx1`\ , :code:`wx2`\ ,
+:code:`wx3`\ , :code:`wy1`\ , :code:`wy2`\ , :code:`wy3`\ , :code:`p1`\ ,
+:code:`p2`\ , and :code:`p3`\ .


 <HarmonicBondForce>
@@ -2490,8 +2499,9 @@ The following operators are supported: + (add), - (subtract), * (multiply), /

 The following standard functions are supported: sqrt, exp, log, sin, cos, sec,
 csc, tan, cot, asin, acos, atan, sinh, cosh, tanh, erf, erfc, min, max, abs,
-step. step(x) = 0 if x < 0, 1 otherwise.  Some custom forces allow additional
-functions to be defined from tabulated values.
+floor, ceil, step, delta, select. step(x) = 0 if x < 0, 1 otherwise.
+delta(x) = 1 if x is 0, 0 otherwise.  select(x,y,z) = z if x = 0, y otherwise.
+Some custom forces allow additional functions to be defined from tabulated values.

 Numbers may be given in either decimal or exponential form.  All of the
 following are valid numbers: 5, -3.1, 1e6, and 3.12e-2.
@@ -2515,8 +2525,8 @@ values.  All uses of a value must appear *before* that value’s definition.

 .. _tabulated-functions:

-TabulatedFunctions
-==================
+Tabulated Functions
+===================

 Some forces, such as CustomNonbondedForce and CustomGBForce, allow you to define
 tabulated functions.  To define a function, include a :code:`<Function>` tag inside the
@@ -2563,6 +2573,47 @@ successive values separated by white space.  See the API documentation for more
 details.


+Residue Template Parameters
+===========================
+
+In forces that use an :code:`<Atom>` tag to define parameters for atom types or
+classes, there is an alternate mechanism you can also use: defining those
+parameter values in the residue template.  This is useful for situations that
+come up in certain force fields.  For example, :code:`NonbondedForce` and
+:code:`GBSAOBCForce` each have a :code:`charge` attribute.  If you only have to
+define the charge of each atom type once, that is more convenient and avoids
+potential bugs.  Also, many force fields have a different charge for each atom
+type, but Lennard-Jones parameters that are the same for all types in a class.
+It would be preferable not to have to repeat those parameter values many times
+over.
+
+When writing a residue template, you can add arbitrary additional attributes
+to each :code:`<Atom>` tag.  For example, you might include a :code:`charge`
+attribute as follows:
+
+.. code-block:: xml
+
+   <Atom name="CA" type="53" charge="0.0381"/>
+
+When writing the tag for a force, you can then include a
+:code:`<UseAttributeFromResidue>` tag inside it.  This indicates that a
+specified attribute should be taken from the residue template.  Finally, you
+simply omit that attribute in the force's own :code:`<Atom>` tags.  For example:
+
+.. code-block:: xml
+
+    <NonbondedForce coulomb14scale="0.833333" lj14scale="0.5">
+     <UseAttributeFromResidue name="charge"/>
+     <Atom class="CX" sigma="0.339966950842" epsilon="0.4577296"/>
+     ...
+    </NonbondedForce>
+
+Notice that the :code:`charge` attribute is missing, and that the parameters
+are specified by class, not by type.  This means that sigma and epsilon only
+need to be specified once for each class.  The atom charges, which are different
+for each type, are taken from the residue template instead.
+
+
 Using Multiple Files
 ********************


--- a/docs-source/usersguide/conf.py
+++ b/docs-source/usersguide/conf.py
@@ -142,7 +142,7 @@ html_static_path = ['_static']
 #html_domain_indices = True

 # If false, no index is generated.
-#html_use_index = True
+html_use_index = False

 # If true, the index is split into individual pages for each letter.
 #html_split_index = False

--- a/docs-source/usersguide/index.rst
+++ b/docs-source/usersguide/index.rst
@@ -4,31 +4,9 @@
 OpenMM Users Manual and Theory Guide
 ####################################

+.. only:: latex

-Portions copyright (c) 2008-2014 Stanford University and the Authors
-
-Contributors:  Kyle Beauchamp, Christopher Bruns, John Chodera, Peter Eastman, Mark
-Friedrichs, Joy P. Ku, Tom Markland, Vijay Pande, Randy Radmer, Michael Sherman,
-Jason Swails, Lee-Ping Wang
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this document (the "Document"), to deal in the Document without restriction,
-including without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Document, and to permit
-persons to whom the Document is furnished to do so, subject to the following
-conditions:
-
-This copyright and permission notice shall be included in all copies or
-substantial portions of the Document.
-
-THE DOCUMENT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
-FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS,
-CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE DOCUMENT OR THE USE OR OTHER DEALINGS IN THE
-DOCUMENT.
-
+   .. include:: license.rst

 .. toctree::
   :numbered:
@@ -40,3 +18,6 @@ DOCUMENT.
   theory
   zbibliography

+.. only:: html
+
+   .. include:: license.rst
--- a/docs-source/usersguide/license.rst
+++ b/docs-source/usersguide/license.rst
+Portions copyright (c) 2008-2015 Stanford University and the Authors
+
+Contributors:  Kyle Beauchamp, Christopher Bruns, John Chodera, Peter Eastman, Mark
+Friedrichs, Joy P. Ku, Tom Markland, Vijay Pande, Randy Radmer, Michael Sherman,
+Jason Swails, Lee-Ping Wang
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this document (the "Document"), to deal in the Document without restriction,
+including without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Document, and to permit
+persons to whom the Document is furnished to do so, subject to the following
+conditions:
+
+This copyright and permission notice shall be included in all copies or
+substantial portions of the Document.
+
+THE DOCUMENT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS,
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE DOCUMENT OR THE USE OR OTHER DEALINGS IN THE
+DOCUMENT.
--- a/docs-source/usersguide/theory.rst
+++ b/docs-source/usersguide/theory.rst
@@ -861,6 +861,28 @@ Parameters may be specified in two ways:
 * Per-bond parameters are defined by specifying a value for each bond.


+CustomCentroidBondForce
+***********************
+
+CustomCentroidBondForce is very similar to CustomCompoundBondForce, but instead
+of creating bonds between individual particles, the bonds are between the
+centers of groups of particles.  This is useful for purposes such as restraining
+the distance between two molecules or pinning the center of mass of a single
+molecule.
+
+The first step in computing this force is to calculate the center position of
+each defined group of particles.  This is calculated as a weighted average of
+the positions of all the particles in the group, with the weights being user
+defined.  The computation then proceeds exactly as with CustomCompoundBondForce,
+but the energy of each "bond" is now calculated based on the centers of a set
+of groups, rather than on the positions of individual particles.
+
+This class supports all the same function types and features as
+CustomCompoundBondForce.  In fact, any interaction that could be implemented
+with CustomCompoundBondForce can also be implemented with this class, simply by
+defining each group to contain only a single atom.
+
+
 CustomManyParticleForce
 ***********************


--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -28,7 +28,7 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
        SET_TARGET_PROPERTIES(${EX_ROOT}
            PROPERTIES
            PROJECT_LABEL "Example - ${EX_ROOT}"
-            LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+            LINK_FLAGS "${EXTRA_LINK_FLAGS}"
            COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
        TARGET_LINK_LIBRARIES(${EX_ROOT} ${SHARED_TARGET})
    ENDIF (OPENMM_BUILD_SHARED_LIB)
@@ -40,7 +40,7 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
        SET_TARGET_PROPERTIES(${EX_STATIC}
            PROPERTIES
            PROJECT_LABEL "Example - ${EX_STATIC}"
-            LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+            LINK_FLAGS "${EXTRA_LINK_FLAGS}"
            COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES")
        TARGET_LINK_LIBRARIES(${EX_STATIC} ${STATIC_TARGET})
    ENDIF (OPENMM_BUILD_STATIC_LIB)
@@ -62,7 +62,7 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
            SET_TARGET_PROPERTIES(${EX_ROOT}
                PROPERTIES
                PROJECT_LABEL "Example C - ${EX_ROOT}"
-                LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+                LINK_FLAGS "${EXTRA_LINK_FLAGS}"
                COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
            TARGET_LINK_LIBRARIES(${EX_ROOT} ${SHARED_TARGET})
            ADD_DEPENDENCIES(${EX_ROOT} ApiWrappers)
@@ -77,7 +77,7 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
            SET_TARGET_PROPERTIES(${EX_STATIC}
                PROPERTIES
                PROJECT_LABEL "Example C - ${EX_STATIC}"
-                LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
+                LINK_FLAGS "${EXTRA_LINK_FLAGS}"
                COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES")
            TARGET_LINK_LIBRARIES(${EX_STATIC} ${STATIC_TARGET})
            ADD_DEPENDENCIES(${EX_STATIC} ApiWrappers)

--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -52,7 +52,11 @@ def runOneTest(testName, options):
            cutoff = 2.0*unit.nanometers
            vdwCutoff = 1.2*unit.nanometers
            system = ff.createSystem(pdb.topology, nonbondedMethod=app.NoCutoff, constraints=constraints, mutualInducedTargetEpsilon=epsilon, polarization=polarization)
-        dt = 0.001*unit.picoseconds
+        for f in system.getForces():
+            if isinstance(f, mm.AmoebaMultipoleForce) or isinstance(f, mm.AmoebaVdwForce) or isinstance(f, mm.AmoebaGeneralizedKirkwoodForce) or isinstance(f, mm.AmoebaWcaDispersionForce):
+                f.setForceGroup(1)
+        dt = 0.002*unit.picoseconds
+        integ = mm.MTSIntegrator(dt, [(0,2), (1,1)])
    else:
        if explicit:
            ff = app.ForceField('amber99sb.xml', 'tip3p.xml')
@@ -77,6 +81,7 @@ def runOneTest(testName, options):
            constraints = app.HBonds
            hydrogenMass = None
        system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass)
+        integ = mm.LangevinIntegrator(300*unit.kelvin, 91*(1/unit.picoseconds), dt)
    print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds))
    properties = {}
    initialSteps = 5
@@ -95,7 +100,6 @@ def runOneTest(testName, options):
    
    # Run the simulation.
    
-    integ = mm.LangevinIntegrator(300*unit.kelvin, 91*(1/unit.picoseconds), dt)
    integ.setConstraintTolerance(1e-5)
    if len(properties) > 0:
        context = mm.Context(system, integ, platform, properties)

--- a/examples/simulateGromacs.py
+++ b/examples/simulateGromacs.py
@@ -4,7 +4,7 @@ from simtk.unit import *
 from sys import stdout

 gro = GromacsGroFile('input.gro')
-top = GromacsTopFile('input.top', periodicBoxVectors=gro.getPeriodicBoxVectors(), includeDir='/usr/local/gromacs/share/gromacs/top')
+top = GromacsTopFile('input.top', periodicBoxVectors=gro.getPeriodicBoxVectors())
 system = top.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer, constraints=HBonds)
 integrator = LangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
 simulation = Simulation(top.topology, system, integrator)

--- a/libraries/sfmt/src/SFMT.cpp
+++ b/libraries/sfmt/src/SFMT.cpp
@@ -124,11 +124,13 @@ public:
 };

 void SFMT::createCheckpoint(std::ostream& stream) {
+    stream.write((char*) &data->baseData, sizeof(data->baseData));
    stream.write((char*) &data->sfmt, sizeof(data->sfmt));
    stream.write((char*) &data->idx, sizeof(data->idx));
 }

 void SFMT::loadCheckpoint(std::istream& stream) {
+    stream.read((char*) &data->baseData, sizeof(data->baseData));
    stream.read((char*) &data->sfmt, sizeof(data->sfmt));
    stream.read((char*) &data->idx, sizeof(data->idx));
 }

--- a/libraries/vecmath/include/neon_mathfun.h
+++ b/libraries/vecmath/include/neon_mathfun.h
+/* NEON implementation of sin, cos, exp and log
+
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+*/
+
+/* Copyright (C) 2011  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <arm_neon.h>
+#include "openmm/internal/windowsExport.h"
+
+typedef float32x4_t v4sf;  // vector of 4 float
+typedef uint32x4_t v4su;  // vector of 4 uint32
+typedef int32x4_t v4si;  // vector of 4 uint32
+
+#define c_inv_mant_mask ~0x7f800000u
+#define c_cephes_SQRTHF 0.707106781186547524
+#define c_cephes_log_p0 7.0376836292E-2
+#define c_cephes_log_p1 - 1.1514610310E-1
+#define c_cephes_log_p2 1.1676998740E-1
+#define c_cephes_log_p3 - 1.2420140846E-1
+#define c_cephes_log_p4 + 1.4249322787E-1
+#define c_cephes_log_p5 - 1.6668057665E-1
+#define c_cephes_log_p6 + 2.0000714765E-1
+#define c_cephes_log_p7 - 2.4999993993E-1
+#define c_cephes_log_p8 + 3.3333331174E-1
+#define c_cephes_log_q1 -2.12194440e-4
+#define c_cephes_log_q2 0.693359375
+
+/* natural logarithm computed for 4 simultaneous float 
+   return NaN for x <= 0
+*/
+OPENMM_EXPORT v4sf log_ps(v4sf x) {
+  v4sf one = vdupq_n_f32(1);
+
+  x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
+  v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
+
+  v4si ux = vreinterpretq_s32_f32(x);
+  
+  v4si emm0 = vshrq_n_s32(ux, 23);
+
+  /* keep only the fractional part */
+  ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
+  ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
+  x = vreinterpretq_f32_s32(ux);
+
+  emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
+  v4sf e = vcvtq_f32_s32(emm0);
+
+  e = vaddq_f32(e, one);
+
+  /* part2: 
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
+  v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
+  x = vsubq_f32(x, one);
+  e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
+  x = vaddq_f32(x, tmp);
+
+  v4sf z = vmulq_f32(x,x);
+
+  v4sf y = vdupq_n_f32(c_cephes_log_p0);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
+  y = vmulq_f32(y, x);
+
+  y = vmulq_f32(y, z);
+  
+
+  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
+  y = vaddq_f32(y, tmp);
+
+
+  tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
+  y = vsubq_f32(y, tmp);
+
+  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
+  x = vaddq_f32(x, y);
+  x = vaddq_f32(x, tmp);
+  x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
+  return x;
+}
+
+#define c_exp_hi 88.3762626647949f
+#define c_exp_lo -88.3762626647949f
+
+#define c_cephes_LOG2EF 1.44269504088896341
+#define c_cephes_exp_C1 0.693359375
+#define c_cephes_exp_C2 -2.12194440e-4
+
+#define c_cephes_exp_p0 1.9875691500E-4
+#define c_cephes_exp_p1 1.3981999507E-3
+#define c_cephes_exp_p2 8.3334519073E-3
+#define c_cephes_exp_p3 4.1665795894E-2
+#define c_cephes_exp_p4 1.6666665459E-1
+#define c_cephes_exp_p5 5.0000001201E-1
+
+/* exp() computed for 4 float at once */
+OPENMM_EXPORT v4sf exp_ps(v4sf x) {
+  v4sf tmp, fx;
+
+  v4sf one = vdupq_n_f32(1);
+  x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
+  x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
+
+  /* perform a floorf */
+  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+
+  /* if greater, substract 1 */
+  v4su mask = vcgtq_f32(tmp, fx);    
+  mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
+
+
+  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+
+  tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
+  v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
+  x = vsubq_f32(x, tmp);
+  x = vsubq_f32(x, z);
+
+  static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
+  v4sf y = vld1q_dup_f32(cephes_exp_p+0);
+  v4sf c1 = vld1q_dup_f32(cephes_exp_p+1); 
+  v4sf c2 = vld1q_dup_f32(cephes_exp_p+2); 
+  v4sf c3 = vld1q_dup_f32(cephes_exp_p+3); 
+  v4sf c4 = vld1q_dup_f32(cephes_exp_p+4); 
+  v4sf c5 = vld1q_dup_f32(cephes_exp_p+5);
+
+  y = vmulq_f32(y, x);
+  z = vmulq_f32(x,x);
+  y = vaddq_f32(y, c1);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c2);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c3);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c4);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c5);
+  
+  y = vmulq_f32(y, z);
+  y = vaddq_f32(y, x);
+  y = vaddq_f32(y, one);
+
+  /* build 2^n */
+  int32x4_t mm;
+  mm = vcvtq_s32_f32(fx);
+  mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
+  mm = vshlq_n_s32(mm, 23);
+  v4sf pow2n = vreinterpretq_f32_s32(mm);
+
+  y = vmulq_f32(y, pow2n);
+  return y;
+}
+
+#define c_minus_cephes_DP1 -0.78515625
+#define c_minus_cephes_DP2 -2.4187564849853515625e-4
+#define c_minus_cephes_DP3 -3.77489497744594108e-8
+#define c_sincof_p0 -1.9515295891E-4
+#define c_sincof_p1  8.3321608736E-3
+#define c_sincof_p2 -1.6666654611E-1
+#define c_coscof_p0  2.443315711809948E-005
+#define c_coscof_p1 -1.388731625493765E-003
+#define c_coscof_p2  4.166664568298827E-002
+#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
+
+/* evaluation of 4 sines & cosines at once.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Note also that when you compute sin(x), cos(x) is available at
+   almost no extra price so both sin_ps and cos_ps make use of
+   sincos_ps..
+  */
+OPENMM_EXPORT void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x
+  v4sf xmm1, xmm2, xmm3, y;
+
+  v4su emm2;
+  
+  v4su sign_mask_sin, sign_mask_cos;
+  sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
+  x = vabsq_f32(x);
+
+  /* scale by 4/Pi */
+  y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
+
+  /* store the integer part of y in mm0 */
+  emm2 = vcvtq_u32_f32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
+  emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
+  y = vcvtq_f32_u32(emm2);
+
+  /* get the polynom selection mask 
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  v4su poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));
+  
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
+  xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
+  xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
+  x = vaddq_f32(x, xmm1);
+  x = vaddq_f32(x, xmm2);
+  x = vaddq_f32(x, xmm3);
+
+  sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
+  sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) in y1, 
+     and the second polynom      (Pi/4 <= x <= 0) in y2 */
+  v4sf z = vmulq_f32(x,x);
+  v4sf y1, y2;
+
+  y1 = vmulq_n_f32(z, c_coscof_p0);
+  y2 = vmulq_n_f32(z, c_sincof_p0);
+  y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
+  y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, z);
+  y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
+  y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, z);
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, x);
+  y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
+  y2 = vaddq_f32(y2, x);
+  y1 = vaddq_f32(y1, vdupq_n_f32(1));
+
+  /* select the correct result from the two polynoms */  
+  v4sf ys = vbslq_f32(poly_mask, y1, y2);
+  v4sf yc = vbslq_f32(poly_mask, y2, y1);
+  *ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
+  *ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
+}
+
+OPENMM_EXPORT v4sf sin_ps(v4sf x) {
+  v4sf ysin, ycos; 
+  sincos_ps(x, &ysin, &ycos); 
+  return ysin;
+}
+
+OPENMM_EXPORT v4sf cos_ps(v4sf x) {
+  v4sf ysin, ycos; 
+  sincos_ps(x, &ysin, &ycos); 
+  return ycos;
+}
+
+
--- a/libraries/vecmath/include/sse_mathfun.h
+++ b/libraries/vecmath/include/sse_mathfun.h
+/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
+
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+
+   The default is to use the SSE1 version. If you define USE_SSE2 the
+   the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
+   not expect any significant performance improvement with SSE2.
+*/
+
+/* Copyright (C) 2007  Julien Pommier
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  (this is the zlib license)
+*/
+
+#include <xmmintrin.h>
+#include "openmm/internal/windowsExport.h"
+
+/* yes I know, the top of this file is quite ugly */
+
+#ifdef _MSC_VER /* visual c++ */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END 
+#else /* gcc or icc */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+/* __m128 is ugly to write */
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+
+#ifdef USE_SSE2
+# include <emmintrin.h>
+typedef __m128i v4si; // vector of 4 int (sse2)
+#else
+typedef __m64 v2si;   // vector of 2 int (mmx)
+#endif
+
+/* declare some SSE constants -- why can't I figure a better way to do that? */
+#define _PS_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PI32_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val)                                 \
+  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+
+_PS_CONST(1  , 1.0f);
+_PS_CONST(0p5, 0.5f);
+/* the smallest non denormalized float number */
+_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
+_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
+
+_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f);
+
+_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
+_PS_CONST(cephes_log_p0, 7.0376836292E-2);
+_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
+_PS_CONST(cephes_log_p2, 1.1676998740E-1);
+_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
+_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
+_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
+_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
+_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
+_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
+_PS_CONST(cephes_log_q1, -2.12194440e-4);
+_PS_CONST(cephes_log_q2, 0.693359375);
+
+#ifndef USE_SSE2
+typedef union xmm_mm_union {
+  __m128 xmm;
+  __m64 mm[2];
+} xmm_mm_union;
+
+#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) {          \
+    xmm_mm_union u; u.xmm = xmm_;                   \
+    mm0_ = u.mm[0];                                 \
+    mm1_ = u.mm[1];                                 \
+}
+
+#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) {                         \
+    xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm;      \
+  }
+
+#endif // USE_SSE2
+
+/* natural logarithm computed for 4 simultaneous float 
+   return NaN for x <= 0
+*/
+OPENMM_EXPORT v4sf log_ps(v4sf x) {
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+
+  v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
+
+  x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos);  /* cut off denormalized stuff */
+
+#ifndef USE_SSE2
+  /* part 1: x = frexpf(x, &e); */
+  COPY_XMM_TO_MM(x, mm0, mm1);
+  mm0 = _mm_srli_pi32(mm0, 23);
+  mm1 = _mm_srli_pi32(mm1, 23);
+#else
+  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
+#endif
+  /* keep only the fractional part */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
+  x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
+
+#ifndef USE_SSE2
+  /* now e=mm0:mm1 contain the really base-2 exponent */
+  mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
+  v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
+  _mm_empty(); /* bye bye mmx */
+#else
+  emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
+  v4sf e = _mm_cvtepi32_ps(emm0);
+#endif
+
+  e = _mm_add_ps(e, one);
+
+  /* part2: 
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
+  v4sf tmp = _mm_and_ps(x, mask);
+  x = _mm_sub_ps(x, one);
+  e = _mm_sub_ps(e, _mm_and_ps(one, mask));
+  x = _mm_add_ps(x, tmp);
+
+
+  v4sf z = _mm_mul_ps(x,x);
+
+  v4sf y = *(v4sf*)_ps_cephes_log_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
+  y = _mm_mul_ps(y, x);
+
+  y = _mm_mul_ps(y, z);
+  
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
+  y = _mm_add_ps(y, tmp);
+
+
+  tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+
+  tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
+  x = _mm_add_ps(x, y);
+  x = _mm_add_ps(x, tmp);
+  x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
+  return x;
+}
+
+_PS_CONST(exp_hi,	88.3762626647949f);
+_PS_CONST(exp_lo,	-88.3762626647949f);
+
+_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
+_PS_CONST(cephes_exp_C1, 0.693359375);
+_PS_CONST(cephes_exp_C2, -2.12194440e-4);
+
+_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
+_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
+_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
+_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
+_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
+_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
+
+OPENMM_EXPORT v4sf exp_ps(v4sf x) {
+  v4sf tmp = _mm_setzero_ps(), fx;
+#ifdef USE_SSE2
+  v4si emm0;
+#else
+  v2si mm0, mm1;
+#endif
+  v4sf one = *(v4sf*)_ps_1;
+
+  x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
+  x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
+  fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
+
+  /* how to perform a floorf with SSE: just below */
+#ifndef USE_SSE2
+  /* step 1 : cast to int */
+  tmp = _mm_movehl_ps(tmp, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(tmp);
+  /* step 2 : cast back to float */
+  tmp = _mm_cvtpi32x2_ps(mm0, mm1);
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  tmp  = _mm_cvtepi32_ps(emm0);
+#endif
+  /* if greater, substract 1 */
+  v4sf mask = _mm_cmpgt_ps(tmp, fx);    
+  mask = _mm_and_ps(mask, one);
+  fx = _mm_sub_ps(tmp, mask);
+
+  tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
+  v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
+  x = _mm_sub_ps(x, tmp);
+  x = _mm_sub_ps(x, z);
+
+  z = _mm_mul_ps(x,x);
+  
+  v4sf y = *(v4sf*)_ps_cephes_exp_p0;
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
+  y = _mm_mul_ps(y, x);
+  y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, x);
+  y = _mm_add_ps(y, one);
+
+  /* build 2^n */
+#ifndef USE_SSE2
+  z = _mm_movehl_ps(z, fx);
+  mm0 = _mm_cvttps_pi32(fx);
+  mm1 = _mm_cvttps_pi32(z);
+  mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
+  mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
+  mm0 = _mm_slli_pi32(mm0, 23); 
+  mm1 = _mm_slli_pi32(mm1, 23);
+  
+  v4sf pow2n; 
+  COPY_MM_TO_XMM(mm0, mm1, pow2n);
+  _mm_empty();
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
+  emm0 = _mm_slli_epi32(emm0, 23);
+  v4sf pow2n = _mm_castsi128_ps(emm0);
+#endif
+  y = _mm_mul_ps(y, pow2n);
+  return y;
+}
+
+_PS_CONST(minus_cephes_DP1, -0.78515625);
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+_PS_CONST(sincof_p0, -1.9515295891E-4);
+_PS_CONST(sincof_p1,  8.3321608736E-3);
+_PS_CONST(sincof_p2, -1.6666654611E-1);
+_PS_CONST(coscof_p0,  2.443315711809948E-005);
+_PS_CONST(coscof_p1, -1.388731625493765E-003);
+_PS_CONST(coscof_p2,  4.166664568298827E-002);
+_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
+
+
+/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
+   it runs also on old athlons XPs and the pentium III of your grand
+   mother.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+
+   Performance is also surprisingly good, 1.33 times faster than the
+   macos vsinf SSE2 function, and 1.5 times faster than the
+   __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
+   too bad for an SSE1 function (with no special tuning) !
+   However the latter libraries probably have a much better handling of NaN,
+   Inf, denormalized and other special arguments..
+
+   On my core 1 duo, the execution of this function takes approximately 95 cycles.
+
+   From what I have observed on the experiments with Intel AMath lib, switching to an
+   SSE2 version would improve the perf by only 10%.
+
+   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
+   deliver full speed.
+*/
+OPENMM_EXPORT v4sf sin_ps(v4sf x) { // any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
+
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  sign_bit = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+
+#ifdef USE_SSE2
+  /* store the integer part of y in mm0 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  /* get the swap sign flag */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask 
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+  
+#else
+  /* store the integer part of y in mm0:mm1 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+  /* get the swap sign flag */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  /* get the polynom selection mask */
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf swap_sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */  
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+  return y;
+}
+
+/* almost the same as sin_ps */
+OPENMM_EXPORT v4sf cos_ps(v4sf x) { // any x
+  v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2;
+#else
+  v2si mm0, mm1, mm2, mm3;
+#endif
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+  
+#ifdef USE_SSE2
+  /* store the integer part of y in mm0 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
+  
+  /* get the swap sign flag */
+  emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask */
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  
+  v4sf sign_bit = _mm_castsi128_ps(emm0);
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm0:mm1 */
+  xmm2 = _mm_movehl_ps(xmm2, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm2);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+
+  mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
+
+  /* get the swap sign flag in mm0:mm1 and the 
+     polynom selection mask in mm2:mm3 */
+
+  mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+
+  v4sf sign_bit, poly_mask;
+  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+  _mm_empty(); /* good-bye mmx */
+#endif
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+  
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = *(v4sf*)_ps_coscof_p0;
+  v4sf z = _mm_mul_ps(x,x);
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */  
+  xmm3 = poly_mask;
+  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+  y = _mm_andnot_ps(xmm3, y);
+  y = _mm_add_ps(y,y2);
+  /* update the sign */
+  y = _mm_xor_ps(y, sign_bit);
+
+  return y;
+}
+
+/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
+   it is almost as fast, and gives you a free cosine with your sine */
+OPENMM_EXPORT void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
+  v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
+#ifdef USE_SSE2
+  v4si emm0, emm2, emm4;
+#else
+  v2si mm0, mm1, mm2, mm3, mm4, mm5;
+#endif
+  sign_bit_sin = x;
+  /* take the absolute value */
+  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
+  /* extract the sign bit (upper one) */
+  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
+  
+  /* scale by 4/Pi */
+  y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+    
+#ifdef USE_SSE2
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm4 = emm2;
+
+  /* get the swap sign flag for the sine */
+  emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+  /* get the polynom selection mask for the sine*/
+  emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+  v4sf poly_mask = _mm_castsi128_ps(emm2);
+#else
+  /* store the integer part of y in mm2:mm3 */
+  xmm3 = _mm_movehl_ps(xmm3, y);
+  mm2 = _mm_cvttps_pi32(y);
+  mm3 = _mm_cvttps_pi32(xmm3);
+
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
+  mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
+
+  y = _mm_cvtpi32x2_ps(mm2, mm3);
+
+  mm4 = mm2;
+  mm5 = mm3;
+
+  /* get the swap sign flag for the sine */
+  mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
+  mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
+  mm0 = _mm_slli_pi32(mm0, 29);
+  mm1 = _mm_slli_pi32(mm1, 29);
+  v4sf swap_sign_bit_sin;
+  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
+
+  /* get the polynom selection mask for the sine */
+
+  mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
+  mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
+  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
+  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
+  v4sf poly_mask;
+  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
+#endif
+
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
+  xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
+  xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
+  xmm1 = _mm_mul_ps(y, xmm1);
+  xmm2 = _mm_mul_ps(y, xmm2);
+  xmm3 = _mm_mul_ps(y, xmm3);
+  x = _mm_add_ps(x, xmm1);
+  x = _mm_add_ps(x, xmm2);
+  x = _mm_add_ps(x, xmm3);
+
+#ifdef USE_SSE2
+  emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
+  emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
+  emm4 = _mm_slli_epi32(emm4, 29);
+  v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
+#else
+  /* get the sign flag for the cosine */
+  mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
+  mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
+  mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
+  mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
+  mm4 = _mm_slli_pi32(mm4, 29);
+  mm5 = _mm_slli_pi32(mm5, 29);
+  v4sf sign_bit_cos;
+  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
+  _mm_empty(); /* good-bye mmx */
+#endif
+
+  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+  
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  v4sf z = _mm_mul_ps(x,x);
+  y = *(v4sf*)_ps_coscof_p0;
+
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
+  y = _mm_mul_ps(y, z);
+  y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
+  y = _mm_mul_ps(y, z);
+  y = _mm_mul_ps(y, z);
+  v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+  y = _mm_sub_ps(y, tmp);
+  y = _mm_add_ps(y, *(v4sf*)_ps_1);
+  
+  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
+
+  v4sf y2 = *(v4sf*)_ps_sincof_p0;
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+  y2 = _mm_mul_ps(y2, z);
+  y2 = _mm_mul_ps(y2, x);
+  y2 = _mm_add_ps(y2, x);
+
+  /* select the correct result from the two polynoms */  
+  xmm3 = poly_mask;
+  v4sf ysin2 = _mm_and_ps(xmm3, y2);
+  v4sf ysin1 = _mm_andnot_ps(xmm3, y);
+  y2 = _mm_sub_ps(y2,ysin2);
+  y = _mm_sub_ps(y, ysin1);
+
+  xmm1 = _mm_add_ps(ysin1,ysin2);
+  xmm2 = _mm_add_ps(y,y2);
+ 
+  /* update the sign */
+  *s = _mm_xor_ps(xmm1, sign_bit_sin);
+  *c = _mm_xor_ps(xmm2, sign_bit_cos);
+}
+