Commit fd473eea authored by Peter Eastman's avatar Peter Eastman
Browse files

Merge branch 'master' into nucleic

parents 0a751b5b 6a985cfd
language: cpp
compiler:
- clang
language: python
python:
- "2.7_with_system_site_packages"
- "3.4"
sudo: false
addons:
apt:
......@@ -10,16 +12,10 @@ addons:
- libpcre3
- libpcre3-dev
- gromacs
- swig
- doxygen
- clang-3.3
- llvm-3.3
- python-numpy
- python-scipy
- python-sphinx
- python-yaml
- python-pip
- python-virtualenv
env:
matrix:
......@@ -29,11 +25,14 @@ env:
before_install:
- export CC=clang
- export CXX=clang++
- export ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer-3.3
- wget https://anaconda.org/anaconda/swig/3.0.2/download/linux-64/swig-3.0.2-0.tar.bz2
- mkdir $HOME/swig
- tar -xjvf swig-3.0.2-0.tar.bz2 -C $HOME/swig
- export PATH=$HOME/swig/bin:$PATH
- export SWIG_LIB=$HOME/swig/share/swig/3.0.2
script:
- virtualenv --system-site-packages openmm_env
- source openmm_env/bin/activate
- cmake -DCMAKE_INSTALL_PREFIX=$HOME/OpenMM -DOPENMM_BUILD_STATIC_LIB=$OPENMM_BUILD_STATIC_LIB .
- make -j2
- make -j2 install
......@@ -41,13 +40,14 @@ script:
- # Run the testInstallation script
- python -m simtk.testInstallation
- # run all of the tests, making sure failures at this stage don't cause travis failures
- ctest -j2 -V || true
- ctest -j2 || true
- # get a list of all of the failed tests into this stupid ctest format
- python -c 'fn = "Testing/Temporary/LastTestsFailed.log"; import os; os.path.exists(fn) or exit(0); l = [line.split(":")[0] for line in open(fn)]; triplets = zip(l, l, [","]*len(l)); print "".join(",".join(t) for t in triplets)' > FailedTests.log
- python -c "import os; fn = os.path.join('Testing', 'Temporary', 'LastTestsFailed.log'); os.path.exists(fn) or exit(0); failed = [line.split(':')[0] for line in open(fn)]; print(','.join(x+','+x for x in failed))" > FailedTests.log
- # rerun all of the failed tests
- if [ -s FailedTests.log ]; then ctest -V -I FailedTests.log; fi;
- if [ -s Testing/Temporary/LastTestsFailed.log ]; then ctest -I FailedTests.log; fi;
- # run the python tests too
- cd python/tests
- pip install nose
- nosetests -vv --processes=-1 --process-timeout=200
- # nosetests -vv --processes=-1 --process-timeout=200
- # nosetests -vv
- py.test -v *
- cd -
......@@ -22,6 +22,9 @@ IF( NOT PROJECT_NAME )
ENDIF( NOT PROJECT_NAME )
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
if("${CMAKE_VERSION}" VERSION_GREATER "3.0" OR "${CMAKE_VERSION}" VERSION_EQUAL "3.0")
CMAKE_POLICY(SET CMP0042 OLD)
endif()
#SET(CMAKE_VERBOSE_MAKEFILE 1)
......@@ -79,7 +82,7 @@ ENDIF(${CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT})
# The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
SET(OPENMM_SOURCE_SUBDIRS . openmmapi olla libraries/jama libraries/quern libraries/lepton libraries/sfmt libraries/lbfgs libraries/hilbert libraries/csha1 platforms/reference serialization libraries/validate libraries/irrxml)
SET(OPENMM_SOURCE_SUBDIRS . openmmapi olla libraries/jama libraries/quern libraries/lepton libraries/sfmt libraries/lbfgs libraries/hilbert libraries/csha1 platforms/reference serialization libraries/validate libraries/irrxml libraries/vecmath)
IF(WIN32)
SET(OPENMM_SOURCE_SUBDIRS ${OPENMM_SOURCE_SUBDIRS} libraries/pthreads)
ELSE(WIN32)
......@@ -121,6 +124,12 @@ IF (APPLE AND (NOT PNACL))
ELSE (APPLE AND (NOT PNACL))
IF (MSVC OR ANDROID OR PNACL)
SET(EXTRA_COMPILE_FLAGS)
IF (MSVC)
# Use warning level 2, not whatever warning level CMake picked.
STRING(REGEX REPLACE "/W[0-4]" "/W2" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
# Explicitly suppress warnings 4305 and 4244.
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4305 /wd4244")
ENDIF (MSVC)
ELSE (MSVC OR ANDROID OR PNACL)
SET(EXTRA_COMPILE_FLAGS "-msse2")
ENDIF (MSVC OR ANDROID OR PNACL)
......@@ -150,8 +159,8 @@ ENDIF (NOT CMAKE_CXX_FLAGS_RELEASE)
# and make it available to the code so it can be built into the binaries.
SET(OPENMM_LIBRARY_NAME OpenMM)
SET(OPENMM_MAJOR_VERSION 6)
SET(OPENMM_MINOR_VERSION 3)
SET(OPENMM_MAJOR_VERSION 7)
SET(OPENMM_MINOR_VERSION 0)
SET(OPENMM_BUILD_VERSION 0)
SET(OPENMM_COPYRIGHT_YEARS "2008-2015")
......@@ -287,6 +296,9 @@ IF (CMAKE_SYSTEM_NAME MATCHES "Linux")
SET(EXTRA_LINK_FLAGS "${EXTRA_LINK_FLAGS} -Wl,--no-as-needed -lrt")
ENDIF (NOT ANDROID)
ENDIF (CMAKE_SYSTEM_NAME MATCHES "Linux")
IF (MSVC)
SET(EXTRA_LINK_FLAGS)
ENDIF (MSVC)
IF(OPENMM_BUILD_SHARED_LIB)
ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
......@@ -328,6 +340,7 @@ ELSE(DL_LIBRARY)
ENDIF(DL_LIBRARY)
IF(BUILD_TESTING)
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/tests)
ADD_SUBDIRECTORY(platforms/reference/tests)
ENDIF(BUILD_TESTING)
......
os: Windows Server 2012 R2
shallow_clone: true
install:
# Setup shell for VS2010, x64, release mode
- >
"%ProgramFiles%\Microsoft SDKs\Windows\v7.1\Bin\SetEnv.cmd" /x64 /release
# Set path to python, git-bash tools.
- "set PATH=C:\\Python34-x64;C:\\Python34-x64\\Scripts;%PATH%"
- "set PATH=C:\\Program Files (x86)\\Git\\bin;%PATH%"
- pip install nose
# Download FFTW3 for PME plugin
- C:\MinGW\msys\1.0\bin\wget -q ftp://ftp.fftw.org/pub/fftw/fftw-3.3.4-dll64.zip
- 7z x fftw-3.3.4-dll64.zip -oC:\fftw > null
- cd C:\fftw
- lib /def:libfftw3f-3.def
- cd %APPVEYOR_BUILD_FOLDER%
- "set PATH=C:\\fftw;%PATH%"
# Download and install some OpenMM build dependencies (doxygen, swig)
- choco install -y doxygen.portable swig
# Set CMake options
- ps: $env:CMAKE_FLAGS='-DOPENMM_BUILD_PME_PLUGIN=ON -DFFTW_LIBRARY=C:/fftw/libfftw3f-3.lib -DFFTW_INCLUDES=C:/fftw -DCMAKE_BUILD_TYPE=Release -DOPENMM_BUILD_EXAMPLES=OFF -DCMAKE_CXX_FLAGS_RELEASE="/MD /Od /Ob0 /D NDEBUG"'
- mkdir build
- cd build
- cmake -G "NMake Makefiles" %CMAKE_FLAGS% -LA ..
# Run the build
- cmake --build . --target install
- cmake --build . --target PythonInstall
build: false
test_script:
- ctest || exit 0
- python -c "import os; fn = os.path.join('Testing', 'Temporary', 'LastTestsFailed.log'); os.path.exists(fn) or exit(0); failed = [line.split(':')[0] for line in open(fn)]; print(','.join(x+','+x for x in failed))" > FailedTests.log
- ps: >
If (Test-Path "Testing\\Temporary\\LastTestsFailed.log") {
cat Testing\\Temporary\\LastTestsFailed.log
cat FailedTests.log
ctest -I FailedTests.log
}
- cd python\tests
- nosetests -vv --processes=-1 --process-timeout=200
- cd %APPVEYOR_BUILD_FOLDER%
......@@ -405,8 +405,10 @@ for index, type in enumerate(types):
sigma = (params[0]/params[1])**(1.0/6.0)
epsilon = 4.184*params[1]*params[1]/(4*params[0])
else:
sigma = 0
sigma = 1
epsilon = 0
if sigma == 0 or epsilon == 0:
sigma, epsilon = 1, 0
if q != 0 or epsilon != 0:
print """ <Atom type="%d" charge="%s" sigma="%s" epsilon="%s"/>""" % (index, q, sigma, epsilon)
print " </NonbondedForce>"
......
......@@ -897,7 +897,6 @@ if( isAmoeba ):
torsionTorsionUnit = 1.0
outputString = """ <AmoebaTorsionTorsionForce >"""
tinkerXmlFile.write( "%s\n" % (outputString ) )
conversion = 41.84/radian
torsionTorsions = forces['tortors']
for (index, torsionTorsion) in enumerate(torsionTorsions):
torInfo = torsionTorsion[0]
......
......@@ -142,7 +142,7 @@ html_static_path = ['_static']
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
html_use_index = False
# If true, the index is split into individual pages for each letter.
#html_split_index = False
......
......@@ -66,7 +66,7 @@ information related to a particular simulation, and define methods for
performing calculations.
Note that, whereas a Force is logically “part of” a System, a ForceImpl is
logically “part of” a Context. (See :numref:`Figure,API Relationships`\ .) If you create many Contexts
logically “part of” a Context. (See :autonumref:`Figure,API Relationships`\ .) If you create many Contexts
for simulating the same System, there is still only one System and only one copy
of each Force in it. But there will be separate ForceImpls for each Context,
and those ForceImpls store information related to their particular Contexts.
......
......@@ -2,34 +2,9 @@
OpenMM Developer Guide
######################
Portions copyright (c) 2011-2014 Stanford University and the Authors
Contributors: Peter Eastman
.. only:: latex
Permission is hereby granted, free of charge, to any person obtaining a copy of
this document (the "Document"), to deal in the Document without restriction,
including without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Document, and to permit
persons to whom the Document is furnished to do so, subject to the following
conditions:
This copyright and permission notice shall be included in all copies or
substantial portions of the Document.
THE DOCUMENT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS,
CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE DOCUMENT OR THE USE OR OTHER DEALINGS IN THE
DOCUMENT.
Acknowledgments
OpenMM software and all related activities, such as this manual, are funded by
the Simbios National Center for Biomedical Computing through the National
Institutes of Health Roadmap for Medical Research, Grant U54 GM072970.
Information on the National Centers can be found at
http://nihroadmap.nih.gov/bioinformatics.
.. include:: license.rst
.. toctree::
:maxdepth: 3
......@@ -37,3 +12,6 @@ http://nihroadmap.nih.gov/bioinformatics.
developer
.. only:: html
.. include:: license.rst
Portions copyright (c) 2011-2015 Stanford University and the Authors
Contributors: Peter Eastman
Permission is hereby granted, free of charge, to any person obtaining a copy of
this document (the "Document"), to deal in the Document without restriction,
including without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Document, and to permit
persons to whom the Document is furnished to do so, subject to the following
conditions:
This copyright and permission notice shall be included in all copies or
substantial portions of the Document.
THE DOCUMENT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS,
CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE DOCUMENT OR THE USE OR OTHER DEALINGS IN THE
DOCUMENT.
Acknowledgments
OpenMM software and all related activities, such as this manual, are funded by
the Simbios National Center for Biomedical Computing through the National
Institutes of Health Roadmap for Medical Research, Grant U54 GM072970.
Information on the National Centers can be found at
http://nihroadmap.nih.gov/bioinformatics.
......@@ -675,11 +675,17 @@ Platforms
When creating a :class:`Simulation`, you can optionally tell it what :class:`Platform` to use.
OpenMM includes four platforms: :class:`Reference`, :class:`CPU`, :class:`CUDA`, and :class:`OpenCL`. For a
description of the differences between them, see Section :ref:`platforms`. If you do not
specify a :class:`Platform`, it will select one automatically. Usually its choice will
be reasonable, but you may want to change it.
description of the differences between them, see Section :ref:`platforms`. There are three ways in which
the :class:`Platform` can be chosen:
The following lines specify to use the :class:`CUDA` platform:
1. By default, OpenMM will try to select the fastest available :class:`Platform`. Usually its choice will
be reasonable, but sometimes you may want to change it.
2. Alternatively, you can set the :envvar:`OPENMM_DEFAULT_PLATFORM` environment variable to the name
of the :class:`Platform` to use. This overrides the default logic.
3. Finally, you can explicitly specify a :class:`Platform` object in your script when you create the
:class:`Simulation`. The following lines specify to use the :class:`CUDA` platform:
::
platform = Platform.getPlatformByName('CUDA')
......@@ -1837,7 +1843,7 @@ The :code:`<ForceField>` tag contains the following children:
* Zero or more tags defining specific forces
The order of these tags does not matter. They are described in details below.
The order of these tags does not matter. They are described in detail below.
<AtomTypes>
===========
......@@ -1921,9 +1927,9 @@ as in the following example:
Each :code:`<VirtualSite>` tag indicates an atom in the residue that should
be represented with a virtual site. The :code:`type` attribute may equal
:code:`"average2"`\ , :code:`"average3"`\ , or :code:`"outOfPlane"`\ , which
correspond to the TwoParticleAverageSite, ThreeParticleAverageSite, and
OutOfPlaneSite classes respectively. The :code:`index` attribute gives the
:code:`"average2"`\ , :code:`"average3"`\ , :code:`"outOfPlane"`\ , or
:code:`"localCoords"`\ , which correspond to the TwoParticleAverageSite, ThreeParticleAverageSite,
OutOfPlaneSite, and LocalCoordinatesSite classes respectively. The :code:`index` attribute gives the
index (starting from 0) of the atom to represent with a virtual site. The atoms
it is calculated based on are specified by :code:`atom1`\ , :code:`atom2`\ ,
and (for virtual site classes that involve three atoms) :code:`atom3`\ . The
......@@ -1932,7 +1938,10 @@ parameters for calculating the site position. For a TwoParticleAverageSite,
they are :code:`weight1` and :code:`weight2`\ . For a
ThreeParticleAverageSite, they are :code:`weight1`\ , :code:`weight2`\ , and
\ :code:`weight3`\ . For an OutOfPlaneSite, they are :code:`weight12`\ ,
:code:`weight13`\ , and :code:`weightCross`\ .
:code:`weight13`\ , and :code:`weightCross`\ . For a LocalCoordinatesSite, they
are :code:`wo1`\ , :code:`wo2`\ , :code:`wo3`\ , :code:`wx1`\ , :code:`wx2`\ ,
:code:`wx3`\ , :code:`wy1`\ , :code:`wy2`\ , :code:`wy3`\ , :code:`p1`\ ,
:code:`p2`\ , and :code:`p3`\ .
<HarmonicBondForce>
......@@ -2490,8 +2499,9 @@ The following operators are supported: + (add), - (subtract), * (multiply), /
The following standard functions are supported: sqrt, exp, log, sin, cos, sec,
csc, tan, cot, asin, acos, atan, sinh, cosh, tanh, erf, erfc, min, max, abs,
step. step(x) = 0 if x < 0, 1 otherwise. Some custom forces allow additional
functions to be defined from tabulated values.
floor, ceil, step, delta, select. step(x) = 0 if x < 0, 1 otherwise.
delta(x) = 1 if x is 0, 0 otherwise. select(x,y,z) = z if x = 0, y otherwise.
Some custom forces allow additional functions to be defined from tabulated values.
Numbers may be given in either decimal or exponential form. All of the
following are valid numbers: 5, -3.1, 1e6, and 3.12e-2.
......@@ -2515,8 +2525,8 @@ values. All uses of a value must appear *before* that value’s definition.
.. _tabulated-functions:
TabulatedFunctions
==================
Tabulated Functions
===================
Some forces, such as CustomNonbondedForce and CustomGBForce, allow you to define
tabulated functions. To define a function, include a :code:`<Function>` tag inside the
......@@ -2563,6 +2573,47 @@ successive values separated by white space. See the API documentation for more
details.
Residue Template Parameters
===========================
In forces that use an :code:`<Atom>` tag to define parameters for atom types or
classes, there is an alternate mechanism you can also use: defining those
parameter values in the residue template. This is useful for situations that
come up in certain force fields. For example, :code:`NonbondedForce` and
:code:`GBSAOBCForce` each have a :code:`charge` attribute. If you only have to
define the charge of each atom type once, that is more convenient and avoids
potential bugs. Also, many force fields have a different charge for each atom
type, but Lennard-Jones parameters that are the same for all types in a class.
It would be preferable not to have to repeat those parameter values many times
over.
When writing a residue template, you can add arbitrary additional attributes
to each :code:`<Atom>` tag. For example, you might include a :code:`charge`
attribute as follows:
.. code-block:: xml
<Atom name="CA" type="53" charge="0.0381"/>
When writing the tag for a force, you can then include a
:code:`<UseAttributeFromResidue>` tag inside it. This indicates that a
specified attribute should be taken from the residue template. Finally, you
simply omit that attribute in the force's own :code:`<Atom>` tags. For example:
.. code-block:: xml
<NonbondedForce coulomb14scale="0.833333" lj14scale="0.5">
<UseAttributeFromResidue name="charge"/>
<Atom class="CX" sigma="0.339966950842" epsilon="0.4577296"/>
...
</NonbondedForce>
Notice that the :code:`charge` attribute is missing, and that the parameters
are specified by class, not by type. This means that sigma and epsilon only
need to be specified once for each class. The atom charges, which are different
for each type, are taken from the residue template instead.
Using Multiple Files
********************
......
......@@ -142,7 +142,7 @@ html_static_path = ['_static']
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
html_use_index = False
# If true, the index is split into individual pages for each letter.
#html_split_index = False
......
......@@ -4,31 +4,9 @@
OpenMM Users Manual and Theory Guide
####################################
.. only:: latex
Portions copyright (c) 2008-2014 Stanford University and the Authors
Contributors: Kyle Beauchamp, Christopher Bruns, John Chodera, Peter Eastman, Mark
Friedrichs, Joy P. Ku, Tom Markland, Vijay Pande, Randy Radmer, Michael Sherman,
Jason Swails, Lee-Ping Wang
Permission is hereby granted, free of charge, to any person obtaining a copy of
this document (the "Document"), to deal in the Document without restriction,
including without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Document, and to permit
persons to whom the Document is furnished to do so, subject to the following
conditions:
This copyright and permission notice shall be included in all copies or
substantial portions of the Document.
THE DOCUMENT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS,
CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE DOCUMENT OR THE USE OR OTHER DEALINGS IN THE
DOCUMENT.
.. include:: license.rst
.. toctree::
:numbered:
......@@ -40,3 +18,6 @@ DOCUMENT.
theory
zbibliography
.. only:: html
.. include:: license.rst
Portions copyright (c) 2008-2015 Stanford University and the Authors
Contributors: Kyle Beauchamp, Christopher Bruns, John Chodera, Peter Eastman, Mark
Friedrichs, Joy P. Ku, Tom Markland, Vijay Pande, Randy Radmer, Michael Sherman,
Jason Swails, Lee-Ping Wang
Permission is hereby granted, free of charge, to any person obtaining a copy of
this document (the "Document"), to deal in the Document without restriction,
including without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Document, and to permit
persons to whom the Document is furnished to do so, subject to the following
conditions:
This copyright and permission notice shall be included in all copies or
substantial portions of the Document.
THE DOCUMENT IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS,
CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE DOCUMENT OR THE USE OR OTHER DEALINGS IN THE
DOCUMENT.
......@@ -861,6 +861,28 @@ Parameters may be specified in two ways:
* Per-bond parameters are defined by specifying a value for each bond.
CustomCentroidBondForce
***********************
CustomCentroidBondForce is very similar to CustomCompoundBondForce, but instead
of creating bonds between individual particles, the bonds are between the
centers of groups of particles. This is useful for purposes such as restraining
the distance between two molecules or pinning the center of mass of a single
molecule.
The first step in computing this force is to calculate the center position of
each defined group of particles. This is calculated as a weighted average of
the positions of all the particles in the group, with the weights being user
defined. The computation then proceeds exactly as with CustomCompoundBondForce,
but the energy of each "bond" is now calculated based on the centers of a set
of groups, rather than on the positions of individual particles.
This class supports all the same function types and features as
CustomCompoundBondForce. In fact, any interaction that could be implemented
with CustomCompoundBondForce can also be implemented with this class, simply by
defining each group to contain only a single atom.
CustomManyParticleForce
***********************
......
......@@ -28,7 +28,7 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
SET_TARGET_PROPERTIES(${EX_ROOT}
PROPERTIES
PROJECT_LABEL "Example - ${EX_ROOT}"
LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
LINK_FLAGS "${EXTRA_LINK_FLAGS}"
COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
TARGET_LINK_LIBRARIES(${EX_ROOT} ${SHARED_TARGET})
ENDIF (OPENMM_BUILD_SHARED_LIB)
......@@ -40,7 +40,7 @@ FOREACH(EX_ROOT ${CPP_EXAMPLES})
SET_TARGET_PROPERTIES(${EX_STATIC}
PROPERTIES
PROJECT_LABEL "Example - ${EX_STATIC}"
LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
LINK_FLAGS "${EXTRA_LINK_FLAGS}"
COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES")
TARGET_LINK_LIBRARIES(${EX_STATIC} ${STATIC_TARGET})
ENDIF (OPENMM_BUILD_STATIC_LIB)
......@@ -62,7 +62,7 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
SET_TARGET_PROPERTIES(${EX_ROOT}
PROPERTIES
PROJECT_LABEL "Example C - ${EX_ROOT}"
LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
LINK_FLAGS "${EXTRA_LINK_FLAGS}"
COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS}")
TARGET_LINK_LIBRARIES(${EX_ROOT} ${SHARED_TARGET})
ADD_DEPENDENCIES(${EX_ROOT} ApiWrappers)
......@@ -77,7 +77,7 @@ IF(OPENMM_BUILD_C_AND_FORTRAN_WRAPPERS)
SET_TARGET_PROPERTIES(${EX_STATIC}
PROPERTIES
PROJECT_LABEL "Example C - ${EX_STATIC}"
LINK_FLAGS "${EXTRA_COMPILE_FLAGS}"
LINK_FLAGS "${EXTRA_LINK_FLAGS}"
COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_USE_STATIC_LIBRARIES")
TARGET_LINK_LIBRARIES(${EX_STATIC} ${STATIC_TARGET})
ADD_DEPENDENCIES(${EX_STATIC} ApiWrappers)
......
......@@ -52,7 +52,11 @@ def runOneTest(testName, options):
cutoff = 2.0*unit.nanometers
vdwCutoff = 1.2*unit.nanometers
system = ff.createSystem(pdb.topology, nonbondedMethod=app.NoCutoff, constraints=constraints, mutualInducedTargetEpsilon=epsilon, polarization=polarization)
dt = 0.001*unit.picoseconds
for f in system.getForces():
if isinstance(f, mm.AmoebaMultipoleForce) or isinstance(f, mm.AmoebaVdwForce) or isinstance(f, mm.AmoebaGeneralizedKirkwoodForce) or isinstance(f, mm.AmoebaWcaDispersionForce):
f.setForceGroup(1)
dt = 0.002*unit.picoseconds
integ = mm.MTSIntegrator(dt, [(0,2), (1,1)])
else:
if explicit:
ff = app.ForceField('amber99sb.xml', 'tip3p.xml')
......@@ -77,6 +81,7 @@ def runOneTest(testName, options):
constraints = app.HBonds
hydrogenMass = None
system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass)
integ = mm.LangevinIntegrator(300*unit.kelvin, 91*(1/unit.picoseconds), dt)
print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds))
properties = {}
initialSteps = 5
......@@ -95,7 +100,6 @@ def runOneTest(testName, options):
# Run the simulation.
integ = mm.LangevinIntegrator(300*unit.kelvin, 91*(1/unit.picoseconds), dt)
integ.setConstraintTolerance(1e-5)
if len(properties) > 0:
context = mm.Context(system, integ, platform, properties)
......
......@@ -4,7 +4,7 @@ from simtk.unit import *
from sys import stdout
gro = GromacsGroFile('input.gro')
top = GromacsTopFile('input.top', periodicBoxVectors=gro.getPeriodicBoxVectors(), includeDir='/usr/local/gromacs/share/gromacs/top')
top = GromacsTopFile('input.top', periodicBoxVectors=gro.getPeriodicBoxVectors())
system = top.createSystem(nonbondedMethod=PME, nonbondedCutoff=1*nanometer, constraints=HBonds)
integrator = LangevinIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
simulation = Simulation(top.topology, system, integrator)
......
......@@ -124,11 +124,13 @@ public:
};
void SFMT::createCheckpoint(std::ostream& stream) {
stream.write((char*) &data->baseData, sizeof(data->baseData));
stream.write((char*) &data->sfmt, sizeof(data->sfmt));
stream.write((char*) &data->idx, sizeof(data->idx));
}
void SFMT::loadCheckpoint(std::istream& stream) {
stream.read((char*) &data->baseData, sizeof(data->baseData));
stream.read((char*) &data->sfmt, sizeof(data->sfmt));
stream.read((char*) &data->idx, sizeof(data->idx));
}
......
/* NEON implementation of sin, cos, exp and log
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
*/
/* Copyright (C) 2011 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#include <arm_neon.h>
#include "openmm/internal/windowsExport.h"
typedef float32x4_t v4sf; // vector of 4 float
typedef uint32x4_t v4su; // vector of 4 uint32
typedef int32x4_t v4si; // vector of 4 uint32
#define c_inv_mant_mask ~0x7f800000u
#define c_cephes_SQRTHF 0.707106781186547524
#define c_cephes_log_p0 7.0376836292E-2
#define c_cephes_log_p1 - 1.1514610310E-1
#define c_cephes_log_p2 1.1676998740E-1
#define c_cephes_log_p3 - 1.2420140846E-1
#define c_cephes_log_p4 + 1.4249322787E-1
#define c_cephes_log_p5 - 1.6668057665E-1
#define c_cephes_log_p6 + 2.0000714765E-1
#define c_cephes_log_p7 - 2.4999993993E-1
#define c_cephes_log_p8 + 3.3333331174E-1
#define c_cephes_log_q1 -2.12194440e-4
#define c_cephes_log_q2 0.693359375
/* natural logarithm computed for 4 simultaneous float
return NaN for x <= 0
*/
OPENMM_EXPORT v4sf log_ps(v4sf x) {
v4sf one = vdupq_n_f32(1);
x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
v4si ux = vreinterpretq_s32_f32(x);
v4si emm0 = vshrq_n_s32(ux, 23);
/* keep only the fractional part */
ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
x = vreinterpretq_f32_s32(ux);
emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
v4sf e = vcvtq_f32_s32(emm0);
e = vaddq_f32(e, one);
/* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
x = vsubq_f32(x, one);
e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
x = vaddq_f32(x, tmp);
v4sf z = vmulq_f32(x,x);
v4sf y = vdupq_n_f32(c_cephes_log_p0);
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
y = vmulq_f32(y, x);
y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
y = vmulq_f32(y, x);
y = vmulq_f32(y, z);
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
y = vaddq_f32(y, tmp);
tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
y = vsubq_f32(y, tmp);
tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
x = vaddq_f32(x, y);
x = vaddq_f32(x, tmp);
x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
return x;
}
#define c_exp_hi 88.3762626647949f
#define c_exp_lo -88.3762626647949f
#define c_cephes_LOG2EF 1.44269504088896341
#define c_cephes_exp_C1 0.693359375
#define c_cephes_exp_C2 -2.12194440e-4
#define c_cephes_exp_p0 1.9875691500E-4
#define c_cephes_exp_p1 1.3981999507E-3
#define c_cephes_exp_p2 8.3334519073E-3
#define c_cephes_exp_p3 4.1665795894E-2
#define c_cephes_exp_p4 1.6666665459E-1
#define c_cephes_exp_p5 5.0000001201E-1
/* exp() computed for 4 float at once */
OPENMM_EXPORT v4sf exp_ps(v4sf x) {
v4sf tmp, fx;
v4sf one = vdupq_n_f32(1);
x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
/* express exp(x) as exp(g + n*log(2)) */
fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
/* perform a floorf */
tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
/* if greater, substract 1 */
v4su mask = vcgtq_f32(tmp, fx);
mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
x = vsubq_f32(x, tmp);
x = vsubq_f32(x, z);
static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
v4sf y = vld1q_dup_f32(cephes_exp_p+0);
v4sf c1 = vld1q_dup_f32(cephes_exp_p+1);
v4sf c2 = vld1q_dup_f32(cephes_exp_p+2);
v4sf c3 = vld1q_dup_f32(cephes_exp_p+3);
v4sf c4 = vld1q_dup_f32(cephes_exp_p+4);
v4sf c5 = vld1q_dup_f32(cephes_exp_p+5);
y = vmulq_f32(y, x);
z = vmulq_f32(x,x);
y = vaddq_f32(y, c1);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c2);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c3);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c4);
y = vmulq_f32(y, x);
y = vaddq_f32(y, c5);
y = vmulq_f32(y, z);
y = vaddq_f32(y, x);
y = vaddq_f32(y, one);
/* build 2^n */
int32x4_t mm;
mm = vcvtq_s32_f32(fx);
mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
mm = vshlq_n_s32(mm, 23);
v4sf pow2n = vreinterpretq_f32_s32(mm);
y = vmulq_f32(y, pow2n);
return y;
}
#define c_minus_cephes_DP1 -0.78515625
#define c_minus_cephes_DP2 -2.4187564849853515625e-4
#define c_minus_cephes_DP3 -3.77489497744594108e-8
#define c_sincof_p0 -1.9515295891E-4
#define c_sincof_p1 8.3321608736E-3
#define c_sincof_p2 -1.6666654611E-1
#define c_coscof_p0 2.443315711809948E-005
#define c_coscof_p1 -1.388731625493765E-003
#define c_coscof_p2 4.166664568298827E-002
#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
/* evaluation of 4 sines & cosines at once.
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
take into account the special handling they have for greater values
-- it does not return garbage for arguments over 8192, though, but
the extra precision is missing).
Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
surprising but correct result.
Note also that when you compute sin(x), cos(x) is available at
almost no extra price so both sin_ps and cos_ps make use of
sincos_ps..
*/
OPENMM_EXPORT void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x
v4sf xmm1, xmm2, xmm3, y;
v4su emm2;
v4su sign_mask_sin, sign_mask_cos;
sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
x = vabsq_f32(x);
/* scale by 4/Pi */
y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
/* store the integer part of y in mm0 */
emm2 = vcvtq_u32_f32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
y = vcvtq_f32_u32(emm2);
/* get the polynom selection mask
there is one polynom for 0 <= x <= Pi/4
and another one for Pi/4<x<=Pi/2
Both branches will be computed.
*/
v4su poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
x = vaddq_f32(x, xmm1);
x = vaddq_f32(x, xmm2);
x = vaddq_f32(x, xmm3);
sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
/* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
and the second polynom (Pi/4 <= x <= 0) in y2 */
v4sf z = vmulq_f32(x,x);
v4sf y1, y2;
y1 = vmulq_n_f32(z, c_coscof_p0);
y2 = vmulq_n_f32(z, c_sincof_p0);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, z);
y1 = vmulq_f32(y1, z);
y2 = vmulq_f32(y2, x);
y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
y2 = vaddq_f32(y2, x);
y1 = vaddq_f32(y1, vdupq_n_f32(1));
/* select the correct result from the two polynoms */
v4sf ys = vbslq_f32(poly_mask, y1, y2);
v4sf yc = vbslq_f32(poly_mask, y2, y1);
*ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
*ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
}
OPENMM_EXPORT v4sf sin_ps(v4sf x) {
v4sf ysin, ycos;
sincos_ps(x, &ysin, &ycos);
return ysin;
}
OPENMM_EXPORT v4sf cos_ps(v4sf x) {
v4sf ysin, ycos;
sincos_ps(x, &ysin, &ycos);
return ycos;
}
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
The default is to use the SSE1 version. If you define USE_SSE2 the
the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
not expect any significant performance improvement with SSE2.
*/
/* Copyright (C) 2007 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#include <xmmintrin.h>
#include "openmm/internal/windowsExport.h"
/* yes I know, the top of this file is quite ugly */
#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif
/* __m128 is ugly to write */
typedef __m128 v4sf; // vector of 4 float (sse1)
#ifdef USE_SSE2
# include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)
#else
typedef __m64 v2si; // vector of 2 int (mmx)
#endif
/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
_PS_CONST(1 , 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);
_PS_CONST(cephes_SQRTHF, 0.707106781186547524);
_PS_CONST(cephes_log_p0, 7.0376836292E-2);
_PS_CONST(cephes_log_p1, - 1.1514610310E-1);
_PS_CONST(cephes_log_p2, 1.1676998740E-1);
_PS_CONST(cephes_log_p3, - 1.2420140846E-1);
_PS_CONST(cephes_log_p4, + 1.4249322787E-1);
_PS_CONST(cephes_log_p5, - 1.6668057665E-1);
_PS_CONST(cephes_log_p6, + 2.0000714765E-1);
_PS_CONST(cephes_log_p7, - 2.4999993993E-1);
_PS_CONST(cephes_log_p8, + 3.3333331174E-1);
_PS_CONST(cephes_log_q1, -2.12194440e-4);
_PS_CONST(cephes_log_q2, 0.693359375);
#ifndef USE_SSE2
typedef union xmm_mm_union {
__m128 xmm;
__m64 mm[2];
} xmm_mm_union;
#define COPY_XMM_TO_MM(xmm_, mm0_, mm1_) { \
xmm_mm_union u; u.xmm = xmm_; \
mm0_ = u.mm[0]; \
mm1_ = u.mm[1]; \
}
#define COPY_MM_TO_XMM(mm0_, mm1_, xmm_) { \
xmm_mm_union u; u.mm[0]=mm0_; u.mm[1]=mm1_; xmm_ = u.xmm; \
}
#endif // USE_SSE2
/* natural logarithm computed for 4 simultaneous float
return NaN for x <= 0
*/
OPENMM_EXPORT v4sf log_ps(v4sf x) {
#ifdef USE_SSE2
v4si emm0;
#else
v2si mm0, mm1;
#endif
v4sf one = *(v4sf*)_ps_1;
v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
x = _mm_max_ps(x, *(v4sf*)_ps_min_norm_pos); /* cut off denormalized stuff */
#ifndef USE_SSE2
/* part 1: x = frexpf(x, &e); */
COPY_XMM_TO_MM(x, mm0, mm1);
mm0 = _mm_srli_pi32(mm0, 23);
mm1 = _mm_srli_pi32(mm1, 23);
#else
emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
/* keep only the fractional part */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_mant_mask);
x = _mm_or_ps(x, *(v4sf*)_ps_0p5);
#ifndef USE_SSE2
/* now e=mm0:mm1 contain the really base-2 exponent */
mm0 = _mm_sub_pi32(mm0, *(v2si*)_pi32_0x7f);
mm1 = _mm_sub_pi32(mm1, *(v2si*)_pi32_0x7f);
v4sf e = _mm_cvtpi32x2_ps(mm0, mm1);
_mm_empty(); /* bye bye mmx */
#else
emm0 = _mm_sub_epi32(emm0, *(v4si*)_pi32_0x7f);
v4sf e = _mm_cvtepi32_ps(emm0);
#endif
e = _mm_add_ps(e, one);
/* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
v4sf mask = _mm_cmplt_ps(x, *(v4sf*)_ps_cephes_SQRTHF);
v4sf tmp = _mm_and_ps(x, mask);
x = _mm_sub_ps(x, one);
e = _mm_sub_ps(e, _mm_and_ps(one, mask));
x = _mm_add_ps(x, tmp);
v4sf z = _mm_mul_ps(x,x);
v4sf y = *(v4sf*)_ps_cephes_log_p0;
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p1);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p2);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p3);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p4);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p5);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p6);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p7);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_log_p8);
y = _mm_mul_ps(y, x);
y = _mm_mul_ps(y, z);
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q1);
y = _mm_add_ps(y, tmp);
tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
tmp = _mm_mul_ps(e, *(v4sf*)_ps_cephes_log_q2);
x = _mm_add_ps(x, y);
x = _mm_add_ps(x, tmp);
x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
return x;
}
_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);
_PS_CONST(cephes_LOG2EF, 1.44269504088896341);
_PS_CONST(cephes_exp_C1, 0.693359375);
_PS_CONST(cephes_exp_C2, -2.12194440e-4);
_PS_CONST(cephes_exp_p0, 1.9875691500E-4);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1);
OPENMM_EXPORT v4sf exp_ps(v4sf x) {
v4sf tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
v4si emm0;
#else
v2si mm0, mm1;
#endif
v4sf one = *(v4sf*)_ps_1;
x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi);
x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo);
/* express exp(x) as exp(g + n*log(2)) */
fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF);
fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5);
/* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
/* step 1 : cast to int */
tmp = _mm_movehl_ps(tmp, fx);
mm0 = _mm_cvttps_pi32(fx);
mm1 = _mm_cvttps_pi32(tmp);
/* step 2 : cast back to float */
tmp = _mm_cvtpi32x2_ps(mm0, mm1);
#else
emm0 = _mm_cvttps_epi32(fx);
tmp = _mm_cvtepi32_ps(emm0);
#endif
/* if greater, substract 1 */
v4sf mask = _mm_cmpgt_ps(tmp, fx);
mask = _mm_and_ps(mask, one);
fx = _mm_sub_ps(tmp, mask);
tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1);
v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2);
x = _mm_sub_ps(x, tmp);
x = _mm_sub_ps(x, z);
z = _mm_mul_ps(x,x);
v4sf y = *(v4sf*)_ps_cephes_exp_p0;
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4);
y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, x);
y = _mm_add_ps(y, one);
/* build 2^n */
#ifndef USE_SSE2
z = _mm_movehl_ps(z, fx);
mm0 = _mm_cvttps_pi32(fx);
mm1 = _mm_cvttps_pi32(z);
mm0 = _mm_add_pi32(mm0, *(v2si*)_pi32_0x7f);
mm1 = _mm_add_pi32(mm1, *(v2si*)_pi32_0x7f);
mm0 = _mm_slli_pi32(mm0, 23);
mm1 = _mm_slli_pi32(mm1, 23);
v4sf pow2n;
COPY_MM_TO_XMM(mm0, mm1, pow2n);
_mm_empty();
#else
emm0 = _mm_cvttps_epi32(fx);
emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f);
emm0 = _mm_slli_epi32(emm0, 23);
v4sf pow2n = _mm_castsi128_ps(emm0);
#endif
y = _mm_mul_ps(y, pow2n);
return y;
}
_PS_CONST(minus_cephes_DP1, -0.78515625);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
_PS_CONST(sincof_p0, -1.9515295891E-4);
_PS_CONST(sincof_p1, 8.3321608736E-3);
_PS_CONST(sincof_p2, -1.6666654611E-1);
_PS_CONST(coscof_p0, 2.443315711809948E-005);
_PS_CONST(coscof_p1, -1.388731625493765E-003);
_PS_CONST(coscof_p2, 4.166664568298827E-002);
_PS_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI
/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
it runs also on old athlons XPs and the pentium III of your grand
mother.
The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
take into account the special handling they have for greater values
-- it does not return garbage for arguments over 8192, though, but
the extra precision is missing).
Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
surprising but correct result.
Performance is also surprisingly good, 1.33 times faster than the
macos vsinf SSE2 function, and 1.5 times faster than the
__vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
too bad for an SSE1 function (with no special tuning) !
However the latter libraries probably have a much better handling of NaN,
Inf, denormalized and other special arguments..
On my core 1 duo, the execution of this function takes approximately 95 cycles.
From what I have observed on the experiments with Intel AMath lib, switching to an
SSE2 version would improve the perf by only 10%.
Since it is based on SSE intrinsics, it has to be compiled at -O2 to
deliver full speed.
*/
OPENMM_EXPORT v4sf sin_ps(v4sf x) { // any x
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
#ifdef USE_SSE2
v4si emm0, emm2;
#else
v2si mm0, mm1, mm2, mm3;
#endif
sign_bit = x;
/* take the absolute value */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
#ifdef USE_SSE2
/* store the integer part of y in mm0 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
/* get the swap sign flag */
emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
/* get the polynom selection mask
there is one polynom for 0 <= x <= Pi/4
and another one for Pi/4<x<=Pi/2
Both branches will be computed.
*/
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
v4sf poly_mask = _mm_castsi128_ps(emm2);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
#else
/* store the integer part of y in mm0:mm1 */
xmm2 = _mm_movehl_ps(xmm2, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm2);
/* j=(j+1) & (~1) (see the cephes sources) */
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
/* get the swap sign flag */
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
/* get the polynom selection mask */
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
v4sf swap_sign_bit, poly_mask;
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
_mm_empty(); /* good-bye mmx */
#endif
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y = *(v4sf*)_ps_coscof_p0;
v4sf z = _mm_mul_ps(x,x);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(v4sf*)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v4sf y2 = *(v4sf*)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
y2 = _mm_and_ps(xmm3, y2); //, xmm3);
y = _mm_andnot_ps(xmm3, y);
y = _mm_add_ps(y,y2);
/* update the sign */
y = _mm_xor_ps(y, sign_bit);
return y;
}
/* almost the same as sin_ps */
OPENMM_EXPORT v4sf cos_ps(v4sf x) { // any x
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
v4si emm0, emm2;
#else
v2si mm0, mm1, mm2, mm3;
#endif
/* take the absolute value */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
#ifdef USE_SSE2
/* store the integer part of y in mm0 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
/* get the swap sign flag */
emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
/* get the polynom selection mask */
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
v4sf sign_bit = _mm_castsi128_ps(emm0);
v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
/* store the integer part of y in mm0:mm1 */
xmm2 = _mm_movehl_ps(xmm2, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm2);
/* j=(j+1) & (~1) (see the cephes sources) */
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
mm2 = _mm_sub_pi32(mm2, *(v2si*)_pi32_2);
mm3 = _mm_sub_pi32(mm3, *(v2si*)_pi32_2);
/* get the swap sign flag in mm0:mm1 and the
polynom selection mask in mm2:mm3 */
mm0 = _mm_andnot_si64(mm2, *(v2si*)_pi32_4);
mm1 = _mm_andnot_si64(mm3, *(v2si*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
v4sf sign_bit, poly_mask;
COPY_MM_TO_XMM(mm0, mm1, sign_bit);
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
_mm_empty(); /* good-bye mmx */
#endif
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y = *(v4sf*)_ps_coscof_p0;
v4sf z = _mm_mul_ps(x,x);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(v4sf*)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v4sf y2 = *(v4sf*)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
y2 = _mm_and_ps(xmm3, y2); //, xmm3);
y = _mm_andnot_ps(xmm3, y);
y = _mm_add_ps(y,y2);
/* update the sign */
y = _mm_xor_ps(y, sign_bit);
return y;
}
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
it is almost as fast, and gives you a free cosine with your sine */
OPENMM_EXPORT void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
v4sf xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
v4si emm0, emm2, emm4;
#else
v2si mm0, mm1, mm2, mm3, mm4, mm5;
#endif
sign_bit_sin = x;
/* take the absolute value */
x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(v4sf*)_ps_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
#ifdef USE_SSE2
/* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
emm4 = emm2;
/* get the swap sign flag for the sine */
emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);
/* get the polynom selection mask for the sine*/
emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
v4sf poly_mask = _mm_castsi128_ps(emm2);
#else
/* store the integer part of y in mm2:mm3 */
xmm3 = _mm_movehl_ps(xmm3, y);
mm2 = _mm_cvttps_pi32(y);
mm3 = _mm_cvttps_pi32(xmm3);
/* j=(j+1) & (~1) (see the cephes sources) */
mm2 = _mm_add_pi32(mm2, *(v2si*)_pi32_1);
mm3 = _mm_add_pi32(mm3, *(v2si*)_pi32_1);
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_inv1);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_inv1);
y = _mm_cvtpi32x2_ps(mm2, mm3);
mm4 = mm2;
mm5 = mm3;
/* get the swap sign flag for the sine */
mm0 = _mm_and_si64(mm2, *(v2si*)_pi32_4);
mm1 = _mm_and_si64(mm3, *(v2si*)_pi32_4);
mm0 = _mm_slli_pi32(mm0, 29);
mm1 = _mm_slli_pi32(mm1, 29);
v4sf swap_sign_bit_sin;
COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);
/* get the polynom selection mask for the sine */
mm2 = _mm_and_si64(mm2, *(v2si*)_pi32_2);
mm3 = _mm_and_si64(mm3, *(v2si*)_pi32_2);
mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
v4sf poly_mask;
COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
x = _mm_add_ps(x, xmm1);
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
#ifdef USE_SSE2
emm4 = _mm_sub_epi32(emm4, *(v4si*)_pi32_2);
emm4 = _mm_andnot_si128(emm4, *(v4si*)_pi32_4);
emm4 = _mm_slli_epi32(emm4, 29);
v4sf sign_bit_cos = _mm_castsi128_ps(emm4);
#else
/* get the sign flag for the cosine */
mm4 = _mm_sub_pi32(mm4, *(v2si*)_pi32_2);
mm5 = _mm_sub_pi32(mm5, *(v2si*)_pi32_2);
mm4 = _mm_andnot_si64(mm4, *(v2si*)_pi32_4);
mm5 = _mm_andnot_si64(mm5, *(v2si*)_pi32_4);
mm4 = _mm_slli_pi32(mm4, 29);
mm5 = _mm_slli_pi32(mm5, 29);
v4sf sign_bit_cos;
COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
_mm_empty(); /* good-bye mmx */
#endif
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
v4sf z = _mm_mul_ps(x,x);
y = *(v4sf*)_ps_coscof_p0;
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(v4sf*)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
v4sf tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(v4sf*)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v4sf y2 = *(v4sf*)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
/* select the correct result from the two polynoms */
xmm3 = poly_mask;
v4sf ysin2 = _mm_and_ps(xmm3, y2);
v4sf ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2,ysin2);
y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1,ysin2);
xmm2 = _mm_add_ps(y,y2);
/* update the sign */
*s = _mm_xor_ps(xmm1, sign_bit_sin);
*c = _mm_xor_ps(xmm2, sign_bit_cos);
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment