Commit 5270f858 authored by Charlles Abreu's avatar Charlles Abreu
Browse files

resolved PR #2611 merge conflicts

parents 697ab72e eec9cd69
...@@ -216,7 +216,7 @@ script: ...@@ -216,7 +216,7 @@ script:
- python devtools/run-ctest.py --start-time $START_TIME - python devtools/run-ctest.py --start-time $START_TIME
- if [[ ! -z "${DOCS_DEPLOY}" && "${DOCS_DEPLOY}" = "true" ]]; then - if [[ ! -z "${DOCS_DEPLOY}" && "${DOCS_DEPLOY}" = "true" ]]; then
pip install sphinx sphinxcontrib-bibtex sphinxcontrib-lunrsearch sphinxcontrib-autodoc_doxygen; pip install sphinx==2.3.1 sphinxcontrib-bibtex sphinxcontrib-lunrsearch sphinxcontrib-autodoc_doxygen;
make sphinxhtml; make sphinxhtml;
make sphinxpdf; make sphinxpdf;
make C++ApiDocs PythonApiDocs; make C++ApiDocs PythonApiDocs;
......
...@@ -275,7 +275,7 @@ ENDIF (MSVC) ...@@ -275,7 +275,7 @@ ENDIF (MSVC)
IF(OPENMM_BUILD_SHARED_LIB) IF(OPENMM_BUILD_SHARED_LIB)
ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES}) ADD_LIBRARY(${SHARED_TARGET} SHARED ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DPTHREAD_BUILDING_SHARED_LIBRARY") SET_TARGET_PROPERTIES(${SHARED_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_LINK_FLAGS}" COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_BUILDING_SHARED_LIBRARY -DLEPTON_BUILDING_SHARED_LIBRARY -DPTHREAD_BUILDING_SHARED_LIBRARY" SOVERSION "${OPENMM_MAJOR_VERSION}.${OPENMM_MINOR_VERSION}")
ENDIF(OPENMM_BUILD_SHARED_LIB) ENDIF(OPENMM_BUILD_SHARED_LIB)
IF(OPENMM_BUILD_STATIC_LIB) IF(OPENMM_BUILD_STATIC_LIB)
......
...@@ -650,8 +650,8 @@ such as :file:`charmm36/water.xml`, which specifies the default CHARMM water mod ...@@ -650,8 +650,8 @@ such as :file:`charmm36/water.xml`, which specifies the default CHARMM water mod
.. warning:: Drude polarizable sites and lone pairs are not yet supported .. warning:: Drude polarizable sites and lone pairs are not yet supported
by `ParmEd <https://github.com/parmed/parmed>`_ and the CHARMM36 forcefields by `ParmEd <https://github.com/parmed/parmed>`_ and the CHARMM36 forcefields
that depend on these features are not included in this port. that depend on these features are not included in this port.
To use the CHARMM 2013 polarizable force field\ :cite:`Lopes2013`, To use the CHARMM 2019 polarizable force field\ :cite:`Lopes2013`,
include the single file :file:`charmm_polar_2013.xml`. include the single file :file:`charmm_polar_2019.xml`.
.. tip:: The solvent model XML files included under the :file:`charmm36/` directory .. tip:: The solvent model XML files included under the :file:`charmm36/` directory
include both water *and* ions compatible with that water model, so if you include both water *and* ions compatible with that water model, so if you
...@@ -712,17 +712,20 @@ recommended for most simulations. ...@@ -712,17 +712,20 @@ recommended for most simulations.
CHARMM Polarizable Force Field CHARMM Polarizable Force Field
------------------------------ ------------------------------
To use the CHARMM 2013 polarizable force field\ :cite:`Lopes2013`, include the To use the CHARMM 2019 polarizable force field\ :cite:`Lopes2013`, include the
single file :file:`charmm_polar_2013.xml`. It includes parameters for proteins, single file :file:`charmm_polar_2019.xml`. It includes parameters for proteins, lipids,
water, and ions. When using this force field, remember to add extra particles to water, and ions. When using this force field, remember to add extra particles to
the :class:`Topology` as described in section :ref:`adding-or-removing-extra-particles`. the :class:`Topology` as described in section :ref:`adding-or-removing-extra-particles`.
This force field also requires that you use one of the special integrators that
supports Drude particles. The options are DrudeLangevinIntegrator, DrudeNoseHooverIntegrator,
and DrudeSCFIntegrator.
Older Amber Force Fields Older Force Fields
------------------------ ------------------
OpenMM includes several older Amber force fields as well. For most simulations OpenMM includes several older force fields as well. For most simulations, the
Amber14 is preferred over any of these, but they are still useful for reproducing newer force fields described above are preferred over any of these, but they are
older results. still useful for reproducing older results.
.. tabularcolumns:: |l|L| .. tabularcolumns:: |l|L|
...@@ -735,6 +738,7 @@ File Force Field ...@@ -735,6 +738,7 @@ File Force Field
:code:`amber99sbnmr.xml` Amber99SB with modifications to fit NMR data\ :cite:`Li2010` :code:`amber99sbnmr.xml` Amber99SB with modifications to fit NMR data\ :cite:`Li2010`
:code:`amber03.xml` Amber03\ :cite:`Duan2003` :code:`amber03.xml` Amber03\ :cite:`Duan2003`
:code:`amber10.xml` Amber10 (documented in the AmberTools_ manual as `ff10`) :code:`amber10.xml` Amber10 (documented in the AmberTools_ manual as `ff10`)
:code:`charmm_polar_2013.xml` 2013 version of the CHARMM polarizable force field\ :cite:`Lopes2013`
============================= ================================================================================ ============================= ================================================================================
Several of these force fields support implicit solvent. To enable it, also Several of these force fields support implicit solvent. To enable it, also
...@@ -1059,6 +1063,34 @@ sampling, and therefore is preferred for most applications. Also note that ...@@ -1059,6 +1063,34 @@ sampling, and therefore is preferred for most applications. Also note that
:code:`LangevinIntegrator`\ , like :code:`LangevinMiddleIntegrator`\ , is a leapfrog :code:`LangevinIntegrator`\ , like :code:`LangevinMiddleIntegrator`\ , is a leapfrog
integrator, so the velocities are offset by half a time step from the positions. integrator, so the velocities are offset by half a time step from the positions.
Nosé-Hoover Integrator
----------------------
The :code:`NoseHooverIntegrator` uses the same "middle" leapfrog propagation
algorithm as :code:`LangevinMiddleIntegrator`, but replaces the stochastic
temperature control with a velocity scaling algorithm that produces more
accurate transport properties :cite:`Basconi2013`. This velocity scaling
results from propagating a chain of extra variables, which slightly reduces the
computational efficiency with respect to :code:`LangevinMiddleIntegrator`. The
thermostated integrator is minimally created with syntax analogous to the
:code:`LangevinMiddleIntegrator` example above::
NoseHooverIntegrator integrator(300*kelvin, 1/picosecond,
0.004*picoseconds);
The first argument specifies the target temperature. The second specifies the
frequency of interaction with the heat bath: a lower value interacts minimally,
yielding the microcanonical ensemble in the limit of a zero frequency, while a
larger frequency will perturb the system greater, keeping it closer to the
target temperature. The third argument is the integration timestep that, like
the other arguments, must be specified with units. For initial equilibration
to the target temperature, a larger interaction frequency is recommended,
*e.g.* 25 ps\ :sup:`-1`.
This integrator supports lots of other options, including the ability to couple
different parts of the system to thermostats at different temperatures. See the
API documentation for details.
Leapfrog Verlet Integrator Leapfrog Verlet Integrator
-------------------------- --------------------------
......
...@@ -3634,7 +3634,7 @@ the Drude particle and the spring constant *k* by ...@@ -3634,7 +3634,7 @@ the Drude particle and the spring constant *k* by
A damped interaction\ :cite:`Thole1981` is used between dipoles that are A damped interaction\ :cite:`Thole1981` is used between dipoles that are
bonded to each other. bonded to each other.
The equations of motion can be integrated with two different methods: The equations of motion can be integrated with three different methods:
#. In the Self Consistent Field (SCF) method, the ordinary particles are first #. In the Self Consistent Field (SCF) method, the ordinary particles are first
updated as usual. A local energy minimization is then performed to select new updated as usual. A local energy minimization is then performed to select new
...@@ -3644,8 +3644,25 @@ The equations of motion can be integrated with two different methods: ...@@ -3644,8 +3644,25 @@ The equations of motion can be integrated with two different methods:
#. In the extended Lagrangian method, the positions of the Drude particles are #. In the extended Lagrangian method, the positions of the Drude particles are
treated as dynamical variables, just like any other particles. A small amount treated as dynamical variables, just like any other particles. A small amount
of mass is transferred from the parent particles to the Drude particles, of mass is transferred from the parent particles to the Drude particles,
allowing them to be integrated normally. A dual Langevin integrator is used to allowing them to be integrated normally. A dual Langevin or Nose-Hoover integrator is used to
maintain the center of mass of each Drude particle pair at the system maintain the center of mass of each Drude particle pair at the system
temperature, while using a much lower temperature for their relative internal temperature, while using a much lower temperature for their relative internal
motion. In practice, this produces dipole moments very close to those from the motion. In practice, this produces dipole moments very close to those from the
SCF solution while being much faster to compute. SCF solution while being much faster to compute.
#. The Nosé-Hoover dual thermostat method. In this approach the motion of
non-Drude sites and center of mass motion of Drude pairs are thermostated to
the target temperature with one thermostat. Another thermostat is used to keep
relative motion of Drude pairs to a different, typically much lower,
temperature to maintain separation of nuclear and electronic degrees of
freedom. The minimal specification is as follows::
DrudeNoseHooverIntegrator integrator(temperature, frequency,
temperatureDrude, frequencyDrude,
1*femtoseconds)
Where the first and third arguments specify the center-of-mass temperature and
relative temperature for each Drude pair, respecitvely. The second and fourth
arguments describe the frequency of interaction with the center-of-mass and
relative heat baths, respectively, and should be specified with inverse time
units. The fifth argument is the timestep. The multi-timestep and Nosé-Hoover
chain length may also be specified, but sensible defaults are provided.
...@@ -41,6 +41,17 @@ ...@@ -41,6 +41,17 @@
doi = {10.1103/PhysRevLett.100.020603}, doi = {10.1103/PhysRevLett.100.020603},
} }
@article{Basconi2013,
title = {Effects of Temperature Control Algorithms on Transport Properties and Kinetics in Molecular Dynamics Simulations},
author = {Joseph E. Bascon and Michael R. Shirts},
journal = {Journal of Chemical Theory and Computation},
volume = {9},
issue = {7},
pages = {2887-2899},
year = {2013},
doi= {10.1021/ct400109a}
}
@article{Berendsen1987 @article{Berendsen1987
author = {Berendsen, H. J. C. and Grigera, J. R. and Straatsma, T. P.}, author = {Berendsen, H. J. C. and Grigera, J. R. and Straatsma, T. P.},
title = {The missing term in effective pair potentials}, title = {The missing term in effective pair potentials},
...@@ -314,6 +325,16 @@ ...@@ -314,6 +325,16 @@
journal = {Europhysics Letters ({EPL})}, journal = {Europhysics Letters ({EPL})},
} }
@article{Martyna1992,
author = {Glenn J. Martyna and Michael L. Klein and Mark Tuckerman},
year = 1992,
title = {Nos\'{e}–Hoover chains: The canonical ensemble via continuous dynamics},
pages = {2635-2643},
journal = {Journal of Chemical Physics},
volume = {97},
issue = {4},
}
@article{Markland2008 @article{Markland2008
author = {Markland, Thomas E. and Manolopoulos, David E.}, author = {Markland, Thomas E. and Manolopoulos, David E.},
title = {An efficient ring polymer contraction scheme for imaginary time path integral simulations}, title = {An efficient ring polymer contraction scheme for imaginary time path integral simulations},
......
...@@ -1391,6 +1391,52 @@ twice per time step, compared to only once for LangevinIntegrator. This ...@@ -1391,6 +1391,52 @@ twice per time step, compared to only once for LangevinIntegrator. This
can make it slightly slower for systems that involve constraints. However, this can make it slightly slower for systems that involve constraints. However, this
usually is more than compensated by allowing you to use a larger time step. usually is more than compensated by allowing you to use a larger time step.
.. _nosehoover-integrators-theory:
NoseHooverIntegrator
********************
Like LangevinMiddleIntegerator, this uses the LFMiddle discretization.
:cite:`Zhang2019` In each step, the positions and velocities are updated as
follows:
.. math::
\mathbf{v}_{i}(t+\Delta t/2) = \mathbf{v}_{i}(t-\Delta t/2) + \mathbf{f}_{i}(t)\Delta t/{m}_{i}
.. math::
\mathbf{r}_{i}(t+\Delta t/2) = \mathbf{r}_{i}(t) + \mathbf{v}_{i}(t+\Delta t/2)\Delta t/2
.. math::
\mathbf{v'}_{i}(t+\Delta t/2) = \mathrm{scale}\times\mathbf{v}_{i}(t+\Delta t/2)
.. math::
\mathbf{r}_{i}(t+\Delta t) = \mathbf{r}_{i}(t+\Delta t/2) + \mathbf{v'}_{i}(t+\Delta t/2)\Delta t/2
The universal scale factor used in the third step is determined by propagating
auxilliary degrees of freedom alongside the regular particles. The original
Nosé-Hoover formulation used a single harmonic oscillator for the heat bath,
but this is problematic in small or stiff systems, which are non-ergodic, so
the chain formulation extends this by replacing the single oscillator
thermostat with a chain of connected oscillators. :cite:`Martyna1992` For
large systems a single oscillator (*i.e.* a chain length of one) will suffice,
but longer chains are necessary to properly thermostat non-ergodic systems.
The OpenMM default is to use a chain length of three to cover the latter case,
but this can be safely reduced to increase efficiency in large systems.
The heat bath propagation is performed using a multi-timestep algorithm. Each
propagation step is discretized into substeps using a factorization from
Yoshida and Suzuki; the default discretization uses a :math:`\mathcal{O}(\Delta
t^6)` approach that uses 7 points, but 1, 3 or 5 points may also be used to
increase performace, at the expense of accuracy. Each step is further
subdivided into multi-timesteps with a default of 3 multi time steps per
propagation; as with the number of Yoshida-Suziki points this value may be
increase to increase accuracy but with additional computational expense.
BrownianIntegrator BrownianIntegrator
****************** ******************
...@@ -1623,7 +1669,8 @@ Force Groups ...@@ -1623,7 +1669,8 @@ Force Groups
************ ************
It is possible to split the Force objects in a System into groups. Those groups It is possible to split the Force objects in a System into groups. Those groups
can then be evaluated independently of each other. Some Force classes also can then be evaluated independently of each other. This is done by calling
:code:`setForceGroup()` on the Force. Some Force classes also
provide finer grained control over grouping. For example, NonbondedForce allows provide finer grained control over grouping. For example, NonbondedForce allows
direct space computations to be in one group and reciprocal space computations direct space computations to be in one group and reciprocal space computations
in a different group. in a different group.
...@@ -1631,8 +1678,15 @@ in a different group. ...@@ -1631,8 +1678,15 @@ in a different group.
The most important use of force groups is for implementing multiple time step The most important use of force groups is for implementing multiple time step
algorithms with CustomIntegrator. For example, you might evaluate the slowly algorithms with CustomIntegrator. For example, you might evaluate the slowly
changing nonbonded interactions less frequently than the quickly changing bonded changing nonbonded interactions less frequently than the quickly changing bonded
ones. It also is useful if you want the ability to query a subset of the forces ones. This can be done by putting the slow and fast forces into separate
acting on the system. groups, then using a :class:`MTSIntegrator` or :class:`MTSLangevinIntegrator`
that evaluates the groups at different frequencies.
Another important use is to define forces that are not used when integrating
the equations of motion, but can still be queried efficiently. To do this,
call :code:`setIntegrationForceGroups()` on the :class:`Integrator`. Any groups
omitted will be ignored during simulation, but can be queried at any time by
calling :code:`getState()`.
Virtual Sites Virtual Sites
************* *************
......
This diff is collapsed.
...@@ -88,12 +88,13 @@ def runOneTest(testName, options): ...@@ -88,12 +88,13 @@ def runOneTest(testName, options):
dt = 0.005*unit.picoseconds dt = 0.005*unit.picoseconds
constraints = app.AllBonds constraints = app.AllBonds
hydrogenMass = 4*unit.amu hydrogenMass = 4*unit.amu
integ = mm.LangevinIntegrator(300*unit.kelvin, friction, dt)
else: else:
dt = 0.002*unit.picoseconds dt = 0.004*unit.picoseconds
constraints = app.HBonds constraints = app.HBonds
hydrogenMass = None hydrogenMass = None
integ = mm.LangevinMiddleIntegrator(300*unit.kelvin, friction, dt)
system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass) system = ff.createSystem(pdb.topology, nonbondedMethod=method, nonbondedCutoff=cutoff, constraints=constraints, hydrogenMass=hydrogenMass)
integ = mm.LangevinIntegrator(300*unit.kelvin, friction, dt)
print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds)) print('Step Size: %g fs' % dt.value_in_unit(unit.femtoseconds))
properties = {} properties = {}
initialSteps = 5 initialSteps = 5
......
...@@ -53,6 +53,7 @@ ...@@ -53,6 +53,7 @@
#include <errno.h> #include <errno.h>
#include <sys/timeb.h> #include <sys/timeb.h>
#include <process.h> #include <process.h>
#include <stdint.h>
#ifndef ETIMEDOUT #ifndef ETIMEDOUT
#define ETIMEDOUT 110 #define ETIMEDOUT 110
...@@ -78,7 +79,7 @@ ...@@ -78,7 +79,7 @@
#define PTHREAD_DEFAULT_ATTR (PTHREAD_CANCEL_ENABLE) #define PTHREAD_DEFAULT_ATTR (PTHREAD_CANCEL_ENABLE)
#define PTHREAD_CANCELED ((void *) 0xDEADBEEF) #define PTHREAD_CANCELED ((void *)(uintptr_t) 0xDEADBEEF)
#define PTHREAD_ONCE_INIT 0 #define PTHREAD_ONCE_INIT 0
#define PTHREAD_MUTEX_INITIALIZER {(void*)-1,-1,0,0,0,0} #define PTHREAD_MUTEX_INITIALIZER {(void*)-1,-1,0,0,0,0}
...@@ -1090,7 +1091,7 @@ static int pthread_barrierattr_destroy(void **attr) ...@@ -1090,7 +1091,7 @@ static int pthread_barrierattr_destroy(void **attr)
static int pthread_barrierattr_setpshared(void **attr, int s) static int pthread_barrierattr_setpshared(void **attr, int s)
{ {
*attr = (void *) s; *attr = (void *)(intptr_t) s;
return 0; return 0;
} }
......
...@@ -85,6 +85,18 @@ public: ...@@ -85,6 +85,18 @@ public:
* @param steps the number of time steps to take * @param steps the number of time steps to take
*/ */
virtual void step(int steps) = 0; virtual void step(int steps) = 0;
/**
* Get which force groups to use for integration. By default, all force groups
* are included. This is interpreted as a set of bit flags: the forces from group i
* will be included if (groups&(1<<i)) != 0.
*/
virtual int getIntegrationForceGroups() const;
/**
* Set which force groups to use for integration. By default, all force groups
* are included. This is interpreted as a set of bit flags: the forces from group i
* will be included if (groups&(1<<i)) != 0.
*/
virtual void setIntegrationForceGroups(int groups);
protected: protected:
friend class Context; friend class Context;
friend class ContextImpl; friend class ContextImpl;
...@@ -166,6 +178,7 @@ protected: ...@@ -166,6 +178,7 @@ protected:
} }
private: private:
double stepSize, constraintTol; double stepSize, constraintTol;
int forceGroups;
}; };
} // namespace OpenMM } // namespace OpenMM
......
...@@ -43,6 +43,10 @@ namespace OpenMM { ...@@ -43,6 +43,10 @@ namespace OpenMM {
* force to the potential function. The strength of the restraining force is steadily increased * force to the potential function. The strength of the restraining force is steadily increased
* until the minimum energy configuration satisfies all constraints to within the tolerance * until the minimum energy configuration satisfies all constraints to within the tolerance
* specified by the Context's Integrator. * specified by the Context's Integrator.
*
* Energy minimization is done using the force groups defined by the Integrator.
* If you have called setIntegrationForceGroups() on it to restrict the set of forces
* used for integration, only the energy of the included forces will be minimized.
*/ */
class OPENMM_EXPORT LocalEnergyMinimizer { class OPENMM_EXPORT LocalEnergyMinimizer {
......
#ifndef OPENMM_VECTORIZE8_H_ #ifndef OPENMM_VECTORIZEAVX_H_
#define OPENMM_VECTORIZE8_H_ #define OPENMM_VECTORIZEAVX_H_
/* -------------------------------------------------------------------------- * /* -------------------------------------------------------------------------- *
* OpenMM * * OpenMM *
...@@ -57,9 +57,7 @@ public: ...@@ -57,9 +57,7 @@ public:
* @param table The table from which to do a lookup. * @param table The table from which to do a lookup.
* @param indexes The indexes to gather. * @param indexes The indexes to gather.
*/ */
fvec8(const float* table, const int idx[8]) { fvec8(const float* table, const int32_t idx[8]) {
// :TODO: Using int32_t explicitly as the index type could allow the real gather instruction to be used.
// Use gather and static assert? Conditional code?
val = _mm256_setr_ps(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]], table[idx[4]], table[idx[5]], table[idx[6]], table[idx[7]]); val = _mm256_setr_ps(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]], table[idx[4]], table[idx[5]], table[idx[6]], table[idx[7]]);
} }
...@@ -75,55 +73,55 @@ public: ...@@ -75,55 +73,55 @@ public:
void store(float* v) const { void store(float* v) const {
_mm256_storeu_ps(v, val); _mm256_storeu_ps(v, val);
} }
fvec8 operator+(const fvec8& other) const { fvec8 operator+(fvec8 other) const {
return _mm256_add_ps(val, other); return _mm256_add_ps(val, other);
} }
fvec8 operator-(const fvec8& other) const { fvec8 operator-(fvec8 other) const {
return _mm256_sub_ps(val, other); return _mm256_sub_ps(val, other);
} }
fvec8 operator*(const fvec8& other) const { fvec8 operator*(fvec8 other) const {
return _mm256_mul_ps(val, other); return _mm256_mul_ps(val, other);
} }
fvec8 operator/(const fvec8& other) const { fvec8 operator/(fvec8 other) const {
return _mm256_div_ps(val, other); return _mm256_div_ps(val, other);
} }
void operator+=(const fvec8& other) { void operator+=(fvec8 other) {
val = _mm256_add_ps(val, other); val = _mm256_add_ps(val, other);
} }
void operator-=(const fvec8& other) { void operator-=(fvec8 other) {
val = _mm256_sub_ps(val, other); val = _mm256_sub_ps(val, other);
} }
void operator*=(const fvec8& other) { void operator*=(fvec8 other) {
val = _mm256_mul_ps(val, other); val = _mm256_mul_ps(val, other);
} }
void operator/=(const fvec8& other) { void operator/=(fvec8 other) {
val = _mm256_div_ps(val, other); val = _mm256_div_ps(val, other);
} }
fvec8 operator-() const { fvec8 operator-() const {
return _mm256_sub_ps(_mm256_set1_ps(0.0f), val); return _mm256_sub_ps(_mm256_set1_ps(0.0f), val);
} }
fvec8 operator&(const fvec8& other) const { fvec8 operator&(fvec8 other) const {
return _mm256_and_ps(val, other); return _mm256_and_ps(val, other);
} }
fvec8 operator|(const fvec8& other) const { fvec8 operator|(fvec8& other) const {
return _mm256_or_ps(val, other); return _mm256_or_ps(val, other);
} }
fvec8 operator==(const fvec8& other) const { fvec8 operator==(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_EQ_OQ); return _mm256_cmp_ps(val, other, _CMP_EQ_OQ);
} }
fvec8 operator!=(const fvec8& other) const { fvec8 operator!=(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_NEQ_OQ); return _mm256_cmp_ps(val, other, _CMP_NEQ_OQ);
} }
fvec8 operator>(const fvec8& other) const { fvec8 operator>(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_GT_OQ); return _mm256_cmp_ps(val, other, _CMP_GT_OQ);
} }
fvec8 operator<(const fvec8& other) const { fvec8 operator<(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_LT_OQ); return _mm256_cmp_ps(val, other, _CMP_LT_OQ);
} }
fvec8 operator>=(const fvec8& other) const { fvec8 operator>=(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_GE_OQ); return _mm256_cmp_ps(val, other, _CMP_GE_OQ);
} }
fvec8 operator<=(const fvec8& other) const { fvec8 operator<=(fvec8 other) const {
return _mm256_cmp_ps(val, other, _CMP_LE_OQ); return _mm256_cmp_ps(val, other, _CMP_LE_OQ);
} }
operator ivec8() const; operator ivec8() const;
...@@ -159,10 +157,10 @@ public: ...@@ -159,10 +157,10 @@ public:
void store(int* v) const { void store(int* v) const {
_mm256_storeu_si256((__m256i*) v, val); _mm256_storeu_si256((__m256i*) v, val);
} }
ivec8 operator&(const ivec8& other) const { ivec8 operator&(ivec8 other) const {
return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val))); return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
} }
ivec8 operator|(const ivec8& other) const { ivec8 operator|(ivec8 other) const {
return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val))); return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(val), _mm256_castsi256_ps(other.val)));
} }
operator fvec8() const; operator fvec8() const;
...@@ -193,36 +191,36 @@ inline fvec8 fvec8::expandBitsToMask(int bitmask) { ...@@ -193,36 +191,36 @@ inline fvec8 fvec8::expandBitsToMask(int bitmask) {
// Functions that operate on fvec8s. // Functions that operate on fvec8s.
static inline fvec8 floor(const fvec8& v) { static inline fvec8 floor(fvec8 v) {
return fvec8(_mm256_round_ps(v.val, 0x09)); return fvec8(_mm256_round_ps(v.val, 0x09));
} }
static inline fvec8 ceil(const fvec8& v) { static inline fvec8 ceil(fvec8 v) {
return fvec8(_mm256_round_ps(v.val, 0x0A)); return fvec8(_mm256_round_ps(v.val, 0x0A));
} }
static inline fvec8 round(const fvec8& v) { static inline fvec8 round(fvec8 v) {
return fvec8(_mm256_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT)); return fvec8(_mm256_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
} }
static inline fvec8 min(const fvec8& v1, const fvec8& v2) { static inline fvec8 min(fvec8 v1, fvec8 v2) {
return fvec8(_mm256_min_ps(v1.val, v2.val)); return fvec8(_mm256_min_ps(v1.val, v2.val));
} }
static inline fvec8 max(const fvec8& v1, const fvec8& v2) { static inline fvec8 max(fvec8 v1, fvec8 v2) {
return fvec8(_mm256_max_ps(v1.val, v2.val)); return fvec8(_mm256_max_ps(v1.val, v2.val));
} }
static inline fvec8 abs(const fvec8& v) { static inline fvec8 abs(fvec8 v) {
static const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); static const __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
return fvec8(_mm256_and_ps(v.val, mask)); return fvec8(_mm256_and_ps(v.val, mask));
} }
static inline fvec8 sqrt(const fvec8& v) { static inline fvec8 sqrt(fvec8 v) {
return fvec8(_mm256_sqrt_ps(v.val)); return fvec8(_mm256_sqrt_ps(v.val));
} }
static inline fvec8 rsqrt(const fvec8& v) { static inline fvec8 rsqrt(fvec8 v) {
// Initial estimate of rsqrt(). // Initial estimate of rsqrt().
fvec8 y(_mm256_rsqrt_ps(v.val)); fvec8 y(_mm256_rsqrt_ps(v.val));
...@@ -234,17 +232,17 @@ static inline fvec8 rsqrt(const fvec8& v) { ...@@ -234,17 +232,17 @@ static inline fvec8 rsqrt(const fvec8& v) {
return y; return y;
} }
static inline float dot8(const fvec8& v1, const fvec8& v2) { static inline float dot8(fvec8 v1, fvec8 v2) {
fvec8 result = _mm256_dp_ps(v1, v2, 0xF1); fvec8 result = _mm256_dp_ps(v1, v2, 0xF1);
return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec()); return _mm_cvtss_f32(result.lowerVec())+_mm_cvtss_f32(result.upperVec());
} }
static inline float reduceAdd(const fvec8 v) { static inline float reduceAdd(fvec8 v) {
// :TODO: There are more efficient ways to do this. // :TODO: There are more efficient ways to do this.
return dot8(v, fvec8(1.0f)); return dot8(v, fvec8(1.0f));
} }
static inline void transpose(const fvec4& in1, const fvec4& in2, const fvec4& in3, const fvec4& in4, const fvec4& in5, const fvec4& in6, const fvec4& in7, const fvec4& in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) { static inline void transpose(fvec4 in1, fvec4 in2, fvec4 in3, fvec4 in4, fvec4 in5, fvec4 in6, fvec4 in7, fvec4 in8, fvec8& out1, fvec8& out2, fvec8& out3, fvec8& out4) {
fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4; fvec4 i1 = in1, i2 = in2, i3 = in3, i4 = in4;
fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8; fvec4 i5 = in5, i6 = in6, i7 = in7, i8 = in8;
_MM_TRANSPOSE4_PS(i1, i2, i3, i4); _MM_TRANSPOSE4_PS(i1, i2, i3, i4);
...@@ -275,7 +273,7 @@ static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8& ...@@ -275,7 +273,7 @@ static inline void transpose(const fvec4 in[8], fvec8& out1, fvec8& out2, fvec8&
transpose(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out1, out2, out3, out4); transpose(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out1, out2, out3, out4);
} }
static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) { static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4& out1, fvec4& out2, fvec4& out3, fvec4& out4, fvec4& out5, fvec4& out6, fvec4& out7, fvec4& out8) {
out1 = in1.lowerVec(); out1 = in1.lowerVec();
out2 = in2.lowerVec(); out2 = in2.lowerVec();
out3 = in3.lowerVec(); out3 = in3.lowerVec();
...@@ -291,40 +289,40 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in ...@@ -291,40 +289,40 @@ static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in
/** /**
* Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements. * Given 4 input vectors of 8 elements, transpose them to form 8 output vectors of 4 elements.
*/ */
static inline void transpose(const fvec8& in1, const fvec8& in2, const fvec8& in3, const fvec8& in4, fvec4 out[8]) { static inline void transpose(fvec8 in1, fvec8 in2, fvec8 in3, fvec8 in4, fvec4 out[8]) {
transpose(in1, in2, in3, in4, out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]); transpose(in1, in2, in3, in4, out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]);
} }
// Functions that operate on ivec8s. // Functions that operate on ivec8s.
static inline bool any(const ivec8& v) { static inline bool any(ivec8 v) {
return !_mm256_testz_si256(v, _mm256_set1_epi32(0xFFFFFFFF)); return !_mm256_testz_si256(v, _mm256_set1_epi32(0xFFFFFFFF));
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
static inline fvec8 operator+(float v1, const fvec8& v2) { static inline fvec8 operator+(float v1, fvec8 v2) {
return fvec8(v1)+v2; return fvec8(v1)+v2;
} }
static inline fvec8 operator-(float v1, const fvec8& v2) { static inline fvec8 operator-(float v1, fvec8 v2) {
return fvec8(v1)-v2; return fvec8(v1)-v2;
} }
static inline fvec8 operator*(float v1, const fvec8& v2) { static inline fvec8 operator*(float v1, fvec8 v2) {
return fvec8(v1)*v2; return fvec8(v1)*v2;
} }
static inline fvec8 operator/(float v1, const fvec8& v2) { static inline fvec8 operator/(float v1, fvec8 v2) {
return fvec8(v1)/v2; return fvec8(v1)/v2;
} }
// Operation for blending fvec8 from a full bitmask. // Operation for blending fvec8 from a full bitmask.
static inline fvec8 blend(const fvec8& v1, const fvec8& v2, const fvec8& mask) { static inline fvec8 blend(fvec8 v1, fvec8 v2, fvec8 mask) {
return fvec8(_mm256_blendv_ps(v1.val, v2.val, mask.val)); return fvec8(_mm256_blendv_ps(v1.val, v2.val, mask.val));
} }
static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) { static inline fvec8 blendZero(fvec8 v, fvec8 mask) {
return blend(0.0f, v, mask); return blend(0.0f, v, mask);
} }
...@@ -333,7 +331,7 @@ static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) { ...@@ -333,7 +331,7 @@ static inline fvec8 blendZero(const fvec8 v, const fvec8 mask) {
* of vectors. The first result vector contains the values at the given indexes, and the second * of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1. * result vector contains the values from each respective index+1.
*/ */
static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& out0, fvec8& out1) { static inline void gatherVecPair(const float* table, ivec8 index, fvec8& out0, fvec8& out1) {
const auto lower = index.lowerVec(); const auto lower = index.lowerVec();
const auto upper = index.upperVec(); const auto upper = index.upperVec();
...@@ -368,7 +366,7 @@ static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& o ...@@ -368,7 +366,7 @@ static inline void gatherVecPair(const float* table, const ivec8 index, fvec8& o
* output[2] = (Z0 + Z1 + Z2 + ...) * output[2] = (Z0 + Z1 + Z2 + ...)
* output[3] = undefined * output[3] = undefined
*/ */
static inline fvec4 reduceToVec3(const fvec8 x, const fvec8 y, const fvec8 z) { static inline fvec4 reduceToVec3(fvec8 x, fvec8 y, fvec8 z) {
// The general strategy for a vector reduce-add operation is to take values from // The general strategy for a vector reduce-add operation is to take values from
// different parts of the vector and overlap them a different part of the vector and then // different parts of the vector and overlap them a different part of the vector and then
// add together. Repeat this several times until all values have been summed. Initially 8 // add together. Repeat this several times until all values have been summed. Initially 8
...@@ -415,4 +413,4 @@ static inline fvec4 reduceToVec3(const fvec8 x, const fvec8 y, const fvec8 z) { ...@@ -415,4 +413,4 @@ static inline fvec4 reduceToVec3(const fvec8 x, const fvec8 y, const fvec8 z) {
return laneResult.lowerVec() + laneResult.upperVec(); return laneResult.lowerVec() + laneResult.upperVec();
} }
#endif /*OPENMM_VECTORIZE8_H_*/ #endif /*OPENMM_VECTORIZEAVX_H_*/
#ifndef OPENMM_VECTORIZE_AVX2_H_
#define OPENMM_VECTORIZE_AVX2_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013-2014 Stanford University and the Authors. *
* Authors: Daniel Towner *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "vectorizeAvx.h"
#include <immintrin.h>
// This file defines classes and functions to simplify vectorizing code with AVX.
bool isAvx2Supported() {
// Provide an alternative implementation of CPUID to support AVX2. On older
// non-Windows OSes the hardware.h support for CPUID doesn't set the CX register
// properly and gives the wrong answer when detecting AVX2 and beyond. On Windows
// the cpuid seems to work as expected so can be used.
#if !(defined(_WIN32) || defined(WIN32))
auto cpuid = [](int output[4], int functionnumber) {
int a, b, c, d;
__asm("cpuid" : "=a"(a),"=b"(b),"=c"(c),"=d"(d) : "a"(functionnumber), "c"(0) : );
output[0] = a;
output[1] = b;
output[2] = c;
output[3] = d;
};
#endif
int cpuInfo[4];
cpuid(cpuInfo, 0);
if (cpuInfo[0] >= 7) {
cpuInfo[2] = 0;
cpuid(cpuInfo, 7);
return ((cpuInfo[1] & ((int) 1 << 5)) != 0);
}
return false;
}
/**
* Derive from fvec8 so that default implementations of everything are provided,
* but can be overriden with AVX2-specific variants where possible.
*/
class fvecAvx2 : public fvec8 {
public:
fvecAvx2() = default;
fvecAvx2(fvec8 v) : fvec8(v) {}
fvecAvx2(float v) : fvec8(v) {}
fvecAvx2(float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8) : fvec8(v8, v7, v6, v5, v4, v3, v2, v1) {}
fvecAvx2(__m256 v) : fvec8(v) {}
fvecAvx2(const float* v) : fvec8(v) {}
/** Create a vector by gathering individual indexes of data from a table. Element i of the vector will
* be loaded from table[idx[i]].
* @param table The table from which to do a lookup.
* @param indexes The indexes to gather.
*/
fvecAvx2(const float* table, const int idx[8])
: fvec8(_mm256_i32gather_ps(table, _mm256_loadu_si256((const __m256i*)idx), 4)) {}
static fvecAvx2 expandBitsToMask(int bitmask);
};
inline fvecAvx2 fvecAvx2::expandBitsToMask(int bitmask) {
// Put a copy of all bits into each vector element and then shift so that the
// appropriate sub-bit becomes the MSB. For masking purposes, only the MSB matters and
// the other bits can be completely arbitrary.
const auto msb = _mm256_sllv_epi32(_mm256_set1_epi8(bitmask),
_mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0));
return _mm256_castsi256_ps(msb);
}
/**
* Given a table of floating-point values and a set of indexes, perform a gather read into a pair
* of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1.
*/
static inline void gatherVecPair(const float* table, ivec8 index, fvecAvx2& out0, fvecAvx2& out1) {
const double* tableAsDbl = (const double*)table;
// The input is a set of 8 indexes, each of which refers to a pair of floating point
// values. The most efficient way to load from indexes in a vector is the gather instruction,
// and the 64-bit variant should be used to get the pairs.
// Given indexes ABCDEFGH, load the pairs corresponding to A C E G. This gives a set of
// 4 pairs. The high indexes (in the upper part of each 64-bit index) are cleared.
const auto lowerIdx = _mm256_and_si256(index, _mm256_set1_epi64x(0xFFFFFFFF));
const auto lowerGather = _mm256_castpd_ps(_mm256_i64gather_pd(tableAsDbl, lowerIdx, 4));
// Load indexes B D F H, this time by shifting the high 32-bit indexes into the lower 32-bits.
const auto upperIdx = _mm256_srli_epi64(index, 32);
const auto upperGather = _mm256_castpd_ps(_mm256_i64gather_pd(tableAsDbl, upperIdx, 4));
// All the first values must now be brought together. The lower values are already in the
// correct place, but the upper gather values must be moved over and blended in.
const auto swapUpper = _mm256_permute_ps(upperGather, 0b10110001);
out0 = fvecAvx2(_mm256_blend_ps(lowerGather, swapUpper, 0b10101010));
// And the same for the upper values.
const auto swapLower = _mm256_permute_ps(lowerGather, 0b10110001);
out1 = fvecAvx2(_mm256_blend_ps(swapLower, upperGather, 0b10101010));
}
#endif /*OPENMM_VECTORIZE_AVX2_H_*/
...@@ -76,10 +76,7 @@ public: ...@@ -76,10 +76,7 @@ public:
fvec4() = default; fvec4() = default;
fvec4(float v) : val(vdupq_n_f32(v)) {} fvec4(float v) : val(vdupq_n_f32(v)) {}
fvec4(float v1, float v2, float v3, float v4) { fvec4(float v1, float v2, float v3, float v4) : val {v1, v2, v3, v4} {}
float v[] = {v1, v2, v3, v4};
val = vld1q_f32(v);
}
fvec4(float32x4_t v) : val(v) {} fvec4(float32x4_t v) : val(v) {}
fvec4(const float* v) : val(vld1q_f32(v)) {} fvec4(const float* v) : val(vld1q_f32(v)) {}
operator float32x4_t() const { operator float32x4_t() const {
...@@ -92,7 +89,7 @@ public: ...@@ -92,7 +89,7 @@ public:
* @param table The table from which to do a lookup. * @param table The table from which to do a lookup.
* @param indexes The indexes to gather. * @param indexes The indexes to gather.
*/ */
fvec4(const float* table, const int idx[4]) fvec4(const float* table, const int32_t idx[4])
: fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { } : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
float operator[](int i) const { float operator[](int i) const {
...@@ -121,16 +118,16 @@ public: ...@@ -121,16 +118,16 @@ public:
v[2] = vgetq_lane_f32(val, 2); v[2] = vgetq_lane_f32(val, 2);
} }
fvec4 operator+(const fvec4& other) const { fvec4 operator+(fvec4 other) const {
return vaddq_f32(val, other); return vaddq_f32(val, other);
} }
fvec4 operator-(const fvec4& other) const { fvec4 operator-(fvec4 other) const {
return vsubq_f32(val, other); return vsubq_f32(val, other);
} }
fvec4 operator*(const fvec4& other) const { fvec4 operator*(fvec4 other) const {
return vmulq_f32(val, other); return vmulq_f32(val, other);
} }
fvec4 operator/(const fvec4& other) const { fvec4 operator/(fvec4 other) const {
// NEON does not have a divide float-point operator, so we get the reciprocal and multiply. // NEON does not have a divide float-point operator, so we get the reciprocal and multiply.
float32x4_t reciprocal = vrecpeq_f32(other); float32x4_t reciprocal = vrecpeq_f32(other);
...@@ -139,45 +136,34 @@ public: ...@@ -139,45 +136,34 @@ public:
fvec4 result = vmulq_f32(val,reciprocal); fvec4 result = vmulq_f32(val,reciprocal);
return result; return result;
} }
void operator+=(const fvec4& other) { void operator+=(fvec4 other) {
val = vaddq_f32(val, other); val = vaddq_f32(val, other);
} }
void operator-=(const fvec4& other) { void operator-=(fvec4 other) {
val = vsubq_f32(val, other); val = vsubq_f32(val, other);
} }
void operator*=(const fvec4& other) { void operator*=(fvec4 other) {
val = vmulq_f32(val, other); val = vmulq_f32(val, other);
} }
void operator/=(const fvec4& other) { void operator/=(fvec4 other) {
val = *this/other; val = *this/other;
} }
fvec4 operator-() const { fvec4 operator-() const {
return vnegq_f32(val); return vnegq_f32(val);
} }
fvec4 operator&(const fvec4& other) const { fvec4 operator&(fvec4 other) const {
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other))); return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
} }
fvec4 operator|(const fvec4& other) const { fvec4 operator|(fvec4 other) const {
return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other))); return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(val), vreinterpretq_u32_f32(other)));
} }
fvec4 operator==(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vceqq_f32(val, other))); ivec4 operator==(fvec4 other) const;
} ivec4 operator!=(fvec4 other) const;
fvec4 operator!=(const fvec4& other) const { ivec4 operator>(fvec4 other) const;
return vcvtq_f32_s32(vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(val, other)))); // not(equals(val, other)) ivec4 operator<(fvec4 other) const;
} ivec4 operator>=(fvec4 other) const;
fvec4 operator>(const fvec4& other) const { ivec4 operator<=(fvec4 other) const;
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgtq_f32(val, other)));
}
fvec4 operator<(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcltq_f32(val, other)));
}
fvec4 operator>=(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcgeq_f32(val, other)));
}
fvec4 operator<=(const fvec4& other) const {
return vcvtq_f32_s32(vreinterpretq_s32_u32(vcleq_f32(val, other)));
}
operator ivec4() const; operator ivec4() const;
/** /**
...@@ -198,10 +184,7 @@ public: ...@@ -198,10 +184,7 @@ public:
ivec4() {} ivec4() {}
ivec4(int v) : val(vdupq_n_s32(v)) {} ivec4(int v) : val(vdupq_n_s32(v)) {}
ivec4(int v1, int v2, int v3, int v4) { ivec4(int v1, int v2, int v3, int v4) : val {v1, v2, v3, v4} {}
int v[] = {v1, v2, v3, v4};
val = vld1q_s32(v);
}
ivec4(int32x4_t v) : val(v) {} ivec4(int32x4_t v) : val(v) {}
ivec4(const int* v) : val(vld1q_s32(v)) {} ivec4(const int* v) : val(vld1q_s32(v)) {}
operator int32x4_t() const { operator int32x4_t() const {
...@@ -223,49 +206,49 @@ public: ...@@ -223,49 +206,49 @@ public:
void store(int* v) const { void store(int* v) const {
vst1q_s32(v, val); vst1q_s32(v, val);
} }
ivec4 operator+(const ivec4& other) const { ivec4 operator+(ivec4 other) const {
return vaddq_s32(val, other); return vaddq_s32(val, other);
} }
ivec4 operator-(const ivec4& other) const { ivec4 operator-(ivec4 other) const {
return vsubq_s32(val, other); return vsubq_s32(val, other);
} }
ivec4 operator*(const ivec4& other) const { ivec4 operator*(ivec4 other) const {
return vmulq_s32(val, other); return vmulq_s32(val, other);
} }
void operator+=(const ivec4& other) { void operator+=(ivec4 other) {
val = vaddq_s32(val, other); val = vaddq_s32(val, other);
} }
void operator-=(const ivec4& other) { void operator-=(ivec4 other) {
val = vsubq_s32(val, other); val = vsubq_s32(val, other);
} }
void operator*=(const ivec4& other) { void operator*=(ivec4 other) {
val = vmulq_s32(val, other); val = vmulq_s32(val, other);
} }
ivec4 operator-() const { ivec4 operator-() const {
return vnegq_s32(val); return vnegq_s32(val);
} }
ivec4 operator&(const ivec4& other) const { ivec4 operator&(ivec4 other) const {
return vandq_s32(val, other); return vandq_s32(val, other);
} }
ivec4 operator|(const ivec4& other) const { ivec4 operator|(ivec4 other) const {
return vorrq_s32(val, other); return vorrq_s32(val, other);
} }
ivec4 operator==(const ivec4& other) const { ivec4 operator==(ivec4 other) const {
return vreinterpretq_s32_u32(vceqq_s32(val, other)); return vreinterpretq_s32_u32(vceqq_s32(val, other));
} }
ivec4 operator!=(const ivec4& other) const { ivec4 operator!=(ivec4 other) const {
return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(val, other))); // not(equal(val, other)) return vreinterpretq_s32_u32(vmvnq_u32(vceqq_s32(val, other))); // not(equal(val, other))
} }
ivec4 operator>(const ivec4& other) const { ivec4 operator>(ivec4 other) const {
return vreinterpretq_s32_u32(vcgtq_s32(val, other)); return vreinterpretq_s32_u32(vcgtq_s32(val, other));
} }
ivec4 operator<(const ivec4& other) const { ivec4 operator<(ivec4 other) const {
return vreinterpretq_s32_u32(vcltq_s32(val, other)); return vreinterpretq_s32_u32(vcltq_s32(val, other));
} }
ivec4 operator>=(const ivec4& other) const { ivec4 operator>=(ivec4 other) const {
return vreinterpretq_s32_u32(vcgeq_s32(val, other)); return vreinterpretq_s32_u32(vcgeq_s32(val, other));
} }
ivec4 operator<=(const ivec4& other) const { ivec4 operator<=(ivec4 other) const {
return vreinterpretq_s32_u32(vcleq_s32(val, other)); return vreinterpretq_s32_u32(vcleq_s32(val, other));
} }
operator fvec4() const; operator fvec4() const;
...@@ -287,54 +270,84 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) { ...@@ -287,54 +270,84 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
bitmask & 4 ? -1 : 0, bitmask & 4 ? -1 : 0,
bitmask & 8 ? -1 : 0); bitmask & 8 ? -1 : 0);
} }
// Comparison operators
inline ivec4 fvec4::operator==(fvec4 other) const {
return vreinterpretq_s32_u32(vceqq_f32(val, other));
}
inline ivec4 fvec4::operator!=(fvec4 other) const {
return vreinterpretq_s32_u32(vmvnq_u32(vceqq_f32(val, other))); // not(equals(val, other))
}
inline ivec4 fvec4::operator>(fvec4 other) const {
return vreinterpretq_s32_u32(vcgtq_f32(val, other));
}
inline ivec4 fvec4::operator<(fvec4 other) const {
return vreinterpretq_s32_u32(vcltq_f32(val, other));
}
inline ivec4 fvec4::operator>=(fvec4 other) const {
return vreinterpretq_s32_u32(vcgeq_f32(val, other));
}
inline ivec4 fvec4::operator<=(fvec4 other) const {
return vreinterpretq_s32_u32(vcleq_f32(val, other));
}
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { static inline fvec4 min(fvec4 v1, fvec4 v2) {
return vminq_f32(v1, v2); return vminq_f32(v1, v2);
} }
static inline fvec4 max(const fvec4& v1, const fvec4& v2) { static inline fvec4 max(fvec4 v1, fvec4 v2) {
return vmaxq_f32(v1, v2); return vmaxq_f32(v1, v2);
} }
static inline fvec4 abs(const fvec4& v) { static inline fvec4 abs(fvec4 v) {
return vabsq_f32(v); return vabsq_f32(v);
} }
static inline fvec4 rsqrt(const fvec4& v) { static inline fvec4 rsqrt(fvec4 v) {
float32x4_t recipSqrt = vrsqrteq_f32(v); float32x4_t recipSqrt = vrsqrteq_f32(v);
recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt)); recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt)); recipSqrt = vmulq_f32(recipSqrt, vrsqrtsq_f32(vmulq_f32(recipSqrt, v), recipSqrt));
return recipSqrt; return recipSqrt;
} }
static inline fvec4 sqrt(const fvec4& v) { static inline fvec4 sqrt(fvec4 v) {
return rsqrt(v)*v; return rsqrt(v)*v;
} }
static inline fvec4 exp(const fvec4& v) { static inline fvec4 exp(fvec4 v) {
return fvec4(exp_ps(v.val)); return fvec4(exp_ps(v.val));
} }
static inline fvec4 log(const fvec4& v) { static inline fvec4 log(fvec4 v) {
return fvec4(log_ps(v.val)); return fvec4(log_ps(v.val));
} }
static inline float dot3(const fvec4& v1, const fvec4& v2) { static inline float dot3(fvec4 v1, fvec4 v2) {
fvec4 result = v1*v2; fvec4 result = v1*v2;
return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2); return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2);
} }
static inline float dot4(const fvec4& v1, const fvec4& v2) { static inline float dot4(fvec4 v1, fvec4 v2) {
fvec4 result = v1*v2; fvec4 result = v1*v2;
return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3); return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2) + vgetq_lane_f32(result,3);
} }
static inline float reduceAdd(const fvec4 v) { static inline float reduceAdd(fvec4 v) {
#ifdef __ARM64__
return vaddvq_f32(v);
#else
return dot4(v, fvec4(1.0f)); return dot4(v, fvec4(1.0f));
#endif
} }
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(fvec4 v1, fvec4 v2) {
return fvec4(v1[1]*v2[2] - v1[2]*v2[1], return fvec4(v1[1]*v2[2] - v1[2]*v2[1],
v1[2]*v2[0] - v1[0]*v2[2], v1[2]*v2[0] - v1[0]*v2[2],
v1[0]*v2[1] - v1[1]*v2[0], 0); v1[0]*v2[1] - v1[1]*v2[0], 0);
...@@ -362,71 +375,79 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, ...@@ -362,71 +375,79 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/** /**
* Out-of-place transpose from named variables into an array. * Out-of-place transpose from named variables into an array.
*/ */
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) { static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3; out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]); transpose(out[0], out[1], out[2], out[3]);
} }
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(ivec4 v1, ivec4 v2) {
return vminq_s32(v1, v2); return vminq_s32(v1, v2);
} }
static inline ivec4 max(const ivec4& v1, const ivec4& v2) { static inline ivec4 max(ivec4 v1, ivec4 v2) {
return vmaxq_s32(v1, v2); return vmaxq_s32(v1, v2);
} }
static inline ivec4 abs(const ivec4& v) { static inline ivec4 abs(ivec4 v) {
return vabdq_s32(v, ivec4(0)); return vabdq_s32(v, ivec4(0));
} }
static inline bool any(const ivec4& v) { static inline bool any(ivec4 v) {
#ifdef __ARM64__
return (vmaxvq_u32(vreinterpretq_u32_s32(v)) != 0);
#else
return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0); return (vgetq_lane_s32(v, 0) != 0 || vgetq_lane_s32(v, 1) != 0 || vgetq_lane_s32(v, 2) != 0 || vgetq_lane_s32(v, 3) != 0);
#endif
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, const fvec4& v2) { static inline fvec4 operator+(float v1, fvec4 v2) {
return fvec4(v1)+v2; return fvec4(v1)+v2;
} }
static inline fvec4 operator-(float v1, const fvec4& v2) { static inline fvec4 operator-(float v1, fvec4 v2) {
return fvec4(v1)-v2; return fvec4(v1)-v2;
} }
static inline fvec4 operator*(float v1, const fvec4& v2) { static inline fvec4 operator*(float v1, fvec4 v2) {
return fvec4(v1)*v2; return fvec4(v1)*v2;
} }
static inline fvec4 operator/(float v1, const fvec4& v2) { static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2; return fvec4(v1)/v2;
} }
// Operations for blending fvec4s based on an ivec4. // Operations for blending fvec4s based on an ivec4.
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const ivec4& mask) { static inline fvec4 blend(fvec4 v1, fvec4 v2, ivec4 mask) {
return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1); return vbslq_f32(vreinterpretq_u32_s32(mask), v2, v1);
} }
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) { static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
return blend(0.0f, v, mask); return vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), mask));
}
static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
return v & mask;
} }
// These are at the end since they involve other functions defined above. // These are at the end since they involve other functions defined above.
static inline fvec4 round(const fvec4& v) { static inline fvec4 round(fvec4 v) {
fvec4 shift(0x1.0p23f); fvec4 shift(0x1.0p23f);
fvec4 absResult = (abs(v)+shift)-shift; fvec4 absResult = (abs(v)+shift)-shift;
return blend(v, absResult, ivec4(0x7FFFFFFF)); return blend(v, absResult, ivec4(0x7FFFFFFF));
} }
static inline fvec4 floor(const fvec4& v) { static inline fvec4 floor(fvec4 v) {
fvec4 rounded = round(v); fvec4 rounded = round(v);
return rounded + blend(0.0f, -1.0f, rounded>v); return rounded + blend(0.0f, -1.0f, rounded>v);
} }
static inline fvec4 ceil(const fvec4& v) { static inline fvec4 ceil(fvec4 v) {
fvec4 rounded = round(v); fvec4 rounded = round(v);
return rounded + blend(0.0f, 1.0f, rounded<v); return rounded + blend(0.0f, 1.0f, rounded<v);
} }
...@@ -435,7 +456,7 @@ static inline fvec4 ceil(const fvec4& v) { ...@@ -435,7 +456,7 @@ static inline fvec4 ceil(const fvec4& v) {
* of vectors. The first result vector contains the values at the given indexes, and the second * of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1. * result vector contains the values from each respective index+1.
*/ */
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) { static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]); fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]); fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]); fvec4 t2(table + index[2]);
...@@ -458,7 +479,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o ...@@ -458,7 +479,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3) * output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined * output[3] = undefined
*/ */
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) { static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
const auto nx = reduceAdd(x); const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y); const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z); const auto nz = reduceAdd(z);
......
...@@ -74,7 +74,7 @@ public: ...@@ -74,7 +74,7 @@ public:
* @param table The table from which to do a lookup. * @param table The table from which to do a lookup.
* @param indexes The indexes to gather. * @param indexes The indexes to gather.
*/ */
fvec4(const float* table, const int idx[4]) fvec4(const float* table, const int32_t idx[4])
: fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { } : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
operator __m128() const { operator __m128() const {
...@@ -95,45 +95,45 @@ public: ...@@ -95,45 +95,45 @@ public:
v[1] = val[1]; v[1] = val[1];
v[2] = val[2]; v[2] = val[2];
} }
fvec4 operator+(const fvec4& other) const { fvec4 operator+(fvec4 other) const {
return val+other; return val+other;
} }
fvec4 operator-(const fvec4& other) const { fvec4 operator-(fvec4 other) const {
return val-other; return val-other;
} }
fvec4 operator*(const fvec4& other) const { fvec4 operator*(fvec4 other) const {
return val*other; return val*other;
} }
fvec4 operator/(const fvec4& other) const { fvec4 operator/(fvec4 other) const {
return val/other; return val/other;
} }
void operator+=(const fvec4& other) { void operator+=(fvec4 other) {
val = val+other; val = val+other;
} }
void operator-=(const fvec4& other) { void operator-=(fvec4 other) {
val = val-other; val = val-other;
} }
void operator*=(const fvec4& other) { void operator*=(fvec4 other) {
val = val*other; val = val*other;
} }
void operator/=(const fvec4& other) { void operator/=(fvec4 other) {
val = val/other; val = val/other;
} }
fvec4 operator-() const { fvec4 operator-() const {
return -val; return -val;
} }
fvec4 operator&(const fvec4& other) const { fvec4 operator&(fvec4 other) const {
return (fvec4) (((__m128i)val)&((__m128i)other.val)); return (fvec4) (((__m128i)val)&((__m128i)other.val));
} }
fvec4 operator|(const fvec4& other) const { fvec4 operator|(fvec4 other) const {
return (fvec4) (((__m128i)val)|((__m128i)other.val)); return (fvec4) (((__m128i)val)|((__m128i)other.val));
} }
ivec4 operator==(const fvec4& other) const; ivec4 operator==(fvec4 other) const;
ivec4 operator!=(const fvec4& other) const; ivec4 operator!=(fvec4 other) const;
ivec4 operator>(const fvec4& other) const; ivec4 operator>(fvec4 other) const;
ivec4 operator<(const fvec4& other) const; ivec4 operator<(fvec4 other) const;
ivec4 operator>=(const fvec4& other) const; ivec4 operator>=(fvec4 other) const;
ivec4 operator<=(const fvec4& other) const; ivec4 operator<=(fvec4 other) const;
operator ivec4() const; operator ivec4() const;
/** /**
...@@ -171,49 +171,49 @@ public: ...@@ -171,49 +171,49 @@ public:
void store(int* v) const { void store(int* v) const {
*((__m128*) v) = val; *((__m128*) v) = val;
} }
ivec4 operator+(const ivec4& other) const { ivec4 operator+(ivec4 other) const {
return val+other; return val+other;
} }
ivec4 operator-(const ivec4& other) const { ivec4 operator-(ivec4 other) const {
return val-other; return val-other;
} }
ivec4 operator*(const ivec4& other) const { ivec4 operator*(ivec4 other) const {
return val*other; return val*other;
} }
void operator+=(const ivec4& other) { void operator+=(ivec4 other) {
val = val+other; val = val+other;
} }
void operator-=(const ivec4& other) { void operator-=(ivec4 other) {
val = val-other; val = val-other;
} }
void operator*=(const ivec4& other) { void operator*=(ivec4 other) {
val = val*other; val = val*other;
} }
ivec4 operator-() const { ivec4 operator-() const {
return -val; return -val;
} }
ivec4 operator&(const ivec4& other) const { ivec4 operator&(ivec4 other) const {
return val&other.val; return val&other.val;
} }
ivec4 operator|(const ivec4& other) const { ivec4 operator|(ivec4 other) const {
return val|other.val; return val|other.val;
} }
ivec4 operator==(const ivec4& other) const { ivec4 operator==(ivec4 other) const {
return (val==other.val); return (val==other.val);
} }
ivec4 operator!=(const ivec4& other) const { ivec4 operator!=(ivec4 other) const {
return (val!=other.val); return (val!=other.val);
} }
ivec4 operator>(const ivec4& other) const { ivec4 operator>(ivec4 other) const {
return (val>other.val); return (val>other.val);
} }
ivec4 operator<(const ivec4& other) const { ivec4 operator<(ivec4 other) const {
return (val<other.val); return (val<other.val);
} }
ivec4 operator>=(const ivec4& other) const { ivec4 operator>=(ivec4 other) const {
return (val>=other.val); return (val>=other.val);
} }
ivec4 operator<=(const ivec4& other) const { ivec4 operator<=(ivec4 other) const {
return (val<=other.val); return (val<=other.val);
} }
operator fvec4() const; operator fvec4() const;
...@@ -221,27 +221,27 @@ public: ...@@ -221,27 +221,27 @@ public:
// Conversion operators. // Conversion operators.
inline ivec4 fvec4::operator==(const fvec4& other) const { inline ivec4 fvec4::operator==(fvec4 other) const {
return (__m128i) (val==other.val); return (__m128i) (val==other.val);
} }
inline ivec4 fvec4::operator!=(const fvec4& other) const { inline ivec4 fvec4::operator!=(fvec4 other) const {
return (__m128i) (val!=other.val); return (__m128i) (val!=other.val);
} }
inline ivec4 fvec4::operator>(const fvec4& other) const { inline ivec4 fvec4::operator>(fvec4 other) const {
return (__m128i) (val>other.val); return (__m128i) (val>other.val);
} }
inline ivec4 fvec4::operator<(const fvec4& other) const { inline ivec4 fvec4::operator<(fvec4 other) const {
return (__m128i) (val<other.val); return (__m128i) (val<other.val);
} }
inline ivec4 fvec4::operator>=(const fvec4& other) const { inline ivec4 fvec4::operator>=(fvec4 other) const {
return (__m128i) (val>=other.val); return (__m128i) (val>=other.val);
} }
inline ivec4 fvec4::operator<=(const fvec4& other) const { inline ivec4 fvec4::operator<=(fvec4 other) const {
return (__m128i) (val<=other.val); return (__m128i) (val<=other.val);
} }
...@@ -262,34 +262,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) { ...@@ -262,34 +262,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
static inline fvec4 abs(const fvec4& v) { static inline fvec4 abs(fvec4 v) {
return v&(__m128) ivec4(0x7FFFFFFF); return v&(__m128) ivec4(0x7FFFFFFF);
} }
static inline fvec4 exp(const fvec4& v) { static inline fvec4 exp(fvec4 v) {
return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3])); return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3]));
} }
static inline fvec4 log(const fvec4& v) { static inline fvec4 log(fvec4 v) {
return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3])); return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3]));
} }
static inline float dot3(const fvec4& v1, const fvec4& v2) { static inline float dot3(fvec4 v1, fvec4 v2) {
fvec4 r = v1*v2; fvec4 r = v1*v2;
return r[0]+r[1]+r[2]; return r[0]+r[1]+r[2];
} }
static inline float dot4(const fvec4& v1, const fvec4& v2) { static inline float dot4(fvec4 v1, fvec4 v2) {
fvec4 r = v1*v2; fvec4 r = v1*v2;
fvec4 temp = __builtin_shufflevector(r.val, r.val, 0, 1, -1, -1)+__builtin_shufflevector(r.val, r.val, 2, 3, -1, -1); fvec4 temp = __builtin_shufflevector(r.val, r.val, 0, 1, -1, -1)+__builtin_shufflevector(r.val, r.val, 2, 3, -1, -1);
return temp[0]+temp[1]; return temp[0]+temp[1];
} }
static inline float reduceAdd(const fvec4 v) { static inline float reduceAdd(fvec4 v) {
return dot4(v, fvec4(1.0f)); return dot4(v, fvec4(1.0f));
} }
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(fvec4 v1, fvec4 v2) {
__m128 temp = v2.val*__builtin_shufflevector(v1.val, v1.val, 2, 0, 1, 3) - __m128 temp = v2.val*__builtin_shufflevector(v1.val, v1.val, 2, 0, 1, 3) -
v1.val*__builtin_shufflevector(v2.val, v2.val, 2, 0, 1, 3); v1.val*__builtin_shufflevector(v2.val, v2.val, 2, 0, 1, 3);
return __builtin_shufflevector(temp, temp, 2, 0, 1, 3); return __builtin_shufflevector(temp, temp, 2, 0, 1, 3);
...@@ -317,85 +317,89 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, ...@@ -317,85 +317,89 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/** /**
* Out-of-place transpose from named variables into an array. * Out-of-place transpose from named variables into an array.
*/ */
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) { static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3; out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]); transpose(out[0], out[1], out[2], out[3]);
} }
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(ivec4 v1, ivec4 v2) {
return ivec4(std::min(v1[0], v2[0]), std::min(v1[1], v2[1]), std::min(v1[2], v2[2]), std::min(v1[3], v2[3])); return ivec4(std::min(v1[0], v2[0]), std::min(v1[1], v2[1]), std::min(v1[2], v2[2]), std::min(v1[3], v2[3]));
} }
static inline ivec4 max(const ivec4& v1, const ivec4& v2) { static inline ivec4 max(ivec4 v1, ivec4 v2) {
return ivec4(std::max(v1[0], v2[0]), std::max(v1[1], v2[1]), std::max(v1[2], v2[2]), std::max(v1[3], v2[3])); return ivec4(std::max(v1[0], v2[0]), std::max(v1[1], v2[1]), std::max(v1[2], v2[2]), std::max(v1[3], v2[3]));
} }
static inline ivec4 abs(const ivec4& v) { static inline ivec4 abs(ivec4 v) {
return ivec4(abs(v[0]), abs(v[1]), abs(v[2]), abs(v[3])); return ivec4(abs(v[0]), abs(v[1]), abs(v[2]), abs(v[3]));
} }
static inline bool any(const __m128i& v) { static inline bool any(__m128i v) {
ivec4 temp = __builtin_shufflevector(v, v, 0, 1, -1, -1) | __builtin_shufflevector(v, v, 2, 3, -1, -1); ivec4 temp = __builtin_shufflevector(v, v, 0, 1, -1, -1) | __builtin_shufflevector(v, v, 2, 3, -1, -1);
return (temp[0] || temp[1]); return (temp[0] || temp[1]);
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, const fvec4& v2) { static inline fvec4 operator+(float v1, fvec4 v2) {
return fvec4(v1)+v2; return fvec4(v1)+v2;
} }
static inline fvec4 operator-(float v1, const fvec4& v2) { static inline fvec4 operator-(float v1, fvec4 v2) {
return fvec4(v1)-v2; return fvec4(v1)-v2;
} }
static inline fvec4 operator*(float v1, const fvec4& v2) { static inline fvec4 operator*(float v1, fvec4 v2) {
return fvec4(v1)*v2; return fvec4(v1)*v2;
} }
static inline fvec4 operator/(float v1, const fvec4& v2) { static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2; return fvec4(v1)/v2;
} }
// Operations for blending fvec4s based on an ivec4. // Operations for blending fvec4s based on an ivec4.
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask) { static inline fvec4 blend(fvec4 v1, fvec4 v2, __m128i mask) {
return (__m128) ((mask&(__m128i)v2) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1)); return (__m128) ((mask&(__m128i)v2) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1));
} }
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) { static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
return blend(0.0f, v, mask); return blend(0.0f, v, mask);
} }
static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
return v & mask;
}
// These are at the end since they involve other functions defined above. // These are at the end since they involve other functions defined above.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { static inline fvec4 min(fvec4 v1, fvec4 v2) {
return blend(v1, v2, v1 > v2); return blend(v1, v2, v1 > v2);
} }
static inline fvec4 max(const fvec4& v1, const fvec4& v2) { static inline fvec4 max(fvec4 v1, fvec4 v2) {
return blend(v1, v2, v1 < v2); return blend(v1, v2, v1 < v2);
} }
static inline fvec4 round(const fvec4& v) { static inline fvec4 round(fvec4 v) {
fvec4 shift(0x1.0p23f); fvec4 shift(0x1.0p23f);
fvec4 absResult = (abs(v)+shift)-shift; fvec4 absResult = (abs(v)+shift)-shift;
return (__m128) ((ivec4(0x80000000)&(__m128i)v) + (ivec4(0x7FFFFFFF)&(__m128i)absResult)); return (__m128) ((ivec4(0x80000000)&(__m128i)v) + (ivec4(0x7FFFFFFF)&(__m128i)absResult));
} }
static inline fvec4 floor(const fvec4& v) { static inline fvec4 floor(fvec4 v) {
fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128); fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128);
return truncated + blend(0.0f, -1.0f, truncated>v); return truncated + blend(0.0f, -1.0f, truncated>v);
} }
static inline fvec4 ceil(const fvec4& v) { static inline fvec4 ceil(fvec4 v) {
fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128); fvec4 truncated = __builtin_convertvector(__builtin_convertvector(v.val, __m128i), __m128);
return truncated + blend(0.0f, 1.0f, truncated<v); return truncated + blend(0.0f, 1.0f, truncated<v);
} }
static inline fvec4 rsqrt(const fvec4& v) { static inline fvec4 rsqrt(fvec4 v) {
// Initial estimate of rsqrt(). // Initial estimate of rsqrt().
ivec4 i = (__m128i) v; ivec4 i = (__m128i) v;
...@@ -411,7 +415,7 @@ static inline fvec4 rsqrt(const fvec4& v) { ...@@ -411,7 +415,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
return y; return y;
} }
static inline fvec4 sqrt(const fvec4& v) { static inline fvec4 sqrt(fvec4 v) {
return rsqrt(v)*v; return rsqrt(v)*v;
} }
...@@ -420,7 +424,7 @@ static inline fvec4 sqrt(const fvec4& v) { ...@@ -420,7 +424,7 @@ static inline fvec4 sqrt(const fvec4& v) {
* of vectors. The first result vector contains the values at the given indexes, and the second * of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1. * result vector contains the values from each respective index+1.
*/ */
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) { static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]); fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]); fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]); fvec4 t2(table + index[2]);
...@@ -443,7 +447,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o ...@@ -443,7 +447,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3) * output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined * output[3] = undefined
*/ */
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) { static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
const auto nx = reduceAdd(x); const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y); const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z); const auto nz = reduceAdd(z);
......
...@@ -97,45 +97,45 @@ public: ...@@ -97,45 +97,45 @@ public:
v[2] = val[2]; v[2] = val[2];
} }
fvec4 operator+(const fvec4& other) const { fvec4 operator+(fvec4 other) const {
return vec_add(val, other.val); return vec_add(val, other.val);
} }
fvec4 operator-(const fvec4& other) const { fvec4 operator-(fvec4 other) const {
return vec_sub(val, other.val); return vec_sub(val, other.val);
} }
fvec4 operator*(const fvec4& other) const { fvec4 operator*(fvec4 other) const {
return vec_mul(val, other.val); return vec_mul(val, other.val);
} }
fvec4 operator/(const fvec4& other) const { fvec4 operator/(fvec4 other) const {
return vec_div(val, other.val); return vec_div(val, other.val);
} }
void operator+=(const fvec4& other) { void operator+=(fvec4 other) {
val = vec_add(val, other.val); val = vec_add(val, other.val);
} }
void operator-=(const fvec4& other) { void operator-=(fvec4 other) {
val = vec_sub(val, other.val); val = vec_sub(val, other.val);
} }
void operator*=(const fvec4& other) { void operator*=(fvec4 other) {
val = vec_mul(val, other.val); val = vec_mul(val, other.val);
} }
void operator/=(const fvec4& other) { void operator/=(fvec4 other) {
val = vec_div(val, other.val); val = vec_div(val, other.val);
} }
fvec4 operator-() const { fvec4 operator-() const {
return -val; return -val;
} }
fvec4 operator&(const fvec4& other) const { fvec4 operator&(fvec4 other) const {
return vec_and(val, other.val); return vec_and(val, other.val);
} }
fvec4 operator|(const fvec4& other) const { fvec4 operator|(fvec4 other) const {
return vec_or(val, other.val); return vec_or(val, other.val);
} }
ivec4 operator==(const fvec4& other) const; ivec4 operator==(fvec4 other) const;
ivec4 operator!=(const fvec4& other) const; ivec4 operator!=(fvec4 other) const;
ivec4 operator>(const fvec4& other) const; ivec4 operator>(fvec4 other) const;
ivec4 operator<(const fvec4& other) const; ivec4 operator<(fvec4 other) const;
ivec4 operator>=(const fvec4& other) const; ivec4 operator>=(fvec4 other) const;
ivec4 operator<=(const fvec4& other) const; ivec4 operator<=(fvec4 other) const;
operator ivec4() const; operator ivec4() const;
/*** /***
...@@ -173,49 +173,49 @@ public: ...@@ -173,49 +173,49 @@ public:
void store(int* v) const { void store(int* v) const {
*((__m128i*) v) = val; *((__m128i*) v) = val;
} }
ivec4 operator+(const ivec4& other) const { ivec4 operator+(ivec4 other) const {
return vec_add(val, other.val); return vec_add(val, other.val);
} }
ivec4 operator-(const ivec4& other) const { ivec4 operator-(ivec4 other) const {
return vec_sub(val, other.val); return vec_sub(val, other.val);
} }
ivec4 operator*(const ivec4& other) const { ivec4 operator*(ivec4 other) const {
return val*other.val; return val*other.val;
} }
void operator+=(const ivec4& other) { void operator+=(ivec4 other) {
val = vec_add(val, other.val); val = vec_add(val, other.val);
} }
void operator-=(const ivec4& other) { void operator-=(ivec4 other) {
val = vec_sub(val, other.val); val = vec_sub(val, other.val);
} }
void operator*=(const ivec4& other) { void operator*=(ivec4 other) {
val = val*other.val; val = val*other.val;
} }
ivec4 operator-() const { ivec4 operator-() const {
return -val; return -val;
} }
ivec4 operator&(const ivec4& other) const { ivec4 operator&(ivec4 other) const {
return val&other.val; return val&other.val;
} }
ivec4 operator|(const ivec4& other) const { ivec4 operator|(ivec4 other) const {
return val|other.val; return val|other.val;
} }
ivec4 operator==(const ivec4& other) const { ivec4 operator==(ivec4 other) const {
return (val==other.val); return (val==other.val);
} }
ivec4 operator!=(const ivec4& other) const { ivec4 operator!=(ivec4 other) const {
return (val!=other.val); return (val!=other.val);
} }
ivec4 operator>(const ivec4& other) const { ivec4 operator>(ivec4 other) const {
return (val>other.val); return (val>other.val);
} }
ivec4 operator<(const ivec4& other) const { ivec4 operator<(ivec4 other) const {
return (val<other.val); return (val<other.val);
} }
ivec4 operator>=(const ivec4& other) const { ivec4 operator>=(ivec4 other) const {
return (val>=other.val); return (val>=other.val);
} }
ivec4 operator<=(const ivec4& other) const { ivec4 operator<=(ivec4 other) const {
return (val<=other.val); return (val<=other.val);
} }
operator fvec4() const; operator fvec4() const;
...@@ -223,27 +223,27 @@ public: ...@@ -223,27 +223,27 @@ public:
// Conversion operators. // Conversion operators.
inline ivec4 fvec4::operator==(const fvec4& other) const { inline ivec4 fvec4::operator==(fvec4 other) const {
return (val==other.val); return (val==other.val);
} }
inline ivec4 fvec4::operator!=(const fvec4& other) const { inline ivec4 fvec4::operator!=(fvec4 other) const {
return (val!=other.val); return (val!=other.val);
} }
inline ivec4 fvec4::operator>(const fvec4& other) const { inline ivec4 fvec4::operator>(fvec4 other) const {
return (val>other.val); return (val>other.val);
} }
inline ivec4 fvec4::operator<(const fvec4& other) const { inline ivec4 fvec4::operator<(fvec4 other) const {
return (val<other.val); return (val<other.val);
} }
inline ivec4 fvec4::operator>=(const fvec4& other) const { inline ivec4 fvec4::operator>=(fvec4 other) const {
return (val>=other.val); return (val>=other.val);
} }
inline ivec4 fvec4::operator<=(const fvec4& other) const { inline ivec4 fvec4::operator<=(fvec4 other) const {
return (val<=other.val); return (val<=other.val);
} }
...@@ -264,34 +264,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) { ...@@ -264,34 +264,34 @@ inline ivec4 fvec4::expandBitsToMask(int bitmask) {
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
static inline fvec4 abs(const fvec4& v) { static inline fvec4 abs(fvec4 v) {
return vec_abs(v.val); return vec_abs(v.val);
} }
static inline fvec4 exp(const fvec4& v) { static inline fvec4 exp(fvec4 v) {
return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3])); return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3]));
} }
static inline fvec4 log(const fvec4& v) { static inline fvec4 log(fvec4 v) {
return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3])); return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3]));
} }
static inline float dot3(const fvec4& v1, const fvec4& v2) { static inline float dot3(fvec4 v1, fvec4 v2) {
fvec4 r = v1*v2; fvec4 r = v1*v2;
return r[0]+r[1]+r[2]; return r[0]+r[1]+r[2];
} }
static inline float dot4(const fvec4& v1, const fvec4& v2) { static inline float dot4(fvec4 v1, fvec4 v2) {
fvec4 r = v1*v2; fvec4 r = v1*v2;
fvec4 temp = r + vec_sld(r.val, r.val, 8); fvec4 temp = r + vec_sld(r.val, r.val, 8);
return temp[0]+temp[1]; return temp[0]+temp[1];
} }
static inline float reduceAdd(const fvec4 v) { static inline float reduceAdd(fvec4 v) {
return dot4(v, fvec4(1.0f)); return dot4(v, fvec4(1.0f));
} }
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(fvec4 v1, fvec4 v2) {
vector unsigned char perm = (vector unsigned char) {8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15}; vector unsigned char perm = (vector unsigned char) {8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15};
__m128 temp = v2.val*vec_perm(v1.val, v1.val, perm) - __m128 temp = v2.val*vec_perm(v1.val, v1.val, perm) -
v1.val*vec_perm(v2.val, v2.val, perm); v1.val*vec_perm(v2.val, v2.val, perm);
...@@ -324,80 +324,84 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, ...@@ -324,80 +324,84 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/** /**
* Out-of-place transpose from named variables into an array. * Out-of-place transpose from named variables into an array.
*/ */
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) { static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3; out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]); transpose(out[0], out[1], out[2], out[3]);
} }
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(ivec4 v1, ivec4 v2) {
return vec_min(v1.val, v2.val); return vec_min(v1.val, v2.val);
} }
static inline ivec4 max(const ivec4& v1, const ivec4& v2) { static inline ivec4 max(ivec4 v1, ivec4 v2) {
return vec_max(v1.val, v2.val); return vec_max(v1.val, v2.val);
} }
static inline ivec4 abs(const ivec4& v) { static inline ivec4 abs(ivec4 v) {
return vec_abs(v.val); return vec_abs(v.val);
} }
static inline bool any(const ivec4 v) { static inline bool any(ivec4 v) {
return !vec_all_eq(v.val, ivec4(0).val); return !vec_all_eq(v.val, ivec4(0).val);
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, const fvec4& v2) { static inline fvec4 operator+(float v1, fvec4 v2) {
return fvec4(v1)+v2; return fvec4(v1)+v2;
} }
static inline fvec4 operator-(float v1, const fvec4& v2) { static inline fvec4 operator-(float v1, fvec4 v2) {
return fvec4(v1)-v2; return fvec4(v1)-v2;
} }
static inline fvec4 operator*(float v1, const fvec4& v2) { static inline fvec4 operator*(float v1, fvec4 v2) {
return fvec4(v1)*v2; return fvec4(v1)*v2;
} }
static inline fvec4 operator/(float v1, const fvec4& v2) { static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2; return fvec4(v1)/v2;
} }
// Operations for blending fvec4s based on an ivec4. // Operations for blending fvec4s based on an ivec4.
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const __m128i& mask) { static inline fvec4 blend(fvec4 v1, fvec4 v2, __m128i mask) {
return (__m128) ((mask&(__m128i)v2.val) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1.val).val); return (__m128) ((mask&(__m128i)v2.val) + ((ivec4(0xFFFFFFFF)-ivec4(mask))&(__m128i)v1.val).val);
} }
static inline fvec4 blendZero(const fvec4 v, const ivec4 mask) { static inline fvec4 blendZero(fvec4 v, ivec4 mask) {
return blend(0.0f, v, mask); return blend(0.0f, v, mask);
} }
static inline ivec4 blendZero(ivec4 v, ivec4 mask) {
return v & mask;
}
// These are at the end since they involve other functions defined above. // These are at the end since they involve other functions defined above.
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { static inline fvec4 min(fvec4 v1, fvec4 v2) {
return vec_min(v1.val, v2.val); return vec_min(v1.val, v2.val);
} }
static inline fvec4 max(const fvec4& v1, const fvec4& v2) { static inline fvec4 max(fvec4 v1, fvec4 v2) {
return vec_max(v1.val, v2.val); return vec_max(v1.val, v2.val);
} }
static inline fvec4 round(const fvec4& v) { static inline fvec4 round(fvec4 v) {
return vec_round(v.val); return vec_round(v.val);
} }
static inline fvec4 floor(const fvec4& v) { static inline fvec4 floor(fvec4 v) {
return vec_floor(v.val); return vec_floor(v.val);
} }
static inline fvec4 ceil(const fvec4& v) { static inline fvec4 ceil(fvec4 v) {
return vec_ceil(v.val); return vec_ceil(v.val);
} }
static inline fvec4 rsqrt(const fvec4& v) { static inline fvec4 rsqrt(fvec4 v) {
// Initial estimate of rsqrt(). // Initial estimate of rsqrt().
fvec4 y(vec_rsqrte(v.val)); fvec4 y(vec_rsqrte(v.val));
...@@ -409,7 +413,7 @@ static inline fvec4 rsqrt(const fvec4& v) { ...@@ -409,7 +413,7 @@ static inline fvec4 rsqrt(const fvec4& v) {
return y; return y;
} }
static inline fvec4 sqrt(const fvec4& v) { static inline fvec4 sqrt(fvec4 v) {
return vec_sqrt(v.val); return vec_sqrt(v.val);
} }
...@@ -417,7 +421,7 @@ static inline fvec4 sqrt(const fvec4& v) { ...@@ -417,7 +421,7 @@ static inline fvec4 sqrt(const fvec4& v) {
* of vectors. The first result vector contains the values at the given indexes, and the second * of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1. * result vector contains the values from each respective index+1.
*/ */
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) { static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]); fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]); fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]); fvec4 t2(table + index[2]);
...@@ -440,7 +444,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o ...@@ -440,7 +444,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3) * output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined * output[3] = undefined
*/ */
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) { static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
const auto nx = reduceAdd(x); const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y); const auto ny = reduceAdd(y);
const auto nz = reduceAdd(z); const auto nz = reduceAdd(z);
......
...@@ -80,7 +80,7 @@ public: ...@@ -80,7 +80,7 @@ public:
* @param table The table from which to do a lookup. * @param table The table from which to do a lookup.
* @param indexes The indexes to gather. * @param indexes The indexes to gather.
*/ */
fvec4(const float* table, const int idx[4]) fvec4(const float* table, const int32_t idx[4])
: fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { } : fvec4(table[idx[0]], table[idx[1]], table[idx[2]], table[idx[3]]) { }
operator __m128() const { operator __m128() const {
...@@ -108,55 +108,55 @@ public: ...@@ -108,55 +108,55 @@ public:
#endif #endif
} }
fvec4 operator+(const fvec4& other) const { fvec4 operator+(fvec4 other) const {
return _mm_add_ps(val, other); return _mm_add_ps(val, other);
} }
fvec4 operator-(const fvec4& other) const { fvec4 operator-(fvec4 other) const {
return _mm_sub_ps(val, other); return _mm_sub_ps(val, other);
} }
fvec4 operator*(const fvec4& other) const { fvec4 operator*(fvec4 other) const {
return _mm_mul_ps(val, other); return _mm_mul_ps(val, other);
} }
fvec4 operator/(const fvec4& other) const { fvec4 operator/(fvec4 other) const {
return _mm_div_ps(val, other); return _mm_div_ps(val, other);
} }
void operator+=(const fvec4& other) { void operator+=(fvec4 other) {
val = _mm_add_ps(val, other); val = _mm_add_ps(val, other);
} }
void operator-=(const fvec4& other) { void operator-=(fvec4 other) {
val = _mm_sub_ps(val, other); val = _mm_sub_ps(val, other);
} }
void operator*=(const fvec4& other) { void operator*=(fvec4 other) {
val = _mm_mul_ps(val, other); val = _mm_mul_ps(val, other);
} }
void operator/=(const fvec4& other) { void operator/=(fvec4 other) {
val = _mm_div_ps(val, other); val = _mm_div_ps(val, other);
} }
fvec4 operator-() const { fvec4 operator-() const {
return _mm_sub_ps(_mm_set1_ps(0.0f), val); return _mm_sub_ps(_mm_set1_ps(0.0f), val);
} }
fvec4 operator&(const fvec4& other) const { fvec4 operator&(fvec4 other) const {
return _mm_and_ps(val, other); return _mm_and_ps(val, other);
} }
fvec4 operator|(const fvec4& other) const { fvec4 operator|(fvec4 other) const {
return _mm_or_ps(val, other); return _mm_or_ps(val, other);
} }
fvec4 operator==(const fvec4& other) const { fvec4 operator==(fvec4 other) const {
return _mm_cmpeq_ps(val, other); return _mm_cmpeq_ps(val, other);
} }
fvec4 operator!=(const fvec4& other) const { fvec4 operator!=(fvec4 other) const {
return _mm_cmpneq_ps(val, other); return _mm_cmpneq_ps(val, other);
} }
fvec4 operator>(const fvec4& other) const { fvec4 operator>(fvec4 other) const {
return _mm_cmpgt_ps(val, other); return _mm_cmpgt_ps(val, other);
} }
fvec4 operator<(const fvec4& other) const { fvec4 operator<(fvec4 other) const {
return _mm_cmplt_ps(val, other); return _mm_cmplt_ps(val, other);
} }
fvec4 operator>=(const fvec4& other) const { fvec4 operator>=(fvec4 other) const {
return _mm_cmpge_ps(val, other); return _mm_cmpge_ps(val, other);
} }
fvec4 operator<=(const fvec4& other) const { fvec4 operator<=(fvec4 other) const {
return _mm_cmple_ps(val, other); return _mm_cmple_ps(val, other);
} }
operator ivec4() const; operator ivec4() const;
...@@ -191,49 +191,49 @@ public: ...@@ -191,49 +191,49 @@ public:
void store(int* v) const { void store(int* v) const {
_mm_storeu_si128((__m128i*) v, val); _mm_storeu_si128((__m128i*) v, val);
} }
ivec4 operator+(const ivec4& other) const { ivec4 operator+(ivec4 other) const {
return _mm_add_epi32(val, other); return _mm_add_epi32(val, other);
} }
ivec4 operator-(const ivec4& other) const { ivec4 operator-(ivec4 other) const {
return _mm_sub_epi32(val, other); return _mm_sub_epi32(val, other);
} }
ivec4 operator*(const ivec4& other) const { ivec4 operator*(ivec4 other) const {
return _mm_mullo_epi32(val, other); return _mm_mullo_epi32(val, other);
} }
void operator+=(const ivec4& other) { void operator+=(ivec4 other) {
val = _mm_add_epi32(val, other); val = _mm_add_epi32(val, other);
} }
void operator-=(const ivec4& other) { void operator-=(ivec4 other) {
val = _mm_sub_epi32(val, other); val = _mm_sub_epi32(val, other);
} }
void operator*=(const ivec4& other) { void operator*=(ivec4 other) {
val = _mm_mullo_epi32(val, other); val = _mm_mullo_epi32(val, other);
} }
ivec4 operator-() const { ivec4 operator-() const {
return _mm_sub_epi32(_mm_set1_epi32(0), val); return _mm_sub_epi32(_mm_set1_epi32(0), val);
} }
ivec4 operator&(const ivec4& other) const { ivec4 operator&(ivec4 other) const {
return _mm_and_si128(val, other); return _mm_and_si128(val, other);
} }
ivec4 operator|(const ivec4& other) const { ivec4 operator|(ivec4 other) const {
return _mm_or_si128(val, other); return _mm_or_si128(val, other);
} }
ivec4 operator==(const ivec4& other) const { ivec4 operator==(ivec4 other) const {
return _mm_cmpeq_epi32(val, other); return _mm_cmpeq_epi32(val, other);
} }
ivec4 operator!=(const ivec4& other) const { ivec4 operator!=(ivec4 other) const {
return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF)); return _mm_xor_si128(*this==other, _mm_set1_epi32(0xFFFFFFFF));
} }
ivec4 operator>(const ivec4& other) const { ivec4 operator>(ivec4 other) const {
return _mm_cmpgt_epi32(val, other); return _mm_cmpgt_epi32(val, other);
} }
ivec4 operator<(const ivec4& other) const { ivec4 operator<(ivec4 other) const {
return _mm_cmplt_epi32(val, other); return _mm_cmplt_epi32(val, other);
} }
ivec4 operator>=(const ivec4& other) const { ivec4 operator>=(ivec4 other) const {
return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF)); return _mm_xor_si128(_mm_cmplt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
} }
ivec4 operator<=(const ivec4& other) const { ivec4 operator<=(ivec4 other) const {
return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF)); return _mm_xor_si128(_mm_cmpgt_epi32(val, other), _mm_set1_epi32(0xFFFFFFFF));
} }
operator fvec4() const; operator fvec4() const;
...@@ -258,36 +258,36 @@ inline fvec4 fvec4::expandBitsToMask(int bitmask) { ...@@ -258,36 +258,36 @@ inline fvec4 fvec4::expandBitsToMask(int bitmask) {
// Functions that operate on fvec4s. // Functions that operate on fvec4s.
static inline fvec4 floor(const fvec4& v) { static inline fvec4 floor(fvec4 v) {
return fvec4(_mm_floor_ps(v.val)); return fvec4(_mm_floor_ps(v.val));
} }
static inline fvec4 ceil(const fvec4& v) { static inline fvec4 ceil(fvec4 v) {
return fvec4(_mm_ceil_ps(v.val)); return fvec4(_mm_ceil_ps(v.val));
} }
static inline fvec4 round(const fvec4& v) { static inline fvec4 round(fvec4 v) {
return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT)); return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
} }
static inline fvec4 min(const fvec4& v1, const fvec4& v2) { static inline fvec4 min(fvec4 v1, fvec4 v2) {
return fvec4(_mm_min_ps(v1.val, v2.val)); return fvec4(_mm_min_ps(v1.val, v2.val));
} }
static inline fvec4 max(const fvec4& v1, const fvec4& v2) { static inline fvec4 max(fvec4 v1, fvec4 v2) {
return fvec4(_mm_max_ps(v1.val, v2.val)); return fvec4(_mm_max_ps(v1.val, v2.val));
} }
static inline fvec4 abs(const fvec4& v) { static inline fvec4 abs(fvec4 v) {
static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
return fvec4(_mm_and_ps(v.val, mask)); return fvec4(_mm_and_ps(v.val, mask));
} }
static inline fvec4 sqrt(const fvec4& v) { static inline fvec4 sqrt(fvec4 v) {
return fvec4(_mm_sqrt_ps(v.val)); return fvec4(_mm_sqrt_ps(v.val));
} }
static inline fvec4 rsqrt(const fvec4& v) { static inline fvec4 rsqrt(fvec4 v) {
// Initial estimate of rsqrt(). // Initial estimate of rsqrt().
fvec4 y(_mm_rsqrt_ps(v.val)); fvec4 y(_mm_rsqrt_ps(v.val));
...@@ -299,27 +299,27 @@ static inline fvec4 rsqrt(const fvec4& v) { ...@@ -299,27 +299,27 @@ static inline fvec4 rsqrt(const fvec4& v) {
return y; return y;
} }
static inline fvec4 exp(const fvec4& v) { static inline fvec4 exp(fvec4 v) {
return fvec4(exp_ps(v.val)); return fvec4(exp_ps(v.val));
} }
static inline fvec4 log(const fvec4& v) { static inline fvec4 log(fvec4 v) {
return fvec4(log_ps(v.val)); return fvec4(log_ps(v.val));
} }
static inline float dot3(const fvec4& v1, const fvec4& v2) { static inline float dot3(fvec4 v1, fvec4 v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71)); return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
} }
static inline float dot4(const fvec4& v1, const fvec4& v2) { static inline float dot4(fvec4 v1, fvec4 v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1)); return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
} }
static inline float reduceAdd(const fvec4 v) { static inline float reduceAdd(fvec4 v) {
return dot4(v, fvec4(1.0f)); return dot4(v, fvec4(1.0f));
} }
static inline fvec4 cross(const fvec4& v1, const fvec4& v2) { static inline fvec4 cross(fvec4 v1, fvec4 v2) {
fvec4 temp = fvec4(_mm_mul_ps(v1, _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)))) - fvec4 temp = fvec4(_mm_mul_ps(v1, _mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3, 0, 2, 1)))) -
fvec4(_mm_mul_ps(v2, _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1)))); fvec4(_mm_mul_ps(v2, _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1))));
return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(3, 0, 2, 1)); return _mm_shuffle_ps(temp, temp, _MM_SHUFFLE(3, 0, 2, 1));
...@@ -340,53 +340,53 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2, ...@@ -340,53 +340,53 @@ static inline void transpose(const fvec4 in[4], fvec4& v0, fvec4& v1, fvec4& v2,
/** /**
* Out-of-place transpose from named variables into an array. * Out-of-place transpose from named variables into an array.
*/ */
static inline void transpose(const fvec4 v0, const fvec4 v1, const fvec4 v2, const fvec4 v3, fvec4 out[4]) { static inline void transpose(fvec4 v0, fvec4 v1, fvec4 v2, fvec4 v3, fvec4 out[4]) {
out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3; out[0] = v0; out[1] = v1; out[2] = v2; out[3] = v3;
transpose(out[0], out[1], out[2], out[3]); transpose(out[0], out[1], out[2], out[3]);
} }
// Functions that operate on ivec4s. // Functions that operate on ivec4s.
static inline ivec4 min(const ivec4& v1, const ivec4& v2) { static inline ivec4 min(ivec4 v1, ivec4 v2) {
return ivec4(_mm_min_epi32(v1.val, v2.val)); return ivec4(_mm_min_epi32(v1.val, v2.val));
} }
static inline ivec4 max(const ivec4& v1, const ivec4& v2) { static inline ivec4 max(ivec4 v1, ivec4 v2) {
return ivec4(_mm_max_epi32(v1.val, v2.val)); return ivec4(_mm_max_epi32(v1.val, v2.val));
} }
static inline ivec4 abs(const ivec4& v) { static inline ivec4 abs(ivec4 v) {
return ivec4(_mm_abs_epi32(v.val)); return ivec4(_mm_abs_epi32(v.val));
} }
static inline bool any(const ivec4& v) { static inline bool any(ivec4 v) {
return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF)); return !_mm_test_all_zeros(v, _mm_set1_epi32(0xFFFFFFFF));
} }
// Mathematical operators involving a scalar and a vector. // Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, const fvec4& v2) { static inline fvec4 operator+(float v1, fvec4 v2) {
return fvec4(v1)+v2; return fvec4(v1)+v2;
} }
static inline fvec4 operator-(float v1, const fvec4& v2) { static inline fvec4 operator-(float v1, fvec4 v2) {
return fvec4(v1)-v2; return fvec4(v1)-v2;
} }
static inline fvec4 operator*(float v1, const fvec4& v2) { static inline fvec4 operator*(float v1, fvec4 v2) {
return fvec4(v1)*v2; return fvec4(v1)*v2;
} }
static inline fvec4 operator/(float v1, const fvec4& v2) { static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2; return fvec4(v1)/v2;
} }
// Operations for blending fvec4 // Operations for blending fvec4
static inline fvec4 blend(const fvec4& v1, const fvec4& v2, const fvec4& mask) { static inline fvec4 blend(fvec4 v1, fvec4 v2, fvec4 mask) {
return fvec4(_mm_blendv_ps(v1.val, v2.val, mask.val)); return fvec4(_mm_blendv_ps(v1.val, v2.val, mask.val));
} }
static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) { static inline fvec4 blendZero(fvec4 v, fvec4 mask) {
return blend(0.0f, v, mask); return blend(0.0f, v, mask);
} }
...@@ -394,7 +394,7 @@ static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) { ...@@ -394,7 +394,7 @@ static inline fvec4 blendZero(const fvec4 v, const fvec4 mask) {
* of vectors. The first result vector contains the values at the given indexes, and the second * of vectors. The first result vector contains the values at the given indexes, and the second
* result vector contains the values from each respective index+1. * result vector contains the values from each respective index+1.
*/ */
static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& out0, fvec4& out1) { static inline void gatherVecPair(const float* table, ivec4 index, fvec4& out0, fvec4& out1) {
fvec4 t0(table + index[0]); fvec4 t0(table + index[0]);
fvec4 t1(table + index[1]); fvec4 t1(table + index[1]);
fvec4 t2(table + index[2]); fvec4 t2(table + index[2]);
...@@ -417,7 +417,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o ...@@ -417,7 +417,7 @@ static inline void gatherVecPair(const float* table, const ivec4 index, fvec4& o
* output[2] = (Z0 + Z1 + Z2 + Z3) * output[2] = (Z0 + Z1 + Z2 + Z3)
* output[3] = undefined * output[3] = undefined
*/ */
static inline fvec4 reduceToVec3(const fvec4 x, const fvec4 y, const fvec4 z) { static inline fvec4 reduceToVec3(fvec4 x, fvec4 y, fvec4 z) {
// :TODO: Could be made more efficient. // :TODO: Could be made more efficient.
const auto nx = reduceAdd(x); const auto nx = reduceAdd(x);
const auto ny = reduceAdd(y); const auto ny = reduceAdd(y);
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2019 Stanford University and the Authors. * * Portions copyright (c) 2008-2020 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -80,7 +80,7 @@ void BrownianIntegrator::step(int steps) { ...@@ -80,7 +80,7 @@ void BrownianIntegrator::step(int steps) {
throw OpenMMException("This Integrator is not bound to a context!"); throw OpenMMException("This Integrator is not bound to a context!");
for (int i = 0; i < steps; ++i) { for (int i = 0; i < steps; ++i) {
context->updateContextState(); context->updateContextState();
context->calcForcesAndEnergy(true, false); context->calcForcesAndEnergy(true, false, getIntegrationForceGroups());
kernel.getAs<IntegrateBrownianStepKernel>().execute(*context, *this); kernel.getAs<IntegrateBrownianStepKernel>().execute(*context, *this);
} }
} }
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2013-2015 Stanford University and the Authors. * * Portions copyright (c) 2013-2020 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
using namespace OpenMM; using namespace OpenMM;
Integrator::Integrator() : owner(NULL), context(NULL) { Integrator::Integrator() : owner(NULL), context(NULL), forceGroups(0xFFFFFFFF) {
} }
Integrator::~Integrator() { Integrator::~Integrator() {
...@@ -69,6 +69,14 @@ void Integrator::setConstraintTolerance(double tol) { ...@@ -69,6 +69,14 @@ void Integrator::setConstraintTolerance(double tol) {
constraintTol = tol; constraintTol = tol;
} }
int Integrator::getIntegrationForceGroups() const {
return forceGroups;
}
void Integrator::setIntegrationForceGroups(int groups) {
forceGroups = groups;
}
std::vector<Vec3> Integrator::getVelocitiesForTemperature(const System &system, double temperature, int randomSeed) const { std::vector<Vec3> Integrator::getVelocitiesForTemperature(const System &system, double temperature, int randomSeed) const {
// Generate the list of Gaussian random numbers. // Generate the list of Gaussian random numbers.
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. * * Portions copyright (c) 2008-2020 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -76,7 +76,7 @@ void LangevinIntegrator::step(int steps) { ...@@ -76,7 +76,7 @@ void LangevinIntegrator::step(int steps) {
throw OpenMMException("This Integrator is not bound to a context!"); throw OpenMMException("This Integrator is not bound to a context!");
for (int i = 0; i < steps; ++i) { for (int i = 0; i < steps; ++i) {
context->updateContextState(); context->updateContextState();
context->calcForcesAndEnergy(true, false); context->calcForcesAndEnergy(true, false, getIntegrationForceGroups());
kernel.getAs<IntegrateLangevinStepKernel>().execute(*context, *this); kernel.getAs<IntegrateLangevinStepKernel>().execute(*context, *this);
} }
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment