Optimizations to CPU nonbonded forces: better load balancing between threads,...

Optimizations to CPU nonbonded forces: better load balancing between threads, use linear splines instead of cubic

Optimizations to CPU nonbonded forces: better load balancing between threads,...
Optimizations to CPU nonbonded forces: better load balancing between threads, use linear splines instead of cubic
be04ec57 · peastman · 56cf0fde · be04ec57 · be04ec57 · be04ec57
Commit be04ec57 authored Dec 04, 2013 by peastman
5 changed files
--- a/platforms/cpu/include/CpuGBSAOBCForce.h
+++ b/platforms/cpu/include/CpuGBSAOBCForce.h
@@ -107,6 +107,7 @@ private:
    float const* posq;
    std::vector<std::vector<float> >* threadForce;
    bool includeEnergy;
+    void* atomicCounter;
  
    static const int NUM_TABLE_POINTS;


--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -156,6 +156,7 @@ private:
        bool periodic;
        bool ewald;
        bool pme;
+        bool tableIsValid;
        const CpuNeighborList* neighborList;
        float periodicBoxSize[3];
        float cutoffDistance, switchingDistance;
@@ -174,6 +175,7 @@ private:
        std::set<int> const* exclusions;
        std::vector<std::vector<float> >* threadForce;
        bool includeEnergy;
+        void* atomicCounter;

        static const float TWO_OVER_SQRT_PI;
        static const int NUM_TABLE_POINTS;

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
@@ -24,14 +24,14 @@

 #include "CpuGBSAOBCForce.h"
 #include "SimTKOpenMMRealType.h"
-#include "openmm/internal/SplineFitter.h"
 #include "openmm/internal/vectorize.h"
+#include "gmx_atomic.h"
 #include <cmath>

 using namespace std;
 using namespace OpenMM;

-const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 1025;
+const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 2048;

 class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
 public:
@@ -46,20 +46,10 @@ public:
 CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
    logDX = 0.5/NUM_TABLE_POINTS;
    logDXInv = 1.0f/logDX;
-    vector<double> x(NUM_TABLE_POINTS+1);
-    vector<double> y(NUM_TABLE_POINTS+1);
-    vector<double> deriv;
+    logTable.resize(NUM_TABLE_POINTS+1);
    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
-        x[i] = 0.5+i*0.5/NUM_TABLE_POINTS;
-        y[i] = log(x[i]);
-    }
-    SplineFitter::createNaturalSpline(x, y, deriv);
-    logTable.resize(4*NUM_TABLE_POINTS);
-    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
-        logTable[4*i] = (float) y[i];
-        logTable[4*i+1] = (float) y[i+1];
-        logTable[4*i+2] = (float) (deriv[i]*logDX*logDX/6);
-        logTable[4*i+3] = (float) (deriv[i+1]*logDX*logDX/6);
+        double x = 0.5+i*logDX;
+        logTable[i] = log(x);
    }
 }

@@ -104,16 +94,22 @@ void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector
    threadBornForces.resize(numThreads);
    for (int i = 0; i < numThreads; i++)
        threadBornForces[i].resize(particleParams.size()+3);
+    gmx_atomic_t counter;
+    this->atomicCounter = &counter;
    
    // Signal the threads to start running and wait for them to finish.
    
    ComputeTask task(*this);
+    gmx_atomic_set(&counter, 0);
    threads.execute(task);
    threads.waitForThreads(); // Compute Born radii
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // Compute surface area term
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // First loop
+    gmx_atomic_set(&counter, 0);
    threads.resumeThreads();
    threads.waitForThreads(); // Second loop
    
@@ -141,8 +137,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {

    // Calculate Born radii

-    for (int blockStart = start; blockStart < end; blockStart += 4) {
-        int numInBlock = min(4, end-blockStart);
+    while (true) {
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomRadius[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -213,7 +212,10 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    vector<float>& bornForces = threadBornForces[threadIndex];
    for (int i = 0; i < numParticles; i++)
        bornForces[i] = 0.0f;
-    for (int atomI = start; atomI < end; atomI++) {
+    while (true) {
+        int atomI = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+        if (atomI >= numParticles)
+            break;
        if (bornRadii[atomI] > 0) {
            float radiusI = particleParams[atomI].first + dielectricOffset;
            float r = radiusI + probeRadius;
@@ -235,8 +237,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
        preFactor = ONE_4PI_EPS0*((1.0f/solventDielectric) - (1.0f/soluteDielectric));
    else
        preFactor = 0.0f;
-    for (int blockStart = start; blockStart < end; blockStart += 4) {
-        int numInBlock = min(4, end-blockStart);
+    while (true) {
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomCharge[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -303,13 +308,16 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {

    // Second loop of Born energy computation.

-    for (int blockStart = start; blockStart < end; blockStart += 4) {
+    while (true) {
+        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        if (blockStart >= numParticles)
+            break;
        fvec4 bornForce(0.0f);
        for (int i = 0; i < numThreads; i++)
            bornForce += fvec4(&threadBornForces[i][blockStart]);
        fvec4 radii(&bornRadii[blockStart]);
        bornForce *= radii*radii*fvec4(&obcChain[blockStart]);
-        int numInBlock = min(4, end-blockStart);
+        int numInBlock = min(4, numParticles-blockStart);
        ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
        float atomRadius[4], atomx[4], atomy[4], atomz[4];
        int blockMask[4] = {0, 0, 0, 0};
@@ -385,21 +393,16 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
 fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
    // Evaluate log(x) using a lookup table for speed.

-    float y[4];
    fvec4 x1 = (x-0.5f)*logDXInv;
    ivec4 index = floor(x1);
-    fvec4 coeff[4];
-    coeff[1] = x1-index;
-    coeff[0] = 1.0f-coeff[1];
-    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
-    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
-    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
-    static float maxdiff = 0.0f;
+    fvec4 coeff2 = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
+    float table1[4], table2[4];
    for (int i = 0; i < 4; i++) {
-        if (index[i] >= 0 && index[i] < NUM_TABLE_POINTS)
-            y[i] = dot4(coeff[i], fvec4(&logTable[4*index[i]]));
-        else
-            y[i] = logf(x[i]);
+        int tableIndex = index[i];
+        if (tableIndex < NUM_TABLE_POINTS)
+            table1[i] = logTable[tableIndex];
+            table2[i] = logTable[tableIndex+1];
    }
-    return fvec4(y);
+    return coeff1*fvec4(table1) + coeff2*fvec4(table2);
 }
--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -29,8 +29,8 @@
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
 #include "ReferencePME.h"
-#include "openmm/internal/SplineFitter.h"
 #include "openmm/internal/vectorize.h"
+#include "gmx_atomic.h"

 // In case we're using some primitive version of Visual Studio this will
 // make sure that erf() and erfc() are defined.
@@ -40,7 +40,7 @@ using namespace std;
 using namespace OpenMM;

 const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
-const int CpuNonbondedForce::NUM_TABLE_POINTS = 1025;
+const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;

 class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
 public:
@@ -58,10 +58,10 @@ public:

   --------------------------------------------------------------------------------------- */

-CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false) {
+CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
 }

-  /**---------------------------------------------------------------------------------------
+/**---------------------------------------------------------------------------------------

   Set the force to use a cutoff.

@@ -71,8 +71,9 @@ CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), period

     --------------------------------------------------------------------------------------- */

-  void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
-
+void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
+    if (distance != cutoffDistance)
+        tableIsValid = false;
    cutoff = true;
    cutoffDistance = distance;
    neighborList = &neighbors;
@@ -127,6 +128,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */

  void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
      alphaEwald = alpha;
      numRx = kmaxx;
      numRy = kmaxy;
@@ -145,6 +148,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
     --------------------------------------------------------------------------------------- */

  void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
+      if (alpha != alphaEwald)
+          tableIsValid = false;
      alphaEwald = alpha;
      meshDim[0] = meshSize[0];
      meshDim[1] = meshSize[1];
@@ -155,24 +160,16 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {

  
 void CpuNonbondedForce::tabulateEwaldScaleFactor() {
+    if (tableIsValid)
+        return;
+    tableIsValid = true;
    ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2);
    ewaldDXInv = 1.0f/ewaldDX;
-    vector<double> x(NUM_TABLE_POINTS+1);
-    vector<double> y(NUM_TABLE_POINTS+1);
-    vector<double> deriv;
+    ewaldScaleTable.resize(NUM_TABLE_POINTS+1);
    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
-        double r = i*cutoffDistance/(NUM_TABLE_POINTS-2);
+        double r = i*ewaldDX;
        double alphaR = alphaEwald*r;
-        x[i] = r;
-        y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
-    }
-    SplineFitter::createNaturalSpline(x, y, deriv);
-    ewaldScaleTable.resize(4*NUM_TABLE_POINTS);
-    for (int i = 0; i < NUM_TABLE_POINTS; i++) {
-        ewaldScaleTable[4*i] = (float) y[i];
-        ewaldScaleTable[4*i+1] = (float) y[i+1];
-        ewaldScaleTable[4*i+2] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
-        ewaldScaleTable[4*i+3] = (float) (deriv[i+1]*ewaldDX*ewaldDX/6);
+        ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
    }
 }
  
@@ -302,6 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
    threadEnergy.resize(threads.getNumThreads());
+    gmx_atomic_t counter;
+    gmx_atomic_set(&counter, 0);
+    this->atomicCounter = &counter;
    
    // Signal the threads to start running and wait for them to finish.
    
@@ -332,8 +332,12 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    if (ewald || pme) {
        // Compute the interactions from the neighbor list.

-        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
-            calculateBlockEwaldIxn(i, forces, energyPtr, boxSize, invBoxSize);
+        while (true) {
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
+        }

        // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.

@@ -367,13 +371,20 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    else if (cutoff) {
        // Compute the interactions from the neighbor list.

-        for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads)
-            calculateBlockIxn(i, forces, energyPtr, boxSize, invBoxSize);
+        while (true) {
+            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (nextBlock >= neighborList->getNumBlocks())
+                break;
+            calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
+        }
    }
    else {
        // Loop over all atom pairs

-        for (int i = threadIndex; i < numberOfAtoms; i += numThreads){
+        while (true) {
+            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            if (i >= numberOfAtoms)
+                break;
            for (int j = i+1; j < numberOfAtoms; j++)
                if (exclusions[j].find(i) == exclusions[j].end())
                    calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
@@ -609,10 +620,10 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
        fvec4 sig2 = inverseR*sig;
        sig2 *= sig2;
        fvec4 sig6 = sig2*sig2*sig2;
-        fvec4 eps = blockAtomEpsilon*atomParameters[atom].second;
-        dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6;
+        fvec4 epsSig6 = blockAtomEpsilon*atomParameters[atom].second*sig6;
+        dEdR += switchValue*epsSig6*(12.0f*sig6 - 6.0f);
        dEdR *= inverseR*inverseR;
-        fvec4 energy = eps*(sig6-1.0f)*sig6;
+        fvec4 energy = epsSig6*(sig6-1.0f);
        if (useSwitch) {
            dEdR -= energy*switchDeriv*inverseR;
            energy *= switchValue;
@@ -683,18 +694,16 @@ fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
 fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
    // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)

-    float y[4];
    fvec4 x1 = x*ewaldDXInv;
    ivec4 index = floor(x1);
-    fvec4 coeff[4];
-    coeff[1] = x1-index;
-    coeff[0] = 1.0f-coeff[1];
-    coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
-    coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
-    transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
+    fvec4 coeff2 = x1-index;
+    fvec4 coeff1 = 1.0f-coeff2;
+    float table1[4], table2[4];
    for (int i = 0; i < 4; i++) {
-        if (index[i] < NUM_TABLE_POINTS)
-            y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]]));
+        int tableIndex = index[i];
+        if (tableIndex < NUM_TABLE_POINTS)
+            table1[i] = ewaldScaleTable[tableIndex];
+            table2[i] = ewaldScaleTable[tableIndex+1];
    }
-    return fvec4(y);
+    return coeff1*fvec4(table1) + coeff2*fvec4(table2);
 }
--- a/platforms/cpu/src/gmx_atomic.h
+++ b/platforms/cpu/src/gmx_atomic.h
+/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- 
+*
+* Copyright (c) 2004-2008, Erik Lindahl <lindahl@cbr.su.se>
+*
+*  Unfortunately, some of the constructs in this file are _very_ sensitive
+*  to compiler optimizations and architecture changes. If you find any such
+*  errors, please send a message to lindahl@cbr.su.se to help us fix the
+*  upstream version too.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+* 
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+* THE SOFTWARE.
+*
+* And Hey:
+* Gnomes, ROck Monsters And Chili Sauce
+*/
+#ifndef _GMX_ATOMIC_H_
+#define _GMX_ATOMIC_H_
+
+/*! \file gmx_atomic.h
+ *
+ *  @brief Atomic operations for fast SMP synchronization
+ *
+ *  This file defines atomic integer operations and spinlocks for 
+ *  fast synchronization in performance-critical regions of gromacs.
+ *
+ *  In general, the best option is to use functions without explicit 
+ *  locking, e.g. gmx_atomic_fetch_add() or gmx_atomic_cmpxchg().
+ *
+ *  Not all architecture support atomic operations though inline assembly,
+ *  and even if they do it might not be implemented here. In that case
+ *  we use a fallback mutex implementation, so you can always count on
+ *  the function interfaces working in Gromacs.
+ *
+ *  Don't use spinlocks in non-performance-critical regions like file I/O.
+ *  Since they always spin busy they would waste CPU cycles instead of 
+ *  properly yielding to a computation thread while waiting for the disk.
+ *
+ *  Finally, note that all our spinlock operations are defined to return
+ *  0 if initialization or locking completes successfully.
+ *  This is the opposite of some other implementations, but the same standard
+ *  as used for pthread mutexes. So, if e.g. are trying to lock a spinlock,
+ *  you will have gotten the lock if the return value is 0.
+ * 
+ *  gmx_spinlock_islocked(x) obviously still returns 1 if the lock is locked,
+ *  and 0 if it is available, though...
+ */
+
+
+
+#include <stdio.h>
+
+#include <pthread.h>
+
+#ifdef __cplusplus
+extern "C" 
+{  
+#endif
+#if 0
+} /* Avoids screwing up auto-indentation */
+#endif
+
+
+
+
+#if ( ( (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__)) && \
+        (defined(i386) || defined(__x86_64__)) )                                      \
+      || defined (DOXYGEN) )
+
+/* This code is executed for x86 and x86-64, with these compilers:
+ * GNU
+ * Intel 
+ * Pathscale
+ * All these support GCC-style inline assembly. 
+ * We also use this section for the documentation.
+ */
+
+/*! \brief Memory barrier operation
+ *
+ *  Modern CPUs rely heavily on out-of-order execution, and one common feature
+ *  is that load/stores might be reordered. Also, when using inline assembly
+ *  the compiler might already have loaded the variable we are changing into
+ *  a register, so any update to memory won't be visible.
+ *
+ *  This command creates a memory barrier, i.e. all memory results before
+ *  it in the code should be visible to all memory operations after it - the
+ *  CPU cannot propagate load/stores across it.
+ */
+#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
+
+/* Only gcc and Intel support this check, otherwise set it to true (skip doc) */
+#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
+#define __builtin_constant_p(i) (1)
+#endif
+
+
+/*! \brief Gromacs atomic operations datatype
+ *
+ *  Portable synchronization primitives like mutexes are effective for
+ *  many purposes, but usually not very high performance.
+ *  One of the problem is that you have the overhead of a function call,
+ *  and another is that Mutexes often have extra overhead to make the
+ *  scheduling fair. Finally, if performance is important we don't want
+ *  to suspend the thread if we cannot lock a mutex, but spin-lock at 100%
+ *  CPU usage until the resources is available (e.g. increment a counter).
+ *
+ *  These things can often be implemented with inline-assembly or other
+ *  system-dependent functions, and we provide such functionality for the
+ *  most common platforms. For portability we also have a fallback 
+ *  implementation using a mutex for locking.
+ *
+ *  Performance-wise, the fastest solution is always to avoid locking 
+ *  completely (obvious, but remember it!). If you cannot do that, the
+ *  next best thing is to use atomic operations that e.g. increment a
+ *  counter without explicit locking. Spinlocks are useful to lock an
+ *  entire region, but leads to more overhead and can be difficult to
+ *  debug - it is up to you to make sure that only the thread owning the
+ *  lock unlocks it!
+ *
+ *  You should normally NOT use atomic operations for things like 
+ *  I/O threads. These should yield to other threads while waiting for 
+ *  the disk instead of spinning at 100% CPU usage.
+ *
+ *  It is imperative that you use the provided routines for reading
+ *  and writing, since some implementations require memory barriers before
+ *  the CPU or memory sees an updated result. The structure contents is
+ *  only visible here so it can be inlined for performance - it might
+ *  change without further notice.
+ *
+ *  \note No initialization is required for atomic variables.
+ *
+ *  Currently, we have (real) atomic operations for:
+ *
+ *  - x86 or x86_64, using GNU compilers
+ *  - x86 or x86_64, using Intel compilers 
+ *  - x86 or x86_64, using Pathscale compilers
+ *  - Itanium, using GNU compilers 
+ *  - Itanium, using Intel compilers
+ *  - Itanium, using HP compilers
+ *  - PowerPC, using GNU compilers 
+ *  - PowerPC, using IBM AIX compilers 
+ *  - PowerPC, using IBM compilers >=7.0 under Linux or Mac OS X.
+ */
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+
+/*! \brief Gromacs spinlock
+ *
+ *  Spinlocks provide a faster synchronization than mutexes,
+ *  although they consume CPU-cycles while waiting. They are implemented
+ *  with atomic operations and inline assembly whenever possible, and
+ *  otherwise we use a fallback implementation where a spinlock is identical
+ *  to a mutex (this is one of the reasons why you have to initialize them).
+ *
+ *  There are no guarantees whatsoever about fair scheduling or
+ *  debugging if you make a mistake and unlock a variable somebody
+ *  else has locked - performance is the primary goal of spinlocks.
+ *
+ */
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+
+
+
+/*! \brief Spinlock static initializer
+ *
+ *  This is used for static spinlock initialization, and has the same
+ *  properties as GMX_THREAD_MUTEX_INITIALIZER has for mutexes.
+ *  This is only for inlining in the gmx_thread.h header file. Whether
+ *  it is 0, 1, or something else when unlocked depends on the platform.
+ *  Don't assume anything about it. It might even be a mutex when using the
+ * fallback implementation!
+ */
+#define GMX_SPINLOCK_INITIALIZER   { 1 }
+
+
+
+/*! \brief Return value of an atomic integer 
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param  a   Atomic variable to read
+ *  \return     Integer value of the atomic variable
+ */
+#define gmx_atomic_read(a)  ((a)->value) 
+
+ 
+/*! \brief Write value to an atomic integer 
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param  a   Atomic variable
+ *  \param  i   Integer to set the atomic variable to.
+ */
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+ 
+/*! \brief Add integer to atomic variable
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param a   atomic datatype to modify
+ *  \param i   integer to increment with. Use i<0 to subtract atomically.
+ *
+ *  \return The new value (after summation).
+ */
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *     a, 
+                      volatile int       i)
+{
+    int __i;
+    
+    __i = i;
+    __asm__ __volatile__("lock ; xaddl %0, %1;"
+                         :"=r"(i) :"m"(a->value), "0"(i));
+    return i + __i;
+}  
+  
+
+/*! \brief Add to variable, return the old value.
+ *
+ *  This operation is quite useful for synchronization counters.
+ *  By performing a fetchadd with N, a thread can e.g. reserve a chunk 
+ *  with the next N iterations, and the return value is the index
+ *  of the first element to treat.
+ *
+ *  Also implements proper memory barriers when necessary.
+ *  The actual implementation is system-dependent.
+ *
+ *  \param a   atomic datatype to modify
+ *  \param i   integer to increment with. Use i<0 to subtract atomically.
+ *
+ *  \return    The value of the atomic variable before addition.
+ */
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     volatile int       i)
+{
+    int __i;
+
+    __i = i;
+    __asm__ __volatile__("lock ; xaddl %0, %1;"
+                         :"=r"(i) :"m"(a->value), "0"(i));
+    return i;
+}
+
+
+/*! \brief Atomic compare-exchange operation
+ *
+ *   The \a old value is compared with the memory value in the atomic datatype.
+ *   If the are identical, the atomic type is updated to the new value, 
+ *   and otherwise left unchanged. 
+ *  
+ *   This is a very useful synchronization primitive: You can start by reading
+ *   a value (without locking anything), perform some calculations, and then
+ *   atomically try to update it in memory unless it has changed. If it has
+ *   changed you will get an error return code - reread the new value
+ *   an repeat the calculations in that case.
+ *
+ *   \param a        Atomic datatype ('memory' value)
+ *   \param oldval   Integer value read from the atomic type at an earlier point
+ *   \param newval   New value to write to the atomic type if it currently is
+ *                   identical to the old value.
+ *
+ *   \return The value of the atomic memory variable in memory when this 
+ *           instruction was executed. This, if the operation succeeded the
+ *           return value was identical to the \a old parameter, and if not
+ *           it returns the updated value in memory so you can repeat your
+ *           operations on it. 
+ *
+ *   \note   The exchange occured if the return value is identical to \a old.
+ */
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *    a, 
+                   int               oldval,
+                   int               newval)
+{
+    volatile unsigned long prev;
+    
+    __asm__ __volatile__("lock ; cmpxchgl %1,%2"
+                         : "=a"(prev)
+                         : "q"(newval), "m"(a->value), "0"(oldval)
+                         : "memory");
+    
+    return prev;
+}
+
+
+/*! \brief Initialize spinlock
+ *
+ *  In theory you can call this from multiple threads, but remember
+ *  that we don't check for errors. If the first thread proceeded to
+ *  lock the spinlock after initialization, the second will happily
+ *  overwrite the contents and unlock it without warning you.
+ *
+ *  \param x      Gromacs spinlock pointer.
+ */
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *   x)
+{
+    x->lock = 1;
+}
+
+
+
+/*! \brief Acquire spinlock
+ *
+ *  This routine blocks until the spinlock is available, and
+ *  the locks it again before returning.
+ *
+ *  \param x     Gromacs spinlock pointer
+ */
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *  x)
+{
+	__asm__ __volatile__("\n1:\t" 
+						 "lock ; decb %0\n\t" 
+						 "jns 3f\n" 
+						 "2:\t" 
+						 "rep;nop\n\t" 
+						 "cmpb $0,%0\n\t" 
+						 "jle 2b\n\t" 
+						 "jmp 1b\n" 
+						 "3:\n\t" 
+						 :"=m" (x->lock) : : "memory"); 
+}
+
+
+/*! \brief Attempt to acquire spinlock
+ *
+ * This routine acquires the spinlock if possible, but if 
+ * already locked it return an error code immediately.
+ *
+ *  \param x     Gromacs spinlock pointer
+ *
+ * \return 0 if the mutex was available so we could lock it,
+ *         otherwise a non-zero integer (1) if the lock is busy.
+ */
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *  x)
+{
+	char old_value;
+	
+    __asm__ __volatile__("xchgb %b0,%1"
+                         :"=q" (old_value), "=m" (x->lock)
+						 :"0" (0) : "memory");
+    return (old_value <= 0);
+}
+
+
+/*! \brief Release spinlock
+ *
+ *  \param x     Gromacs spinlock pointer
+ *
+ *  Unlocks the spinlock, regardless if which thread locked it.
+ */
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *  x)
+{
+	char old_value = 1;
+	
+	__asm__ __volatile__(
+                         "xchgb %b0, %1" 
+                         :"=q" (old_value), "=m" (x->lock) 
+                         :"0" (old_value) : "memory"
+                         );
+}
+ 
+
+/*! \brief Check if spinlock is locked
+ *
+ *  This routine returns immediately with the lock status.
+ *
+ *  \param x  Gromacs spinlock pointer
+ *
+ *  \return 1 if the spinlock is locked, 0 otherwise.
+ */
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *  x)
+{
+    return (*(volatile signed char *)(&(x)->lock) <= 0);
+}
+
+
+/*! \brief Wait for a spinlock to become available
+ *
+ *  This routine blocks until the spinlock is unlocked, 
+ *  but in contrast to gmx_spinlock_lock() it returns without 
+ *  trying to lock the spinlock.
+ *
+ *  \param x  Gromacs spinlock pointer
+ */
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    do 
+    {
+        gmx_atomic_memory_barrier(); 
+    } 
+    while(gmx_spinlock_islocked(x));
+}
+
+
+#elif ( defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__)))
+/* PowerPC using proper GCC inline assembly. 
+ * Recent versions of xlC (>=7.0) _partially_ support this, but since it is
+ * not 100% compatible we provide a separate implementation for xlC in
+ * the next section.
+ */
+
+/* Compiler-dependent stuff: GCC memory barrier */
+#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
+
+
+
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+
+
+#define gmx_atomic_read(a)   ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *    a, 
+                      int               i)
+{
+    int t;
+    
+	__asm__ __volatile__("1:     lwarx   %0,0,%2\n"
+                         "\tadd     %0,%1,%0\n"
+                         "\tstwcx.  %0,0,%2 \n"
+                         "\tbne-    1b"
+                         "\tisync\n"
+                         : "=&r" (t)
+						 : "r" (i), "r" (&a->value)
+						 : "cc" , "memory");
+    return t;
+}
+
+
+
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     int                i)
+{
+    int t;
+    
+    __asm__ __volatile__("\teieio\n"
+                         "1:     lwarx   %0,0,%2\n"                         
+                         "\tadd     %0,%1,%0\n"
+                         "\tstwcx.  %0,0,%2 \n"
+                         "\tbne-    1b\n"
+                         "\tisync\n"
+                         : "=&r" (t)
+                         : "r" (i), "r" (&a->value)
+                         : "cc", "memory");
+    
+    return (t - i);    
+}
+
+
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *       a,
+                   int                  oldval,
+                   int                  newval)
+{
+    int prev;
+    
+    __asm__ __volatile__ ("1:    lwarx   %0,0,%2 \n"
+                          "\tcmpw    0,%0,%3 \n"
+                          "\tbne     2f \n"
+                          "\tstwcx.  %4,0,%2 \n"
+                          "bne-    1b\n"
+                          "\tsync\n"
+                          "2:\n"
+                          : "=&r" (prev), "=m" (a->value)
+                          : "r" (&a->value), "r" (oldval), "r" (newval), "m" (a->value)
+                          : "cc", "memory");
+    
+    return prev;
+}
+
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    x->lock = 0;
+}
+
+
+
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *  x)
+{
+    unsigned int tmp;
+    
+    __asm__ __volatile__("\tb      1f\n"
+                         "2:      lwzx    %0,0,%1\n"
+                         "\tcmpwi   0,%0,0\n"
+                         "\tbne+    2b\n"
+                         "1:      lwarx   %0,0,%1\n"
+                         "\tcmpwi   0,%0,0\n"
+                         "\tbne-    2b\n"
+                         "\tstwcx.  %2,0,%1\n"
+                         "bne-    2b\n"
+                         "\tisync\n"
+                         : "=&r"(tmp)
+                         : "r"(&x->lock), "r"(1)
+                         : "cr0", "memory");
+}
+
+
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *  x)
+{
+    unsigned int old, t;
+    unsigned int mask = 1;
+    volatile unsigned int *p = &x->lock;
+    
+    __asm__ __volatile__("\teieio\n"
+                         "1:      lwarx   %0,0,%4 \n"
+                         "\tor      %1,%0,%3 \n"
+                         "\tstwcx.  %1,0,%4 \n"
+                         "\tbne     1b\n"
+                         "\tsync\n"
+                         : "=&r" (old), "=&r" (t), "=m" (*p)
+                         : "r" (mask), "r" (p), "m" (*p)
+                         : "cc", "memory");
+    
+    return ((old & mask) != 0);    
+}
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *  x)
+{
+    __asm__ __volatile__("\teieio\n": : :"memory");
+    x->lock = 0;
+}
+
+
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return ( x->lock != 0);
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *x)
+{
+    do 
+    {
+        gmx_atomic_memory_barrier(); 
+    }
+    while(gmx_spinlock_islocked(x));
+}
+
+
+
+#elif ( (defined(__IBM_GCC_ASM) || defined(__IBM_STDCPP_ASM))  && \
+        (defined(__powerpc__) || defined(__ppc__)))
+/* PowerPC using xlC inline assembly. 
+ * Recent versions of xlC (>=7.0) _partially_ support GCC inline assembly
+ * if you use the option -qasm=gcc but we have had to hack things a bit, in 
+ * particular when it comes to clobbered variables. Since this implementation
+ * _could_ be buggy, we have separated it from the known-to-be-working gcc
+ * one above.
+ */
+
+/* memory barrier - no idea how to create one with xlc! */
+#define gmx_atomic_memory_barrier()
+
+
+
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+
+
+#define gmx_atomic_read(a)   ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *    a, 
+                      int               i)
+{
+    int t;
+    
+	__asm__ __volatile__("1:     lwarx   %0,0,%2 \n"
+                         "\t add     %0,%1,%0 \n"
+                         "\t stwcx.  %0,0,%2 \n"
+                         "\t bne-    1b \n"
+                         "\t isync \n"
+                         : "=&r" (t)
+						 : "r" (i), "r" (&a->value) );
+    return t;
+}
+
+
+
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     int                i)
+{
+    int t;
+    
+    __asm__ __volatile__("\t eieio\n"
+                         "1:     lwarx   %0,0,%2 \n"                         
+                         "\t add     %0,%1,%0 \n"
+                         "\t stwcx.  %0,0,%2 \n"
+                         "\t bne-    1b \n"
+                         "\t isync \n"
+                         : "=&r" (t)
+                         : "r" (i), "r" (&a->value));
+    
+    return (t - i);    
+}
+
+
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *       a,
+                   int                  oldval,
+                   int                  newval)
+{
+    int prev;
+    
+    __asm__ __volatile__ ("1:    lwarx   %0,0,%2 \n"
+                          "\t cmpw    0,%0,%3 \n"
+                          "\t bne     2f \n"
+                          "\t stwcx.  %4,0,%2 \n"
+                          "\t bne-    1b \n"
+                          "\t sync \n"
+                          "2: \n"
+                          : "=&r" (prev), "=m" (a->value)
+                          : "r" (&a->value), "r" (oldval), "r" (newval), "m" (a->value));
+    
+    return prev;
+}
+
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    x->lock = 0;
+}
+
+
+
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *  x)
+{
+    unsigned int tmp;
+    
+    __asm__ __volatile__("\t b      1f \n"
+                         "2:      lwzx    %0,0,%1 \n"
+                         "\t cmpwi   0,%0,0 \n"
+                         "\t bne+    2b \n"
+                         "1:      lwarx   %0,0,%1 \n"
+                         "\t cmpwi   0,%0,0 \n"
+                         "\t bne-    2b \n"
+                         "\t stwcx.  %2,0,%1 \n"
+                         "\t bne-    2b \n"
+                         "\t isync\n"
+                         : "=&r"(tmp)
+                         : "r"(&x->lock), "r"(1));
+}
+
+
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *  x)
+{
+    unsigned int old, t;
+    unsigned int mask = 1;
+    volatile unsigned int *p = &x->lock;
+    
+    __asm__ __volatile__("\t eieio\n"
+                         "1:      lwarx   %0,0,%4 \n"
+                         "\t or      %1,%0,%3 \n"
+                         "\t stwcx.  %1,0,%4 \n"
+                         "\t bne     1b \n"
+                         "\t sync \n"
+                         : "=&r" (old), "=&r" (t), "=m" (*p)
+                         : "r" (mask), "r" (p), "m" (*p));
+    
+    return ((old & mask) != 0);    
+}
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *  x)
+{
+    __asm__ __volatile__("\t eieio \n");
+    x->lock = 0;
+}
+
+
+static inline void
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return ( x->lock != 0);
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    
+    do 
+    {
+        gmx_atomic_memory_barrier();
+    }
+    while(gmx_spinlock_islocked(x));
+}
+
+
+
+
+#elif (defined(__ia64__) && (defined(__GNUC__) || defined(__INTEL_COMPILER)))
+/* ia64 with GCC or Intel compilers. Since we need to define everything through
+* cmpxchg and fetchadd on ia64, we merge the different compilers and only provide 
+* different implementations for that single function. 
+* Documentation? Check the gcc/x86 section.
+*/
+
+
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+
+
+#define gmx_atomic_read(a)   ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+
+
+/* Compiler thingies */
+#ifdef __INTEL_COMPILER
+void __memory_barrier(void);
+int _InterlockedCompareExchange(volatile int *dest, int xchg, int comp);
+unsigned __int64 __fetchadd4_rel(unsigned int *addend, const int increment);
+/* ia64 memory barrier */
+#  define gmx_atomic_memory_barrier() __memory_barrier()
+/* ia64 cmpxchg */
+#  define gmx_atomic_cmpxchg(a, oldval, newval) _InterlockedCompareExchange(&a->value,newval,oldval)
+/* ia64 fetchadd, but it only works with increments +/- 1,4,8,16 */
+#  define gmx_ia64_fetchadd(a, inc)  __fetchadd4_rel(a, inc)
+
+#elif defined __GNUC__  
+/* ia64 memory barrier */
+#  define gmx_atomic_memory_barrier() asm volatile ("":::"memory")
+/* ia64 cmpxchg */
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *   a,
+                   int              oldval,
+                   int              newval)
+{
+    volatile int res;
+    asm volatile ("mov ar.ccv=%0;;" :: "rO"(oldval));
+    asm volatile ("cmpxchg4.acq %0=[%1],%2,ar.ccv":                    
+                  "=r"(res) : "r"(&a->value), "r"(newval) : "memory"); 
+                          
+    return res;
+}
+
+
+/* fetchadd, but on ia64 it only works with increments +/- 1,4,8,16 */
+#define gmx_ia64_fetchadd(a, inc)                                             \
+({  unsigned long res;                                                        \
+    asm volatile ("fetchadd4.rel %0=[%1],%2"                                  \
+                  : "=r"(res) : "r"(a), "i" (inc) : "memory");                \
+                  res;                                                        \
+})
+
+
+#else /* Unknown compiler */
+#  error Unknown ia64 compiler (not GCC or ICC) - modify gmx_thread.h!
+#endif
+
+
+
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *       a, 
+                      volatile int         i)
+{
+    volatile int oldval,newval;    
+    volatile int __i = i;
+
+    /* Use fetchadd if, and only if, the increment value can be determined
+     * at compile time (otherwise this check is optimized away) and it is
+     * a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
+     */                         
+    if (__builtin_constant_p(i) &&
+        ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
+          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) ) )
+    {
+        oldval = gmx_ia64_fetchadd(a,__i);
+        newval = oldval + i;
+    }
+    else
+    {
+        /* Use compare-exchange addition that works with any value */
+        do
+        {
+            oldval = gmx_atomic_read(a);
+            newval = oldval + i;
+        }
+        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
+    }
+    return newval;
+}
+
+
+
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     volatile int       i)
+{
+    volatile int oldval,newval;    
+    volatile int __i = i;
+    
+    /* Use ia64 fetchadd if, and only if, the increment value can be determined
+     * at compile time (otherwise this check is optimized away) and it is
+     * a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
+     */                         
+    if (__builtin_constant_p(i) &&
+        ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
+          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) ) )
+    {
+        oldval = gmx_ia64_fetchadd(a,__i);
+        newval = oldval + i;
+    }
+    else
+    {
+        /* Use compare-exchange addition that works with any value */
+        do
+        {
+            oldval = gmx_atomic_read(a);
+            newval = oldval + i;
+        }
+        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
+    }
+    return oldval;
+}
+
+
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    x->lock = 0;
+}
+
+
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *   x)
+{
+    gmx_atomic_t *a = (gmx_atomic_t *) x;
+    unsigned long value;                                                 
+    value = gmx_atomic_cmpxchg(a, 0, 1);                             
+    if (value)                                                           
+    {                                                                    
+        do                                                               
+        {                                                                
+            while (a->value != 0)                                                 
+            {                                                            
+                gmx_atomic_memory_barrier();                             
+            }                                                            
+            value = gmx_atomic_cmpxchg(a, 0, 1);                       
+        }                                                                
+        while (value);                                                   
+    }                                                                    
+} 
+
+
+static inline int
+gmx_spinlock_trylock(gmx_spinlock_t *   x)
+{
+    return (gmx_atomic_cmpxchg((gmx_atomic_t *)x, 0, 1) != 0);
+}
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    do
+    {
+        gmx_atomic_memory_barrier(); 
+        x->lock = 0;
+    } 
+    while (0);
+}
+
+
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return (x->lock != 0);
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    
+    do 
+    {
+        gmx_atomic_memory_barrier();
+    }
+    while(gmx_spinlock_islocked(x));
+}
+
+
+#undef gmx_ia64_fetchadd
+
+
+
+#elif (defined(__hpux) || defined(__HP_cc)) && defined(__ia64)
+/* HP compiler on ia64 */
+#include <machine/sys/inline.h>
+
+#define gmx_atomic_memory_barrier() _Asm_mf()
+
+#define gmx_hpia64_fetchadd(a, i)                           \
+    _Asm_fetchadd((_Asm_fasz)_FASZ_W,(_Asm_sem)_SEM_REL,    \
+                  (UInt32*)a,(unsigned int) i,              \
+                  (_Asm_ldhint)LDHINT_NONE)
+ 
+
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    volatile unsigned int   lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *   a,
+                   int              oldval,
+                   int              newval)
+{
+    int ret;
+    
+    _Asm_mov_to_ar((_Asm_app_reg)_AREG_CCV,(Uint32)oldval,                  
+                   (_Asm_fence)(_UP_CALL_FENCE | _UP_SYS_FENCE |         
+                                _DOWN_CALL_FENCE | _DOWN_SYS_FENCE));
+                   
+    ret = _Asm_cmpxchg((_Asm_sz)SZ_W,(_Asm_sem)_SEM_ACQ,(Uint32*)a,    
+                       (Uint32)newval,(_Asm_ldhint)_LDHINT_NONE);
+                   
+    return ret;
+}
+
+
+
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+
+
+#define gmx_atomic_read(a)   ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+
+static inline void 
+gmx_atomic_add_return(gmx_atomic_t *       a, 
+                      int                  i)
+{
+    int old,new;    
+    int __i = i;
+    
+    /* On HP-UX we don't know any macro to determine whether the increment
+     * is known at compile time, but hopefully the call uses something simple
+     * like a constant, and then the optimizer should be able to do the job.
+     */                         
+    if (  (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
+          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) )
+    {
+        oldval = gmx_hpia64_fetchadd(a,__i);
+        newval = oldval + i;
+    }
+    else
+    {
+        /* Use compare-exchange addition that works with any value */
+        do
+        {
+            oldval = gmx_atomic_read(a);
+            newval = oldval + i;
+        }
+        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
+    }
+    return newval;
+}
+
+
+
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *     a,
+                     int                i)
+{
+    int oldval,newval;    
+    int __i = i;
+    
+    /* On HP-UX we don't know any macro to determine whether the increment
+     * is known at compile time, but hopefully the call uses something simple
+     * like a constant, and then the optimizer should be able to do the job.
+     */                         
+    if (  (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
+          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) )
+    {
+        oldval = gmx_hpia64_fetchadd(a,__i);
+        newval = oldval + i;
+    }
+    else
+    {
+        /* Use compare-exchange addition that works with any value */
+        do
+        {
+            oldval = gmx_atomic_read(a);
+            newval = oldval + i;
+        }
+        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
+    }
+    return oldval;
+}
+
+
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *x)
+{
+    x->lock = 0;
+}
+
+
+
+
+
+static inline void
+gmx_spinlock_trylock(gmx_spinlock_t *x)
+{
+    int rc;
+
+    rc = _Asm_xchg((_Asm_sz)_SZ_W, (unsigned int *)x, 1        
+                    (_Asm_ldhit)_LDHINT_NONE);
+    
+    return ( (rc>0) ? 1 : 0);
+}
+
+
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *x)
+{
+    int      status = 1;
+    
+    do
+    {
+        if( *((unsigned int *)x->lock) == 0 ) 
+        {
+            status = gmx_spinlock_trylock(x);
+        }
+    } while( status != 0);
+}
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    _Asm_fetchadd((_Asm_fasz)_SZ_W,(_Asm_sem)_SEM_REL,                  
+                  (unsigned int *)x,-1,(_Asm_ldhint)_LDHINT_NONE);
+}
+
+
+
+static inline void
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return ( x->lock != 0 );
+}
+
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    do
+    {
+        gmx_atomic_memory_barrier(); 
+    } 
+    while(gmx_spinlock_islocked(x));
+}
+
+
+#undef gmx_hpia64_fetchadd
+
+
+
+#elif (defined(_MSC_VER) && (_MSC_VER >= 1200))
+/* Microsoft Visual C on x86, define taken from FFTW who got it from Morten Nissov */
+
+#include <windows.h>
+
+#define gmx_atomic_memory_barrier()
+
+
+typedef struct gmx_atomic
+{
+	LONG volatile	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    LONG volatile      lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+#define GMX_SPINLOCK_INITIALIZER   { 0 }
+
+
+
+
+#define gmx_atomic_read(a)  ((a)->value) 
+#define gmx_atomic_set(a,i)  (((a)->value) = (i))
+
+
+
+
+#define gmx_atomic_fetch_add(a, i)  \
+    InterlockedExchangeAdd((LONG volatile *)a, (LONG) i)
+
+#define gmx_atomic_add_return(a, i)  \
+    ( i + InterlockedExchangeAdd((LONG volatile *)a, (LONG) i) )
+
+#define gmx_atomic_cmpxchg(a, oldval, newval) \
+    InterlockedCompareExchange((LONG volatile *)a, (LONG) newval, (LONG) oldval)
+
+
+# define gmx_spinlock_lock(x)   \
+    while((InterlockedCompareExchange((LONG volatile *)&x, 1, 0))!=0)
+
+
+#define gmx_spinlock_trylock(x)   \
+    InterlockedCompareExchange((LONG volatile *)&x, 1, 0)
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    x->lock = 0;
+}
+
+
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return (*(volatile signed char *)(&(x)->lock) != 0);
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    while(gmx_spinlock_islocked(x))
+    {
+        Sleep(0);
+    }
+}
+
+
+
+#elif defined(__xlC__) && defined (_AIX)
+/* IBM xlC compiler on AIX */
+#include <sys/atomic_op.h>
+
+
+#define gmx_atomic_memory_barrier()
+
+
+typedef struct gmx_atomic
+{
+	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_atomic_t;
+
+
+typedef struct gmx_spinlock
+{
+    volatile unsigned int      lock;      /*!< Volatile, to avoid compiler aliasing */
+}
+gmx_spinlock_t;
+
+
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *    a,
+                   int               oldval,
+                   int               newval)
+{
+    int t;
+    
+    if(__check_lock((atomic_p)&a->value, oldval, newval))
+    {
+        /* Not successful - value had changed in memory. Reload value. */
+        t = a->value;
+    }
+    else
+    {
+        /* replacement suceeded */
+        t = oldval;
+    }
+    return t;        
+}
+
+
+static inline void 
+gmx_atomic_add_return(gmx_atomic_t *       a, 
+                      int                  i)
+{
+    int oldval,newval;    
+    
+    do
+    {
+        oldval = gmx_atomic_read(a);
+        newval = oldval + i;
+    }
+    while(__check_lock((atomic_p)&a->value, oldval, newval));
+
+    return newval;
+}
+
+
+
+static inline void 
+gmx_atomic_fetch_add(gmx_atomic_t *       a, 
+                     int                  i)
+{
+    int oldval,newval;    
+    
+    do
+    {
+        oldval = gmx_atomic_read(a);
+        newval = oldval + i;
+    }
+    while(__check_lock((atomic_p)&a->value, oldval, newval));
+    
+    return oldval;
+}
+
+
+static inline void
+gmx_spinlock_init(gmx_spinlock_t *   x)
+{
+    __clear_lock((atomic_p)x,0);
+}
+
+
+static inline void
+gmx_spinlock_lock(gmx_spinlock_t *   x)
+{
+    do
+    {
+        ;
+    }
+    while(__check_lock((atomic_p)x, 0, 1));
+}
+
+
+static inline void
+gmx_spinlock_trylock(gmx_spinlock_t *   x)
+{
+    /* Return 0 if we got the lock */
+    return (__check_lock((atomic_p)x, 0, 1) != 0)
+}
+
+
+static inline void
+gmx_spinlock_unlock(gmx_spinlock_t *   x)
+{
+    __clear_lock((atomic_p)x,0);
+}
+
+
+static inline void
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    return (*((atomic_p)x) != 0);
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *    x)
+{
+    while(gmx_spinlock_islocked(x)) { ; } 
+}
+
+
+#else
+/* No atomic operations, use mutex fallback. Documentation is in x86 section */
+
+
+#define gmx_atomic_memory_barrier()
+
+/* System mutex used for locking to guarantee atomicity */
+static pthread_mutex_t
+gmx_atomic_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+
+typedef struct gmx_atomic
+{
+	int	   value;
+}
+gmx_atomic_t;
+
+#define gmx_spinlock_t     pthread_mutex_t
+
+ 
+#  define GMX_SPINLOCK_INITIALIZER   PTHREAD_MUTEX_INITIALIZER
+
+/* Since mutexes guarantee memory barriers this works fine */
+#define gmx_atomic_read(a)   ((a)->value)
+
+
+static inline void
+gmx_atomic_set(gmx_atomic_t *   a, 
+               int              i)
+{
+    /* Mutexes here are necessary to guarantee memory visibility */
+    pthread_mutex_lock(&gmx_atomic_mutex);
+    a->value = i;
+    pthread_mutex_unlock(&gmx_atomic_mutex);
+}
+
+
+static inline int
+gmx_atomic_add_return(gmx_atomic_t *   a, 
+                      int              i)
+{
+    int t;
+    pthread_mutex_lock(&gmx_atomic_mutex);
+    t = a->value + i;
+    a->value = t;
+    pthread_mutex_unlock(&gmx_atomic_mutex);
+    return t;
+}
+
+
+static inline int
+gmx_atomic_fetch_add(gmx_atomic_t *   a,
+                     int              i)
+{
+    int old_value;
+    
+    pthread_mutex_lock(&gmx_atomic_mutex);
+    old_value  = a->value;
+    a->value   = old_value + i;
+    pthread_mutex_unlock(&gmx_atomic_mutex);
+    return old_value;
+}
+
+
+static inline int
+gmx_atomic_cmpxchg(gmx_atomic_t *           a, 
+                   int                      oldv,
+                   int                      newv)
+{
+    int t;
+    
+    pthread_mutex_lock(&gmx_atomic_mutex);
+    t = a->value;
+    if (t == oldv)
+    {
+        a->value = newv;
+    }
+    pthread_mutex_unlock(&gmx_atomic_mutex);
+    return t;
+}
+
+
+#define gmx_spinlock_init(lock)       pthread_mutex_init(lock)
+#define gmx_spinlock_lock(lock)       pthread_mutex_lock(lock)
+#define gmx_spinlock_trylock(lock)    pthread_mutex_trylock(lock)
+#define gmx_spinlock_unlock(lock)     pthread_mutex_unlock(lock)
+
+static inline int
+gmx_spinlock_islocked(gmx_spinlock_t *   x)
+{
+    int rc;
+    
+    if(gmx_spinlock_trylock(x) != 0)
+    {
+        /* It was locked */
+        return 1;
+    }
+    else
+    {
+        /* We just locked it */
+        gmx_spinlock_unlock(x);
+        return 0;
+    }
+}
+
+
+static inline void
+gmx_spinlock_wait(gmx_spinlock_t *   x)
+{
+    int rc;
+    
+    gmx_spinlock_lock(x);
+    /* Got the lock now, so the waiting is over */
+    gmx_spinlock_unlock(x);
+}
+
+
+#endif
+
+
+
+
+/*! \brief Spinlock-based barrier type
+ *
+ *  This barrier has the same functionality as the standard
+ *  gmx_thread_barrier_t, but since it is based on spinlocks
+ *  it provides faster synchronization at the cost of busy-waiting.
+ *
+ *  Variables of this type should be initialized by calling
+ *  gmx_spinlock_barrier_init() to set the number of threads
+ *  that should be synchronized.
+ */
+typedef struct gmx_spinlock_barrier
+{
+	gmx_atomic_t            count;     /*!< Number of threads remaining     */
+	int                     threshold; /*!< Total number of threads         */
+	volatile int            cycle;     /*!< Current cycle (alternating 0/1) */
+}
+gmx_spinlock_barrier_t;
+ 
+
+
+
+/*! \brief Initialize spinlock-based barrier
+ *
+ *  \param barrier  Pointer to _spinlock_ barrier. Note that this is not
+ *                  the same datatype as the full, thread based, barrier.
+ *  \param count    Number of threads to synchronize. All threads
+ *                  will be released after \a count calls to 
+ *                  gmx_spinlock_barrier_wait().  
+ */
+static inline void 
+gmx_spinlock_barrier_init(gmx_spinlock_barrier_t *         barrier,
+                          int                              count)
+{
+	barrier->threshold = count;
+	barrier->cycle     = 0;
+	gmx_atomic_set(&(barrier->count),count);
+}
+
+
+
+
+/*! \brief Perform busy-waiting barrier synchronization
+*
+*  This routine blocks until it has been called N times,
+*  where N is the count value the barrier was initialized with.
+*  After N total calls all threads return. The barrier automatically
+*  cycles, and thus requires another N calls to unblock another time.
+*
+*  Note that spinlock-based barriers are completely different from
+*  standard ones (using mutexes and condition variables), only the 
+*  functionality and names are similar.
+*
+*  \param barrier  Pointer to previously create barrier.
+*
+*  \return The last thread returns -1, all the others 0.
+*/
+static inline int
+gmx_spinlock_barrier_wait(gmx_spinlock_barrier_t *   barrier)
+{
+  int    cycle;
+  int    status;
+  
+  /* We don't need to lock or use atomic ops here, since the cycle index 
+	* cannot change until after the last thread has performed the check
+	* further down. Further, they cannot reach this point in the next 
+	* barrier iteration until all of them have been released, and that 
+	* happens after the cycle value has been updated.
+	*
+	* No synchronization == fast synchronization.
+	*/
+  cycle = barrier->cycle;
+  
+  /* Decrement the count atomically and check if it is zero.
+	* This will only be true for the last thread calling us.
+	*/
+  if( gmx_atomic_add_return( &(barrier->count), -1 ) == 0)
+  { 
+	gmx_atomic_set(&(barrier->count), barrier->threshold);
+	barrier->cycle = !barrier->cycle;
+    
+	status = -1;
+  }
+  else
+  {
+	/* Wait until the last thread changes the cycle index.
+	* We are both using a memory barrier, and explicit
+	* volatile pointer cast to make sure the compiler
+	* doesn't try to be smart and cache the contents.
+	*/
+	do
+	{ 
+	  gmx_atomic_memory_barrier();
+	} 
+	while( *(volatile int *)(&(barrier->cycle)) == cycle);
+	
+	status = 0;
+  }
+  return status;
+}
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _GMX_ATOMIC_H_ */