Merge pull request #2133 from peastman/atomic

Replaced gmx_atomic with C++ atomic

Merge pull request #2133 from peastman/atomic
Replaced gmx_atomic with C++ atomic
fdb501e3 · peastman · GitHub · e72a4e8c · 72bfef12 · e72a4e8c
Unverified Commit fdb501e3 authored Jul 25, 2018 by peastman Committed by GitHub Jul 25, 2018
18 changed files
--- a/openmmapi/include/openmm/internal/gmx_atomic.h
+++ b/openmmapi/include/openmm/internal/gmx_atomic.h
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- 
-*
-* Copyright (c) 2004-2008, Erik Lindahl <lindahl@cbr.su.se>
-*
-*  Unfortunately, some of the constructs in this file are _very_ sensitive
-*  to compiler optimizations and architecture changes. If you find any such
-*  errors, please send a message to lindahl@cbr.su.se to help us fix the
-*  upstream version too.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to deal
-* in the Software without restriction, including without limitation the rights
-* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-* copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-* 
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-* 
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-* THE SOFTWARE.
-*
-* And Hey:
-* Gnomes, ROck Monsters And Chili Sauce
-*/
-#ifndef _GMX_ATOMIC_H_
-#define _GMX_ATOMIC_H_
-
-/*! \file gmx_atomic.h
- *
- *  @brief Atomic operations for fast SMP synchronization
- *
- *  This file defines atomic integer operations and spinlocks for 
- *  fast synchronization in performance-critical regions of gromacs.
- *
- *  In general, the best option is to use functions without explicit 
- *  locking, e.g. gmx_atomic_fetch_add() or gmx_atomic_cmpxchg().
- *
- *  Not all architectures support atomic operations through inline assembly,
- *  and even if they do it might not be implemented here. In that case
- *  we use a fallback mutex implementation, so you can always count on
- *  the function interfaces working in Gromacs.
- *
- *  Don't use spinlocks in non-performance-critical regions like file I/O.
- *  Since they always spin busy they would waste CPU cycles instead of 
- *  properly yielding to a computation thread while waiting for the disk.
- *
- *  Finally, note that all our spinlock operations are defined to return
- *  0 if initialization or locking completes successfully.
- *  This is the opposite of some other implementations, but the same standard
- *  as used for pthread mutexes. So, if you are e.g. trying to lock a spinlock,
- *  you will have gotten the lock if the return value is 0.
- * 
- *  gmx_spinlock_islocked(x) obviously still returns 1 if the lock is locked,
- *  and 0 if it is available, though...
- */
-
-
-
-#include <stdio.h>
-
-#define NOMINMAX
-#include <pthread.h>
-
-#ifdef __cplusplus
-extern "C" 
-{  
-#endif
-#if 0
-} /* Avoids screwing up auto-indentation */
-#endif
-
-
-
-
-#if ( ( (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__)) && \
-        (defined(i386) || defined(__x86_64__)) )                                      \
-      || defined (DOXYGEN) )
-
-/* This code is executed for x86 and x86-64, with these compilers:
- * GNU
- * Intel 
- * Pathscale
- * All these support GCC-style inline assembly. 
- * We also use this section for the documentation.
- */
-
-/*! \brief Memory barrier operation
- *
- *  Modern CPUs rely heavily on out-of-order execution, and one common feature
- *  is that load/stores might be reordered. Also, when using inline assembly
- *  the compiler might already have loaded the variable we are changing into
- *  a register, so any update to memory won't be visible.
- *
- *  This command creates a memory barrier, i.e. all memory results before
- *  it in the code should be visible to all memory operations after it.
- *
- *  \note The macro expands to an empty asm statement with a "memory"
- *        clobber. It therefore emits no instruction of its own: it keeps the
- *        compiler from caching values in registers or moving memory accesses
- *        across it, but it is not by itself a hardware fence.
- */
-#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
-
-/* Only gcc and Intel support this check, otherwise set it to true (skip doc).
- * With this fallback every argument is reported as a compile-time constant.
- */
-#if (!defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined DOXYGEN)
-#define __builtin_constant_p(i) (1)
-#endif
-
-
-/*! \brief Gromacs atomic operations datatype
- *
- *  Portable synchronization primitives like mutexes are effective for
- *  many purposes, but usually not very high performance.
- *  One of the problems is that you have the overhead of a function call,
- *  and another is that mutexes often have extra overhead to make the
- *  scheduling fair. Finally, if performance is important we don't want
- *  to suspend the thread if we cannot lock a mutex, but spin-lock at 100%
- *  CPU usage until the resource is available (e.g. increment a counter).
- *
- *  These things can often be implemented with inline-assembly or other
- *  system-dependent functions, and we provide such functionality for the
- *  most common platforms. For portability we also have a fallback 
- *  implementation using a mutex for locking.
- *
- *  Performance-wise, the fastest solution is always to avoid locking 
- *  completely (obvious, but remember it!). If you cannot do that, the
- *  next best thing is to use atomic operations that e.g. increment a
- *  counter without explicit locking. Spinlocks are useful to lock an
- *  entire region, but lead to more overhead and can be difficult to
- *  debug - it is up to you to make sure that only the thread owning the
- *  lock unlocks it!
- *
- *  You should normally NOT use atomic operations for things like 
- *  I/O threads. These should yield to other threads while waiting for 
- *  the disk instead of spinning at 100% CPU usage.
- *
- *  It is imperative that you use the provided routines for reading
- *  and writing, since some implementations require memory barriers before
- *  the CPU or memory sees an updated result. The structure contents is
- *  only visible here so it can be inlined for performance - it might
- *  change without further notice.
- *
- *  \note No initialization is required for atomic variables.
- *
- *  Currently, we have (real) atomic operations for:
- *
- *  - x86 or x86_64, using GNU compilers
- *  - x86 or x86_64, using Intel compilers 
- *  - x86 or x86_64, using Pathscale compilers
- *  - Itanium, using GNU compilers 
- *  - Itanium, using Intel compilers
- *  - Itanium, using HP compilers
- *  - PowerPC, using GNU compilers 
- *  - PowerPC, using IBM AIX compilers 
- *  - PowerPC, using IBM compilers >=7.0 under Linux or Mac OS X.
- */
-typedef struct gmx_atomic {
-    volatile int value; /*!< Declared volatile, to avoid compiler aliasing */
-} gmx_atomic_t;
-
-
-
-/*! \brief Gromacs spinlock
- *
- *  Spinlocks provide a faster synchronization than mutexes,
- *  although they consume CPU-cycles while waiting. They are implemented
- *  with atomic operations and inline assembly whenever possible, and
- *  otherwise we use a fallback implementation where a spinlock is identical
- *  to a mutex (this is one of the reasons why you have to initialize them).
- *
- *  There are no guarantees whatsoever about fair scheduling or
- *  debugging if you make a mistake and unlock a variable somebody
- *  else has locked - performance is the primary goal of spinlocks.
- *
- */
-typedef struct gmx_spinlock {
-    volatile unsigned int lock; /*!< Declared volatile, to avoid compiler aliasing */
-} gmx_spinlock_t;
-
-
-
-
-
-/*! \brief Spinlock static initializer
- *
- *  This is used for static spinlock initialization, and has the same
- *  properties as GMX_THREAD_MUTEX_INITIALIZER has for mutexes.
- *  This is only for inlining in the gmx_thread.h header file. Whether
- *  it is 0, 1, or something else when unlocked depends on the platform.
- *  Don't assume anything about it. It might even be a mutex when using the
- *  fallback implementation!
- *
- *  \note In this x86 section the initializer stores 1 in the lock member,
- *        the same value that gmx_spinlock_init() writes.
- */
-#define GMX_SPINLOCK_INITIALIZER   { 1 }
-
-
-
-/*! \brief Return value of an atomic integer 
- *
- *  The actual implementation is system-dependent.
- *
- *  \note In this section the macro is a plain read of the volatile
- *        \a value member; it adds no barrier of its own.
- *
- *  \param  a   Atomic variable to read (pointer to gmx_atomic_t)
- *  \return     Integer value of the atomic variable
- */
-#define gmx_atomic_read(a)  ((a)->value) 
-
- 
-/*! \brief Write value to an atomic integer 
- *
- *  The actual implementation is system-dependent.
- *
- *  \note In this section the macro is a plain assignment to the volatile
- *        \a value member; it adds no barrier of its own. The expression
- *        evaluates to the assigned value.
- *
- *  \param  a   Atomic variable (pointer to gmx_atomic_t)
- *  \param  i   Integer to set the atomic variable to.
- */
-#define gmx_atomic_set(a,i)  (((a)->value) = (i))
-
- 
-/*! \brief Add integer to atomic variable
- *
- *  Implemented with a single locked exchange-and-add (lock xaddl) on
- *  the \a value member of the atomic variable.
- *
- *  \param a   atomic datatype to modify
- *  \param i   integer to increment with. Use i<0 to subtract atomically.
- *
- *  \return The new value (after summation).
- */
-static inline int
-gmx_atomic_add_return(gmx_atomic_t *     a, 
-                      volatile int       i)
-{
-    const int increment = i;          /* read the volatile argument once */
-    int       previous  = increment;  /* in: addend, out: old a->value   */
-
-    /* xaddl both reads and writes the memory operand, so it is declared as
-     * an in/out operand ("+m") instead of an input. The instruction also
-     * modifies the flags, and the "memory" clobber keeps the compiler from
-     * moving other memory accesses across the atomic update.
-     */
-    __asm__ __volatile__("lock ; xaddl %0, %1;"
-                         : "+r"(previous), "+m"(a->value)
-                         :
-                         : "memory", "cc");
-    return previous + increment;
-}
-  
-
-/*! \brief Add to variable, return the old value.
- *
- *  This operation is quite useful for synchronization counters.
- *  By performing a fetchadd with N, a thread can e.g. reserve a chunk 
- *  with the next N iterations, and the return value is the index
- *  of the first element to treat.
- *
- *  Implemented with a single locked exchange-and-add (lock xaddl) on
- *  the \a value member of the atomic variable.
- *
- *  \param a   atomic datatype to modify
- *  \param i   integer to increment with. Use i<0 to subtract atomically.
- *
- *  \return    The value of the atomic variable before addition.
- */
-static inline int
-gmx_atomic_fetch_add(gmx_atomic_t *     a,
-                     volatile int       i)
-{
-    int previous = i;   /* in: addend, out: a->value before the addition */
-
-    /* The memory operand is read and written by xaddl, hence "+m"; the
-     * flags are modified ("cc"), and "memory" stops the compiler from
-     * reordering other memory accesses around the update.
-     */
-    __asm__ __volatile__("lock ; xaddl %0, %1;"
-                         : "+r"(previous), "+m"(a->value)
-                         :
-                         : "memory", "cc");
-    return previous;
-}
-
-
-/*! \brief Atomic compare-exchange operation
- *
- *   The \a oldval value is compared with the memory value in the atomic
- *   datatype. If they are identical, the atomic type is updated to the new
- *   value, and otherwise left unchanged. 
- *  
- *   This is a very useful synchronization primitive: You can start by reading
- *   a value (without locking anything), perform some calculations, and then
- *   atomically try to update it in memory unless it has changed. If it has
- *   changed you will get the current value back - reread it and repeat the
- *   calculations in that case.
- *
- *   \param a        Atomic datatype ('memory' value)
- *   \param oldval   Integer value read from the atomic type at an earlier point
- *   \param newval   New value to write to the atomic type if it currently is
- *                   identical to the old value.
- *
- *   \return The value of the atomic memory variable in memory when this 
- *           instruction was executed. Thus, if the operation succeeded the
- *           return value is identical to the \a oldval parameter, and if not
- *           it returns the updated value in memory so you can repeat your
- *           operations on it. 
- *
- *   \note   The exchange occurred if the return value is identical to
- *           \a oldval.
- */
-static inline int
-gmx_atomic_cmpxchg(gmx_atomic_t *    a, 
-                   int               oldval,
-                   int               newval)
-{
-    /* cmpxchgl works on 32-bit operands, so the result lives in an int
-     * bound to the accumulator ("=a") and seeded with oldval ("0").
-     */
-    int previous;
-
-    /* a->value is read and possibly written, hence "+m". */
-    __asm__ __volatile__("lock ; cmpxchgl %2,%1"
-                         : "=a"(previous), "+m"(a->value)
-                         : "q"(newval), "0"(oldval)
-                         : "memory", "cc");
-
-    return previous;
-}
-
-
-/*! \brief Put a spinlock into its initial state
- *
- *  Stores 1 in the lock member. No error checking is done: calling this
- *  on a spinlock another thread has already locked simply overwrites the
- *  contents, so that lock is released without any warning.
- *
- *  \param x      Gromacs spinlock pointer.
- */
-static inline void
-gmx_spinlock_init(gmx_spinlock_t *   x)
-{
-    const unsigned int initial_state = 1;
-
-    x->lock = initial_state;
-}
-
-
-
-/*! \brief Acquire spinlock
- *
- *  This routine blocks until the spinlock is available, and
- *  then locks it before returning.
- *
- *  The asm loop decrements the lock byte with a lock prefix; a non-negative
- *  result means the lock was taken. Otherwise it spins (rep;nop) while the
- *  byte is <= 0 and then retries the decrement.
- *
- *  NOTE(review): the lock is declared as an output-only "=m" operand although
- *  the instructions also read it; the "memory" clobber is what keeps this
- *  safe - confirm before changing either.
- *
- *  \param x     Gromacs spinlock pointer
- */
-static inline void
-gmx_spinlock_lock(gmx_spinlock_t *  x)
-{
-	__asm__ __volatile__("\n1:\t" 
-						 "lock ; decb %0\n\t"   /* atomic decrement of the lock byte */
-						 "jns 3f\n"             /* result not negative: acquired */
-						 "2:\t" 
-						 "rep;nop\n\t"          /* spin-wait */
-						 "cmpb $0,%0\n\t" 
-						 "jle 2b\n\t"           /* still <= 0: keep spinning */
-						 "jmp 1b\n"             /* looks free: retry the decrement */
-						 "3:\n\t" 
-						 :"=m" (x->lock) : : "memory"); 
-}
-
-
-/*! \brief Attempt to acquire spinlock
- *
- * This routine acquires the spinlock if possible, but if 
- * already locked it returns an error code immediately.
- *
- * The lock byte is exchanged with 0 (xchgb); the previous byte value
- * tells whether the lock was free.
- *
- *  \param x     Gromacs spinlock pointer
- *
- * \return 0 if the mutex was available so we could lock it,
- *         otherwise a non-zero integer (1) if the lock is busy.
- */
-static inline int
-gmx_spinlock_trylock(gmx_spinlock_t *  x)
-{
-	char old_value;   /* receives the lock byte as it was before the exchange */
-	
-    __asm__ __volatile__("xchgb %b0,%1"
-                         :"=q" (old_value), "=m" (x->lock)
-						 :"0" (0) : "memory");
-    /* previous value <= 0 means somebody else already held the lock */
-    return (old_value <= 0);
-}
-
-
-/*! \brief Release spinlock
- *
- *  \param x     Gromacs spinlock pointer
- *
- *  Unlocks the spinlock, regardless of which thread locked it.
- *  The lock byte is exchanged with 1 (xchgb); the previous value is
- *  discarded.
- */
-static inline void
-gmx_spinlock_unlock(gmx_spinlock_t *  x)
-{
-	char old_value = 1;   /* value swapped into the lock byte */
-	
-	__asm__ __volatile__(
-                         "xchgb %b0, %1" 
-                         :"=q" (old_value), "=m" (x->lock) 
-                         :"0" (old_value) : "memory"
-                         );
-}
- 
-
-/*! \brief Report whether a spinlock is currently held
- *
- *  Returns immediately; the first byte of the lock word is read as a
- *  signed char and compared against zero.
- *
- *  \param x  Gromacs spinlock pointer
- *
- *  \return 1 if the spinlock is locked, 0 otherwise.
- */
-static inline int
-gmx_spinlock_islocked(gmx_spinlock_t *  x)
-{
-    const volatile signed char *lock_byte = (volatile signed char *)(&x->lock);
-
-    return (*lock_byte <= 0) ? 1 : 0;
-}
-
-
-/*! \brief Block until a spinlock is released
- *
- *  Spins until gmx_spinlock_islocked() reports the lock as free. Unlike
- *  gmx_spinlock_lock() the lock is not taken before returning.
- *
- *  \param x  Gromacs spinlock pointer
- */
-static inline void
-gmx_spinlock_wait(gmx_spinlock_t *   x)
-{
-    for (;;)
-    {
-        /* barrier first, then re-check the lock state */
-        gmx_atomic_memory_barrier();
-        if (!gmx_spinlock_islocked(x))
-        {
-            break;
-        }
-    }
-}
-
-
-#elif ( defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__)))
-/* PowerPC using proper GCC inline assembly. 
- * Recent versions of xlC (>=7.0) _partially_ support this, but since it is
- * not 100% compatible we provide a separate implementation for xlC in
- * the next section.
- */
-
-/* GCC memory barrier: an empty asm statement with a "memory" clobber */
-#define gmx_atomic_memory_barrier() __asm__ __volatile__("": : :"memory")
-
-/* Atomic integer datatype */
-typedef struct gmx_atomic {
-    volatile int value; /*!< Declared volatile, to avoid compiler aliasing */
-} gmx_atomic_t;
-
-/* Spinlock datatype */
-typedef struct gmx_spinlock {
-    volatile unsigned int lock; /*!< Declared volatile, to avoid compiler aliasing */
-} gmx_spinlock_t;
-
-/* Static initializer for a spinlock */
-#define GMX_SPINLOCK_INITIALIZER   { 0 }
-
-/* Plain read / write of the value member */
-#define gmx_atomic_read(a)   ((a)->value)
-#define gmx_atomic_set(a,i)  (((a)->value) = (i))
-
-
-/* Atomically add i to a->value and return the new value (PowerPC, GCC).
- *
- * The asm is a lwarx / add / stwcx. sequence that is retried (bne- 1b)
- * until the conditional store succeeds, followed by isync. The sum is
- * left in t, which is what the function returns.
- *
- * a  atomic variable to modify
- * i  increment; use i<0 to subtract
- */
-static inline int
-gmx_atomic_add_return(gmx_atomic_t *    a, 
-                      int               i)
-{
-    int t;
-    
-    /* Every instruction string must end in a newline: adjacent literals are
-     * concatenated, so a missing "\n" would fuse two instructions into one
-     * assembler line.
-     */
-    __asm__ __volatile__("1:     lwarx   %0,0,%2\n"
-                         "\tadd     %0,%1,%0\n"
-                         "\tstwcx.  %0,0,%2 \n"
-                         "\tbne-    1b\n"
-                         "\tisync\n"
-                         : "=&r" (t)
-                         : "r" (i), "r" (&a->value)
-                         : "cc" , "memory");
-    return t;
-}
-
-
-
-/* Atomically add i to a->value and return the value it had before the
- * addition (PowerPC, GCC).
- *
- * After eieio, the lwarx / add / stwcx. sequence is retried (bne- 1b) until
- * the conditional store succeeds, followed by isync. The asm leaves the
- * new sum in t, so the old value is recovered as t - i.
- */
-static inline int
-gmx_atomic_fetch_add(gmx_atomic_t *     a,
-                     int                i)
-{
-    int t;   /* receives the updated sum */
-    
-    __asm__ __volatile__("\teieio\n"
-                         "1:     lwarx   %0,0,%2\n"                         
-                         "\tadd     %0,%1,%0\n"
-                         "\tstwcx.  %0,0,%2 \n"
-                         "\tbne-    1b\n"
-                         "\tisync\n"
-                         : "=&r" (t)
-                         : "r" (i), "r" (&a->value)
-                         : "cc", "memory");
-    
-    return (t - i);    
-}
-
-
-/* Atomic compare-exchange (PowerPC, GCC).
- *
- * Loads a->value into prev with lwarx and compares it with oldval. On a
- * mismatch it branches to label 2 without storing; otherwise newval is
- * stored with stwcx., retrying from label 1 if the conditional store fails,
- * followed by sync.
- *
- * Returns the value loaded from a->value, which equals oldval when the
- * exchange took place.
- */
-static inline int
-gmx_atomic_cmpxchg(gmx_atomic_t *       a,
-                   int                  oldval,
-                   int                  newval)
-{
-    int prev;   /* value observed in a->value */
-    
-    __asm__ __volatile__ ("1:    lwarx   %0,0,%2 \n"
-                          "\tcmpw    0,%0,%3 \n"
-                          "\tbne     2f \n"
-                          "\tstwcx.  %4,0,%2 \n"
-                          "bne-    1b\n"
-                          "\tsync\n"
-                          "2:\n"
-                          : "=&r" (prev), "=m" (a->value)
-                          : "r" (&a->value), "r" (oldval), "r" (newval), "m" (a->value)
-                          : "cc", "memory");
-    
-    return prev;
-}
-
-/* Put the spinlock into its initial state: the lock member is set to 0. */
-static inline void
-gmx_spinlock_init(gmx_spinlock_t *x)
-{
-    const unsigned int initial_state = 0;
-
-    x->lock = initial_state;
-}
-
-
-
-/* Acquire the spinlock, spinning until it is available (PowerPC, GCC).
- *
- * Execution starts at label 1: lwarx loads the lock word; if it is non-zero
- * the code falls back to label 2 and polls it with plain loads (lwzx) until
- * it reads 0. When the word is 0, stwcx. tries to store 1; a failed
- * conditional store also restarts at label 2. isync follows a successful
- * store.
- */
-static inline void
-gmx_spinlock_lock(gmx_spinlock_t *  x)
-{
-    unsigned int tmp;   /* scratch register for the loaded lock word */
-    
-    __asm__ __volatile__("\tb      1f\n"
-                         "2:      lwzx    %0,0,%1\n"
-                         "\tcmpwi   0,%0,0\n"
-                         "\tbne+    2b\n"
-                         "1:      lwarx   %0,0,%1\n"
-                         "\tcmpwi   0,%0,0\n"
-                         "\tbne-    2b\n"
-                         "\tstwcx.  %2,0,%1\n"
-                         "bne-    2b\n"
-                         "\tisync\n"
-                         : "=&r"(tmp)
-                         : "r"(&x->lock), "r"(1)
-                         : "cr0", "memory");
-}
-
-
-/* Try to acquire the spinlock without blocking (PowerPC, GCC).
- *
- * The lock word is loaded into old, OR-ed with mask and stored back; the
- * sequence is retried until the conditional store succeeds. The mask bit
- * is therefore always set afterwards, and the previous value tells whether
- * somebody else already held the lock.
- *
- * Returns 0 if the lock was free (and is now ours), 1 if it was already set.
- */
-static inline int
-gmx_spinlock_trylock(gmx_spinlock_t *  x)
-{
-    unsigned int old, t;      /* old: previous lock word, t: value stored back */
-    unsigned int mask = 1;    /* bit that marks the lock as taken */
-    volatile unsigned int *p = &x->lock;
-    
-    __asm__ __volatile__("\teieio\n"
-                         "1:      lwarx   %0,0,%4 \n"
-                         "\tor      %1,%0,%3 \n"
-                         "\tstwcx.  %1,0,%4 \n"
-                         "\tbne     1b\n"
-                         "\tsync\n"
-                         : "=&r" (old), "=&r" (t), "=m" (*p)
-                         : "r" (mask), "r" (p), "m" (*p)
-                         : "cc", "memory");
-    
-    return ((old & mask) != 0);    
-}
-
-
-/* Release the spinlock (PowerPC, GCC): issue eieio, then store 0 in the
- * lock word. There is no check of which thread held the lock.
- */
-static inline void
-gmx_spinlock_unlock(gmx_spinlock_t *  x)
-{
-    __asm__ __volatile__("\teieio\n": : :"memory");
-    x->lock = 0;
-}
-
-
-/* Return 1 if the lock word is non-zero (locked), 0 otherwise. */
-static inline int
-gmx_spinlock_islocked(gmx_spinlock_t *   x)
-{
-    const unsigned int lock_word = x->lock;
-
-    return (lock_word != 0) ? 1 : 0;
-}
-
-
-/* Spin until gmx_spinlock_islocked() reports the lock as free; the lock
- * itself is not taken.
- */
-static inline void
-gmx_spinlock_wait(gmx_spinlock_t *x)
-{
-    for (;;)
-    {
-        /* barrier first, then re-check the lock state */
-        gmx_atomic_memory_barrier();
-        if (!gmx_spinlock_islocked(x))
-        {
-            break;
-        }
-    }
-}
-
-
-
-#elif ( (defined(__IBM_GCC_ASM) || defined(__IBM_STDCPP_ASM))  && \
-        (defined(__powerpc__) || defined(__ppc__)))
-/* PowerPC using xlC inline assembly. 
- * Recent versions of xlC (>=7.0) _partially_ support GCC inline assembly
- * if you use the option -qasm=gcc but we have had to hack things a bit, in 
- * particular when it comes to clobbered variables. Since this implementation
- * _could_ be buggy, we have separated it from the known-to-be-working gcc
- * one above.
- */
-
-/* Memory barrier: expands to nothing here (no xlc equivalent is known) */
-#define gmx_atomic_memory_barrier()
-
-/* Atomic integer datatype */
-typedef struct gmx_atomic {
-    volatile int value; /*!< Declared volatile, to avoid compiler aliasing */
-} gmx_atomic_t;
-
-/* Spinlock datatype */
-typedef struct gmx_spinlock {
-    volatile unsigned int lock; /*!< Declared volatile, to avoid compiler aliasing */
-} gmx_spinlock_t;
-
-/* Static initializer for a spinlock */
-#define GMX_SPINLOCK_INITIALIZER   { 0 }
-
-/* Plain read / write of the value member */
-#define gmx_atomic_read(a)   ((a)->value)
-#define gmx_atomic_set(a,i)  (((a)->value) = (i))
-
-
-/* Atomically add i to a->value and return the new value (PowerPC, xlC).
- *
- * The lwarx / add / stwcx. sequence is retried (bne- 1b) until the
- * conditional store succeeds, followed by isync. The sum is left in t.
- * Unlike the GCC variant, no clobber list is given.
- */
-static inline int
-gmx_atomic_add_return(gmx_atomic_t *    a, 
-                      int               i)
-{
-    int t;   /* receives the updated sum */
-    
-	__asm__ __volatile__("1:     lwarx   %0,0,%2 \n"
-                         "\t add     %0,%1,%0 \n"
-                         "\t stwcx.  %0,0,%2 \n"
-                         "\t bne-    1b \n"
-                         "\t isync \n"
-                         : "=&r" (t)
-						 : "r" (i), "r" (&a->value) );
-    return t;
-}
-
-
-
-/* Atomically add i to a->value and return the value it had before the
- * addition (PowerPC, xlC).
- *
- * The asm leaves the new sum in t, so the old value is recovered as t - i.
- */
-static inline int
-gmx_atomic_fetch_add(gmx_atomic_t *     a,
-                     int                i)
-{
-    int t;   /* receives the updated sum */
-    
-    __asm__ __volatile__("\t eieio\n"
-                         "1:     lwarx   %0,0,%2 \n"                         
-                         "\t add     %0,%1,%0 \n"
-                         "\t stwcx.  %0,0,%2 \n"
-                         "\t bne-    1b \n"
-                         "\t isync \n"
-                         : "=&r" (t)
-                         : "r" (i), "r" (&a->value));
-    
-    return (t - i);    
-}
-
-
-/* Atomic compare-exchange (PowerPC, xlC).
- *
- * Loads a->value into prev and compares it with oldval. On a mismatch it
- * branches to label 2 without storing; otherwise newval is stored with
- * stwcx., retrying from label 1 if the conditional store fails.
- *
- * Returns the value loaded from a->value, which equals oldval when the
- * exchange took place.
- */
-static inline int
-gmx_atomic_cmpxchg(gmx_atomic_t *       a,
-                   int                  oldval,
-                   int                  newval)
-{
-    int prev;   /* value observed in a->value */
-    
-    __asm__ __volatile__ ("1:    lwarx   %0,0,%2 \n"
-                          "\t cmpw    0,%0,%3 \n"
-                          "\t bne     2f \n"
-                          "\t stwcx.  %4,0,%2 \n"
-                          "\t bne-    1b \n"
-                          "\t sync \n"
-                          "2: \n"
-                          : "=&r" (prev), "=m" (a->value)
-                          : "r" (&a->value), "r" (oldval), "r" (newval), "m" (a->value));
-    
-    return prev;
-}
-
-/* Put the spinlock into its initial state: the lock member is set to 0. */
-static inline void
-gmx_spinlock_init(gmx_spinlock_t *x)
-{
-    const unsigned int initial_state = 0;
-
-    x->lock = initial_state;
-}
-
-
-
-/* Acquire the spinlock, spinning until it is available (PowerPC, xlC).
- *
- * Execution starts at label 1: lwarx loads the lock word; if it is non-zero
- * the code falls back to label 2 and polls it with plain loads (lwzx) until
- * it reads 0. When the word is 0, stwcx. tries to store 1; a failed
- * conditional store also restarts at label 2.
- */
-static inline void
-gmx_spinlock_lock(gmx_spinlock_t *  x)
-{
-    unsigned int tmp;   /* scratch register for the loaded lock word */
-    
-    __asm__ __volatile__("\t b      1f \n"
-                         "2:      lwzx    %0,0,%1 \n"
-                         "\t cmpwi   0,%0,0 \n"
-                         "\t bne+    2b \n"
-                         "1:      lwarx   %0,0,%1 \n"
-                         "\t cmpwi   0,%0,0 \n"
-                         "\t bne-    2b \n"
-                         "\t stwcx.  %2,0,%1 \n"
-                         "\t bne-    2b \n"
-                         "\t isync\n"
-                         : "=&r"(tmp)
-                         : "r"(&x->lock), "r"(1));
-}
-
-
-/* Try to acquire the spinlock without blocking (PowerPC, xlC).
- *
- * The lock word is loaded into old, OR-ed with mask and stored back; the
- * sequence is retried until the conditional store succeeds.
- *
- * Returns 0 if the lock was free (and is now ours), 1 if it was already set.
- */
-static inline int
-gmx_spinlock_trylock(gmx_spinlock_t *  x)
-{
-    unsigned int old, t;      /* old: previous lock word, t: value stored back */
-    unsigned int mask = 1;    /* bit that marks the lock as taken */
-    volatile unsigned int *p = &x->lock;
-    
-    __asm__ __volatile__("\t eieio\n"
-                         "1:      lwarx   %0,0,%4 \n"
-                         "\t or      %1,%0,%3 \n"
-                         "\t stwcx.  %1,0,%4 \n"
-                         "\t bne     1b \n"
-                         "\t sync \n"
-                         : "=&r" (old), "=&r" (t), "=m" (*p)
-                         : "r" (mask), "r" (p), "m" (*p));
-    
-    return ((old & mask) != 0);    
-}
-
-
-/* Release the spinlock (PowerPC, xlC): issue eieio, then store 0 in the
- * lock word. There is no check of which thread held the lock.
- */
-static inline void
-gmx_spinlock_unlock(gmx_spinlock_t *  x)
-{
-    __asm__ __volatile__("\t eieio \n");
-    x->lock = 0;
-}
-
-
-/* Return 1 if the lock word is non-zero (locked), 0 otherwise.
- *
- * The return type is int, matching the other platform sections:
- * gmx_spinlock_wait() uses the result as its loop condition.
- */
-static inline int
-gmx_spinlock_islocked(gmx_spinlock_t *   x)
-{
-    return ( x->lock != 0);
-}
-
-
-/* Spin until gmx_spinlock_islocked() reports the lock as free; the lock
- * itself is not taken.
- */
-static inline void
-gmx_spinlock_wait(gmx_spinlock_t *   x)
-{
-    for (;;)
-    {
-        /* barrier first, then re-check the lock state */
-        gmx_atomic_memory_barrier();
-        if (!gmx_spinlock_islocked(x))
-        {
-            break;
-        }
-    }
-}
-
-
-
-
-#elif (defined(__ia64__) && (defined(__GNUC__) || defined(__INTEL_COMPILER)))
-/* ia64 with GCC or Intel compilers. Since we need to define everything through
-* cmpxchg and fetchadd on ia64, we merge the different compilers and only provide 
-* different implementations for that single function. 
-* Documentation? Check the gcc/x86 section.
-*/
-
-
-/* Atomic integer datatype */
-typedef struct gmx_atomic {
-    volatile int value; /*!< Declared volatile, to avoid compiler aliasing */
-} gmx_atomic_t;
-
-/* Spinlock datatype */
-typedef struct gmx_spinlock {
-    volatile unsigned int lock; /*!< Declared volatile, to avoid compiler aliasing */
-} gmx_spinlock_t;
-
-/* Static initializer for a spinlock */
-#define GMX_SPINLOCK_INITIALIZER   { 0 }
-
-/* Plain read / write of the value member */
-#define gmx_atomic_read(a)   ((a)->value)
-#define gmx_atomic_set(a,i)  (((a)->value) = (i))
-
-
-
-/* Compiler thingies */
-#ifdef __INTEL_COMPILER
-/* Prototypes of the Intel compiler intrinsics used below */
-void __memory_barrier(void);
-int _InterlockedCompareExchange(volatile int *dest, int xchg, int comp);
-unsigned __int64 __fetchadd4_rel(unsigned int *addend, const int increment);
-/* ia64 memory barrier */
-#  define gmx_atomic_memory_barrier() __memory_barrier()
-/* ia64 cmpxchg. The arguments are parenthesized so that an expression such
- * as a cast, e.g. gmx_atomic_cmpxchg((gmx_atomic_t *)x, 0, 1), is applied as
- * a whole before ->value is taken.
- */
-#  define gmx_atomic_cmpxchg(a, oldval, newval) _InterlockedCompareExchange(&(a)->value,(newval),(oldval))
-/* ia64 fetchadd, but it only works with increments +/- 1,4,8,16 */
-#  define gmx_ia64_fetchadd(a, inc)  __fetchadd4_rel(a, inc)
-
-#elif defined __GNUC__  
-/* ia64 memory barrier */
-#  define gmx_atomic_memory_barrier() asm volatile ("":::"memory")
-/* ia64 cmpxchg */
-/* Atomic compare-exchange (ia64, GCC).
- *
- * The first asm statement moves oldval into ar.ccv; the second executes
- * cmpxchg4.acq on a->value with newval, comparing against ar.ccv, and
- * places the result in res.
- *
- * NOTE(review): the two steps are separate asm statements, so correctness
- * relies on ar.ccv not being changed between them - confirm.
- *
- * Returns res, the value delivered by cmpxchg4.
- */
-static inline int
-gmx_atomic_cmpxchg(gmx_atomic_t *   a,
-                   int              oldval,
-                   int              newval)
-{
-    volatile int res;
-    asm volatile ("mov ar.ccv=%0;;" :: "rO"(oldval));
-    asm volatile ("cmpxchg4.acq %0=[%1],%2,ar.ccv":                    
-                  "=r"(res) : "r"(&a->value), "r"(newval) : "memory"); 
-                          
-    return res;
-}
-
-
-/* fetchadd, but on ia64 it only works with increments +/- 1,4,8,16.
- * Statement-expression macro: executes fetchadd4.rel on the address a and
- * evaluates to res, the value delivered by the instruction. The increment
- * is passed with an "i" (immediate) constraint, so inc must be usable as
- * a compile-time constant.
- */
-#define gmx_ia64_fetchadd(a, inc)                                             \
-({  unsigned long res;                                                        \
-    asm volatile ("fetchadd4.rel %0=[%1],%2"                                  \
-                  : "=r"(res) : "r"(a), "i" (inc) : "memory");                \
-                  res;                                                        \
-})
-
-
-#else /* Unknown compiler */
-#  error Unknown ia64 compiler (not GCC or ICC) - modify gmx_thread.h!
-#endif
-
-
-
-/* Atomically add i to a->value and return the new value (ia64).
- *
- * Two paths: gmx_ia64_fetchadd() when __builtin_constant_p(i) holds and i
- * is one of +/-1, 4, 8, 16; otherwise a read / compare-exchange loop that
- * is repeated until gmx_atomic_cmpxchg() reports that a->value still held
- * the value that was read.
- */
-static inline int
-gmx_atomic_add_return(gmx_atomic_t *       a, 
-                      volatile int         i)
-{
-    volatile int oldval,newval;    
-    volatile int __i = i;
-
-    /* Use fetchadd if, and only if, the increment value can be determined
-     * at compile time (otherwise this check is optimized away) and it is
-     * a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
-     */                         
-    if (__builtin_constant_p(i) &&
-        ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
-          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) ) )
-    {
-        oldval = gmx_ia64_fetchadd(a,__i);
-        newval = oldval + i;
-    }
-    else
-    {
-        /* Use compare-exchange addition that works with any value */
-        do
-        {
-            oldval = gmx_atomic_read(a);
-            newval = oldval + i;
-        }
-        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
-    }
-    /* new value = value before the addition + i */
-    return newval;
-}
-
-
-
-/* Atomically add i to a->value and return the value it had before the
- * addition (ia64).
- *
- * Two paths: gmx_ia64_fetchadd() when __builtin_constant_p(i) holds and i
- * is one of +/-1, 4, 8, 16; otherwise a read / compare-exchange loop that
- * is repeated until gmx_atomic_cmpxchg() reports that a->value still held
- * the value that was read.
- */
-static inline int
-gmx_atomic_fetch_add(gmx_atomic_t *     a,
-                     volatile int       i)
-{
-    volatile int oldval,newval;    
-    volatile int __i = i;
-    
-    /* Use ia64 fetchadd if, and only if, the increment value can be determined
-     * at compile time (otherwise this check is optimized away) and it is
-     * a value supported by fetchadd (1,4,8,16,-1,-4,-8,-16).
-     */                         
-    if (__builtin_constant_p(i) &&
-        ( (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
-          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) ) )
-    {
-        oldval = gmx_ia64_fetchadd(a,__i);
-        newval = oldval + i;
-    }
-    else
-    {
-        /* Use compare-exchange addition that works with any value */
-        do
-        {
-            oldval = gmx_atomic_read(a);
-            newval = oldval + i;
-        }
-        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
-    }
-    /* newval is only needed for the exchange; the caller gets the old value */
-    return oldval;
-}
-
-
-/* Put the spinlock into its initial state: the lock member is set to 0. */
-static inline void
-gmx_spinlock_init(gmx_spinlock_t *x)
-{
-    const unsigned int initial_state = 0;
-
-    x->lock = initial_state;
-}
-
-
-/* Acquire the spinlock (ia64), spinning until it is available.
- *
- * The spinlock is accessed through a gmx_atomic_t pointer: a compare-exchange
- * of 0 -> 1 takes the lock. While that fails, the word is polled (with a
- * memory barrier per iteration) until it reads 0, then the exchange is
- * attempted again.
- */
-static inline void
-gmx_spinlock_lock(gmx_spinlock_t *   x)
-{
-    gmx_atomic_t *word = (gmx_atomic_t *) x;
-
-    while (gmx_atomic_cmpxchg(word, 0, 1) != 0)
-    {
-        /* somebody else holds the lock: wait until it looks free */
-        while (word->value != 0)
-        {
-            gmx_atomic_memory_barrier();
-        }
-    }
-}
-
-
-/* Try to acquire the spinlock without blocking (ia64).
- *
- * A compare-exchange of 0 -> 1 is attempted once on the lock word.
- * Returns 0 if the lock was taken, 1 if it was already held.
- */
-static inline int
-gmx_spinlock_trylock(gmx_spinlock_t *   x)
-{
-    /* Keep the cast in a named pointer: gmx_atomic_cmpxchg may be a macro
-     * that expands its first argument as "&a->value", in which case a cast
-     * expression passed directly would bind to "x->value" instead of "x".
-     */
-    gmx_atomic_t *a = (gmx_atomic_t *) x;
-
-    return (gmx_atomic_cmpxchg(a, 0, 1) != 0);
-}
-
-
-/* Release the spinlock (ia64): memory barrier first, then store 0 in the
- * lock word. There is no check of which thread held the lock.
- */
-static inline void
-gmx_spinlock_unlock(gmx_spinlock_t *   x)
-{
-    gmx_atomic_memory_barrier();
-    x->lock = 0;
-}
-
-
-/* Return 1 if the lock word is non-zero (locked), 0 otherwise. */
-static inline int
-gmx_spinlock_islocked(gmx_spinlock_t *   x)
-{
-    const unsigned int lock_word = x->lock;
-
-    return (lock_word != 0) ? 1 : 0;
-}
-
-
-/* Spin until gmx_spinlock_islocked() reports the lock as free; the lock
- * itself is not taken.
- */
-static inline void
-gmx_spinlock_wait(gmx_spinlock_t *   x)
-{
-    for (;;)
-    {
-        /* barrier first, then re-check the lock state */
-        gmx_atomic_memory_barrier();
-        if (!gmx_spinlock_islocked(x))
-        {
-            break;
-        }
-    }
-}
-
-
-#undef gmx_ia64_fetchadd
-
-
-
-#elif (defined(__hpux) || defined(__HP_cc)) && defined(__ia64)
-/* HP compiler on ia64 */
-#include <machine/sys/inline.h>
-
-/* Memory barrier through the HP compiler intrinsic _Asm_mf() */
-#define gmx_atomic_memory_barrier() _Asm_mf()
-
-/* Fetch-and-add through the HP compiler intrinsic _Asm_fetchadd().
- * a is cast to UInt32* and i to unsigned int.
- * NOTE(review): this macro spells the names UInt32 and LDHINT_NONE, while
- * gmx_atomic_cmpxchg() in this section uses Uint32 and _LDHINT_NONE -
- * verify against <machine/sys/inline.h> which spelling is correct.
- */
-#define gmx_hpia64_fetchadd(a, i)                           \
-    _Asm_fetchadd((_Asm_fasz)_FASZ_W,(_Asm_sem)_SEM_REL,    \
-                  (UInt32*)a,(unsigned int) i,              \
-                  (_Asm_ldhint)LDHINT_NONE)
- 
-
-/* Atomic integer datatype */
-typedef struct gmx_atomic {
-    volatile int value; /*!< Declared volatile, to avoid compiler aliasing */
-} gmx_atomic_t;
-
-/* Spinlock datatype */
-typedef struct gmx_spinlock {
-    volatile unsigned int lock; /*!< Declared volatile, to avoid compiler aliasing */
-} gmx_spinlock_t;
-
-
-/* Atomic compare-exchange (ia64, HP compiler).
- *
- * oldval is first moved into the application register selected by
- * _AREG_CCV with _Asm_mov_to_ar(); _Asm_cmpxchg() is then executed on a
- * with newval, and its result is returned.
- *
- * NOTE(review): SZ_W is spelled without a leading underscore here, unlike
- * _FASZ_W / _SEM_ACQ / _LDHINT_NONE, and the type is spelled Uint32 while
- * gmx_hpia64_fetchadd() uses UInt32 - verify against
- * <machine/sys/inline.h>.
- */
-static inline int
-gmx_atomic_cmpxchg(gmx_atomic_t *   a,
-                   int              oldval,
-                   int              newval)
-{
-    int ret;
-    
-    _Asm_mov_to_ar((_Asm_app_reg)_AREG_CCV,(Uint32)oldval,                  
-                   (_Asm_fence)(_UP_CALL_FENCE | _UP_SYS_FENCE |         
-                                _DOWN_CALL_FENCE | _DOWN_SYS_FENCE));
-                   
-    ret = _Asm_cmpxchg((_Asm_sz)SZ_W,(_Asm_sem)_SEM_ACQ,(Uint32*)a,    
-                       (Uint32)newval,(_Asm_ldhint)_LDHINT_NONE);
-                   
-    return ret;
-}
-
-
-
-/* Static initializer for a spinlock: the lock member starts at 0 */
-#define GMX_SPINLOCK_INITIALIZER   { 0 }
-
-
-/* Plain read / write of the volatile value member */
-#define gmx_atomic_read(a)   ((a)->value) 
-#define gmx_atomic_set(a,i)  (((a)->value) = (i))
-
-
-static inline void 
-gmx_atomic_add_return(gmx_atomic_t *       a, 
-                      int                  i)
-{
-    int old,new;    
-    int __i = i;
-    
-    /* On HP-UX we don't know any macro to determine whether the increment
-     * is known at compile time, but hopefully the call uses something simple
-     * like a constant, and then the optimizer should be able to do the job.
-     */                         
-    if (  (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
-          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) )
-    {
-        oldval = gmx_hpia64_fetchadd(a,__i);
-        newval = oldval + i;
-    }
-    else
-    {
-        /* Use compare-exchange addition that works with any value */
-        do
-        {
-            oldval = gmx_atomic_read(a);
-            newval = oldval + i;
-        }
-        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
-    }
-    return newval;
-}
-
-
-
-static inline int
-gmx_atomic_fetch_add(gmx_atomic_t *     a,
-                     int                i)
-{
-    int oldval,newval;    
-    int __i = i;
-    
-    /* On HP-UX we don't know any macro to determine whether the increment
-     * is known at compile time, but hopefully the call uses something simple
-     * like a constant, and then the optimizer should be able to do the job.
-     */                         
-    if (  (__i ==   1) || (__i ==   4)  || (__i ==   8) || (__i ==  16) ||         
-          (__i ==  -1) || (__i ==  -4)  || (__i ==  -8) || (__i == -16) )
-    {
-        oldval = gmx_hpia64_fetchadd(a,__i);
-        newval = oldval + i;
-    }
-    else
-    {
-        /* Use compare-exchange addition that works with any value */
-        do
-        {
-            oldval = gmx_atomic_read(a);
-            newval = oldval + i;
-        }
-        while(gmx_atomic_cmpxchg(a,oldval,newval) != oldval);
-    }
-    return oldval;
-}
-
-
-static inline void
-gmx_spinlock_init(gmx_spinlock_t *x)
-{
-    x->lock = 0;
-}
-
-
-
-
-
-static inline void
-gmx_spinlock_trylock(gmx_spinlock_t *x)
-{
-    int rc;
-
-    rc = _Asm_xchg((_Asm_sz)_SZ_W, (unsigned int *)x, 1        
-                    (_Asm_ldhit)_LDHINT_NONE);
-    
-    return ( (rc>0) ? 1 : 0);
-}
-
-
-static inline void
-gmx_spinlock_lock(gmx_spinlock_t *x)
-{
-    int      status = 1;
-    
-    do
-    {
-        if( *((unsigned int *)x->lock) == 0 ) 
-        {
-            status = gmx_spinlock_trylock(x);
-        }
-    } while( status != 0);
-}
-
-
-static inline void
-gmx_spinlock_unlock(gmx_spinlock_t *   x)
-{
-    _Asm_fetchadd((_Asm_fasz)_SZ_W,(_Asm_sem)_SEM_REL,                  
-                  (unsigned int *)x,-1,(_Asm_ldhint)_LDHINT_NONE);
-}
-
-
-
-static inline void
-gmx_spinlock_islocked(gmx_spinlock_t *   x)
-{
-    return ( x->lock != 0 );
-}
-
-
-
-static inline void
-gmx_spinlock_wait(gmx_spinlock_t *   x)
-{
-    do
-    {
-        gmx_atomic_memory_barrier(); 
-    } 
-    while(gmx_spinlock_islocked(x));
-}
-
-
-#undef gmx_hpia64_fetchadd
-
-
-
-#elif (defined(_MSC_VER) && (_MSC_VER >= 1200))
-/* Microsoft Visual C on x86, define taken from FFTW who got it from Morten Nissov */
-
-#include <windows.h>
-
-#define gmx_atomic_memory_barrier()
-
-
-typedef struct gmx_atomic
-{
-	LONG volatile	   value;      /*!< Volatile, to avoid compiler aliasing */
-}
-gmx_atomic_t;
-
-
-typedef struct gmx_spinlock
-{
-    LONG volatile      lock;      /*!< Volatile, to avoid compiler aliasing */
-}
-gmx_spinlock_t;
-
-
-#define GMX_SPINLOCK_INITIALIZER   { 0 }
-
-
-
-
-#define gmx_atomic_read(a)  ((a)->value) 
-#define gmx_atomic_set(a,i)  (((a)->value) = (i))
-
-
-
-
-#define gmx_atomic_fetch_add(a, i)  \
-    InterlockedExchangeAdd((LONG volatile *)a, (LONG) i)
-
-#define gmx_atomic_add_return(a, i)  \
-    ( i + InterlockedExchangeAdd((LONG volatile *)a, (LONG) i) )
-
-#define gmx_atomic_cmpxchg(a, oldval, newval) \
-    InterlockedCompareExchange((LONG volatile *)a, (LONG) newval, (LONG) oldval)
-
-
-# define gmx_spinlock_lock(x)   \
-    while((InterlockedCompareExchange((LONG volatile *)&x, 1, 0))!=0)
-
-
-#define gmx_spinlock_trylock(x)   \
-    InterlockedCompareExchange((LONG volatile *)&x, 1, 0)
-
-
-static inline void
-gmx_spinlock_unlock(gmx_spinlock_t *   x)
-{
-    x->lock = 0;
-}
-
-
-static inline int
-gmx_spinlock_islocked(gmx_spinlock_t *   x)
-{
-    return (*(volatile signed char *)(&(x)->lock) != 0);
-}
-
-
-static inline void
-gmx_spinlock_wait(gmx_spinlock_t *   x)
-{
-    while(gmx_spinlock_islocked(x))
-    {
-        Sleep(0);
-    }
-}
-
-
-
-#elif defined(__xlC__) && defined (_AIX)
-/* IBM xlC compiler on AIX */
-#include <sys/atomic_op.h>
-
-
-#define gmx_atomic_memory_barrier()
-
-
-typedef struct gmx_atomic
-{
-	volatile int	   value;      /*!< Volatile, to avoid compiler aliasing */
-}
-gmx_atomic_t;
-
-
-typedef struct gmx_spinlock
-{
-    volatile unsigned int      lock;      /*!< Volatile, to avoid compiler aliasing */
-}
-gmx_spinlock_t;
-
-
-static inline int
-gmx_atomic_cmpxchg(gmx_atomic_t *    a,
-                   int               oldval,
-                   int               newval)
-{
-    int t;
-    
-    if(__check_lock((atomic_p)&a->value, oldval, newval))
-    {
-        /* Not successful - value had changed in memory. Reload value. */
-        t = a->value;
-    }
-    else
-    {
-        /* replacement suceeded */
-        t = oldval;
-    }
-    return t;        
-}
-
-
-static inline void 
-gmx_atomic_add_return(gmx_atomic_t *       a, 
-                      int                  i)
-{
-    int oldval,newval;    
-    
-    do
-    {
-        oldval = gmx_atomic_read(a);
-        newval = oldval + i;
-    }
-    while(__check_lock((atomic_p)&a->value, oldval, newval));
-
-    return newval;
-}
-
-
-
-static inline void 
-gmx_atomic_fetch_add(gmx_atomic_t *       a, 
-                     int                  i)
-{
-    int oldval,newval;    
-    
-    do
-    {
-        oldval = gmx_atomic_read(a);
-        newval = oldval + i;
-    }
-    while(__check_lock((atomic_p)&a->value, oldval, newval));
-    
-    return oldval;
-}
-
-
-static inline void
-gmx_spinlock_init(gmx_spinlock_t *   x)
-{
-    __clear_lock((atomic_p)x,0);
-}
-
-
-static inline void
-gmx_spinlock_lock(gmx_spinlock_t *   x)
-{
-    do
-    {
-        ;
-    }
-    while(__check_lock((atomic_p)x, 0, 1));
-}
-
-
-static inline void
-gmx_spinlock_trylock(gmx_spinlock_t *   x)
-{
-    /* Return 0 if we got the lock */
-    return (__check_lock((atomic_p)x, 0, 1) != 0)
-}
-
-
-static inline void
-gmx_spinlock_unlock(gmx_spinlock_t *   x)
-{
-    __clear_lock((atomic_p)x,0);
-}
-
-
-static inline void
-gmx_spinlock_islocked(gmx_spinlock_t *   x)
-{
-    return (*((atomic_p)x) != 0);
-}
-
-
-static inline void
-gmx_spinlock_wait(gmx_spinlock_t *    x)
-{
-    while(gmx_spinlock_islocked(x)) { ; } 
-}
-
-
-#else
-/* No atomic operations, use mutex fallback. Documentation is in x86 section */
-
-
-#define gmx_atomic_memory_barrier()
-
-/* System mutex used for locking to guarantee atomicity */
-static pthread_mutex_t
-gmx_atomic_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-
-typedef struct gmx_atomic
-{
-	int	   value;
-}
-gmx_atomic_t;
-
-#define gmx_spinlock_t     pthread_mutex_t
-
- 
-#  define GMX_SPINLOCK_INITIALIZER   PTHREAD_MUTEX_INITIALIZER
-
-/* Since mutexes guarantee memory barriers this works fine */
-#define gmx_atomic_read(a)   ((a)->value)
-
-
-static inline void
-gmx_atomic_set(gmx_atomic_t *   a, 
-               int              i)
-{
-    /* Mutexes here are necessary to guarantee memory visibility */
-    pthread_mutex_lock(&gmx_atomic_mutex);
-    a->value = i;
-    pthread_mutex_unlock(&gmx_atomic_mutex);
-}
-
-
-static inline int
-gmx_atomic_add_return(gmx_atomic_t *   a, 
-                      int              i)
-{
-    int t;
-    pthread_mutex_lock(&gmx_atomic_mutex);
-    t = a->value + i;
-    a->value = t;
-    pthread_mutex_unlock(&gmx_atomic_mutex);
-    return t;
-}
-
-
-static inline int
-gmx_atomic_fetch_add(gmx_atomic_t *   a,
-                     int              i)
-{
-    int old_value;
-    
-    pthread_mutex_lock(&gmx_atomic_mutex);
-    old_value  = a->value;
-    a->value   = old_value + i;
-    pthread_mutex_unlock(&gmx_atomic_mutex);
-    return old_value;
-}
-
-
-static inline int
-gmx_atomic_cmpxchg(gmx_atomic_t *           a, 
-                   int                      oldv,
-                   int                      newv)
-{
-    int t;
-    
-    pthread_mutex_lock(&gmx_atomic_mutex);
-    t = a->value;
-    if (t == oldv)
-    {
-        a->value = newv;
-    }
-    pthread_mutex_unlock(&gmx_atomic_mutex);
-    return t;
-}
-
-
-#define gmx_spinlock_init(lock)       pthread_mutex_init(lock)
-#define gmx_spinlock_lock(lock)       pthread_mutex_lock(lock)
-#define gmx_spinlock_trylock(lock)    pthread_mutex_trylock(lock)
-#define gmx_spinlock_unlock(lock)     pthread_mutex_unlock(lock)
-
-static inline int
-gmx_spinlock_islocked(gmx_spinlock_t *   x)
-{
-    int rc;
-    
-    if(gmx_spinlock_trylock(x) != 0)
-    {
-        /* It was locked */
-        return 1;
-    }
-    else
-    {
-        /* We just locked it */
-        gmx_spinlock_unlock(x);
-        return 0;
-    }
-}
-
-
-static inline void
-gmx_spinlock_wait(gmx_spinlock_t *   x)
-{
-    int rc;
-    
-    gmx_spinlock_lock(x);
-    /* Got the lock now, so the waiting is over */
-    gmx_spinlock_unlock(x);
-}
-
-
-#endif
-
-
-
-
-/*! \brief Spinlock-based barrier type
- *
- *  This barrier has the same functionality as the standard
- *  gmx_thread_barrier_t, but since it is based on spinlocks
- *  it provides faster synchronization at the cost of busy-waiting.
- *
- *  Variables of this type should be initialized by calling
- *  gmx_spinlock_barrier_init() to set the number of threads
- *  that should be synchronized.
- */
-typedef struct gmx_spinlock_barrier
-{
-	gmx_atomic_t            count;     /*!< Number of threads remaining     */
-	int                     threshold; /*!< Total number of threads         */
-	volatile int            cycle;     /*!< Current cycle (alternating 0/1) */
-}
-gmx_spinlock_barrier_t;
- 
-
-
-
-/*! \brief Initialize spinlock-based barrier
- *
- *  \param barrier  Pointer to _spinlock_ barrier. Note that this is not
- *                  the same datatype as the full, thread based, barrier.
- *  \param count    Number of threads to synchronize. All threads
- *                  will be released after \a count calls to 
- *                  gmx_spinlock_barrier_wait().  
- */
-static inline void 
-gmx_spinlock_barrier_init(gmx_spinlock_barrier_t *         barrier,
-                          int                              count)
-{
-	barrier->threshold = count;
-	barrier->cycle     = 0;
-	gmx_atomic_set(&(barrier->count),count);
-}
-
-
-
-
-/*! \brief Perform busy-waiting barrier synchronization
-*
-*  This routine blocks until it has been called N times,
-*  where N is the count value the barrier was initialized with.
-*  After N total calls all threads return. The barrier automatically
-*  cycles, and thus requires another N calls to unblock another time.
-*
-*  Note that spinlock-based barriers are completely different from
-*  standard ones (using mutexes and condition variables), only the 
-*  functionality and names are similar.
-*
-*  \param barrier  Pointer to previously create barrier.
-*
-*  \return The last thread returns -1, all the others 0.
-*/
-static inline int
-gmx_spinlock_barrier_wait(gmx_spinlock_barrier_t *   barrier)
-{
-  int    cycle;
-  int    status;
-  
-  /* We don't need to lock or use atomic ops here, since the cycle index 
-	* cannot change until after the last thread has performed the check
-	* further down. Further, they cannot reach this point in the next 
-	* barrier iteration until all of them have been released, and that 
-	* happens after the cycle value has been updated.
-	*
-	* No synchronization == fast synchronization.
-	*/
-  cycle = barrier->cycle;
-  
-  /* Decrement the count atomically and check if it is zero.
-	* This will only be true for the last thread calling us.
-	*/
-  if( gmx_atomic_add_return( &(barrier->count), -1 ) == 0)
-  { 
-	gmx_atomic_set(&(barrier->count), barrier->threshold);
-	barrier->cycle = !barrier->cycle;
-    
-	status = -1;
-  }
-  else
-  {
-	/* Wait until the last thread changes the cycle index.
-	* We are both using a memory barrier, and explicit
-	* volatile pointer cast to make sure the compiler
-	* doesn't try to be smart and cache the contents.
-	*/
-	do
-	{ 
-	  gmx_atomic_memory_barrier();
-	} 
-	while( *(volatile int *)(&(barrier->cycle)) == cycle);
-	
-	status = 0;
-  }
-  return status;
-}
-
-
-
-
-#ifdef __cplusplus
-}
-#endif
-
-
-#endif /* _GMX_ATOMIC_H_ */
--- a/platforms/cpu/include/CpuCustomGBForce.h
+++ b/platforms/cpu/include/CpuCustomGBForce.h
@@ -31,6 +31,7 @@
 #include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
+#include <atomic>
 #include <map>
 #include <set>
 #include <vector>
@@ -63,7 +64,7 @@ private:
    const std::map<std::string, double>* globalParameters;
    std::vector<AlignedArray<float> >* threadForce;
    bool includeForce, includeEnergy;
-    void* atomicCounter;
+    std::atomic<int> atomicCounter;
    
    /**
     * This routine contains the code executed by each thread.

--- a/platforms/cpu/include/CpuCustomManyParticleForce.h
+++ b/platforms/cpu/include/CpuCustomManyParticleForce.h
@@ -34,6 +34,7 @@
 #include "openmm/internal/vectorize.h"
 #include "lepton/CompiledExpression.h"
 #include "lepton/ParsedExpression.h"
+#include <atomic>
 #include <map>
 #include <set>
 #include <utility>
@@ -69,7 +70,7 @@ private:
    const std::map<std::string, double>* globalParameters;
    std::vector<AlignedArray<float> >* threadForce;
    bool includeForces, includeEnergy;
-    void* atomicCounter;
+    std::atomic<int> atomicCounter;

    /**
     * This routine contains the code executed by each thread.

--- a/platforms/cpu/include/CpuCustomNonbondedForce.h
+++ b/platforms/cpu/include/CpuCustomNonbondedForce.h
@@ -30,6 +30,7 @@
 #include "openmm/internal/CompiledExpressionSet.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
+#include <atomic>
 #include <map>
 #include <set>
 #include <utility>
@@ -147,7 +148,7 @@ private:
    const std::map<std::string, double>* globalParameters;
    std::vector<AlignedArray<float> >* threadForce;
    bool includeForce, includeEnergy;
-    void* atomicCounter;
+    std::atomic<int> atomicCounter;

    /**
     * This routine contains the code executed by each thread.

--- a/platforms/cpu/include/CpuGBSAOBCForce.h
+++ b/platforms/cpu/include/CpuGBSAOBCForce.h

-/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2018 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -28,6 +28,7 @@
 #include "AlignedArray.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
+#include <atomic>
 #include <set>
 #include <utility>
 #include <vector>
@@ -112,7 +113,7 @@ private:
    float const* posq;
    std::vector<AlignedArray<float> >* threadForce;
    bool includeEnergy;
-    void* atomicCounter;
+    std::atomic<int> atomicCounter;
  
    static const int NUM_TABLE_POINTS;
    static const float TABLE_MIN;

--- a/platforms/cpu/include/CpuGayBerneForce.h
+++ b/platforms/cpu/include/CpuGayBerneForce.h
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2016-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2016-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -91,7 +91,7 @@ private:
    Vec3 const* positions;
    std::vector<AlignedArray<float> >* threadForce;
    Vec3* boxVectors;
-    void* atomicCounter;
+    std::atomic<int> atomicCounter;

    void computeEllipsoidFrames(const std::vector<Vec3>& positions);
    

--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -35,8 +35,8 @@
 #include "AlignedArray.h"
 #include "openmm/Vec3.h"
 #include "windowsExportCpu.h"
-#include "openmm/internal/gmx_atomic.h"
 #include "openmm/internal/ThreadPool.h"
+#include <atomic>
 #include <set>
 #include <utility>
 #include <vector>
@@ -75,7 +75,7 @@ private:
    int numAtoms;
    bool usePeriodic;
    float maxDistance;
-    gmx_atomic_t atomicCounter;
+    std::atomic<int> atomicCounter;
 };

 } // namespace OpenMM

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h

-/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2018 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -30,6 +30,7 @@
 #include "ReferencePairIxn.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
+#include <atomic>
 #include <set>
 #include <utility>
 #include <vector>
@@ -200,7 +201,7 @@ protected:
        bool includeEnergy;
        float inverseRcut6;
        float inverseRcut6Expterm;
-        void* atomicCounter;
+        std::atomic<int> atomicCounter;

        static const float TWO_OVER_SQRT_PI;
        static const int NUM_TABLE_POINTS;

--- a/platforms/cpu/src/CpuCustomGBForce.cpp
+++ b/platforms/cpu/src/CpuCustomGBForce.cpp
@@ -28,7 +28,6 @@
 #include "SimTKOpenMMUtilities.h"
 #include "ReferenceForce.h"
 #include "CpuCustomGBForce.h"
-#include "openmm/internal/gmx_atomic.h"

 using namespace OpenMM;
 using namespace std;
@@ -191,13 +190,11 @@ void CpuCustomGBForce::calculateIxn(int numberOfAtoms, float* posq, vector<vecto
    this->includeForce = includeForce;
    this->includeEnergy = includeEnergy;
    threadEnergy.resize(threads.getNumThreads());
-    gmx_atomic_t counter;
-    this->atomicCounter = &counter;

    // Calculate the first computed value.

    auto task = [&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); };
-    gmx_atomic_set(&counter, 0);
+    atomicCounter = 0;
    threads.execute(task);
    threads.waitForThreads();

@@ -217,7 +214,7 @@ void CpuCustomGBForce::calculateIxn(int numberOfAtoms, float* posq, vector<vecto
    // Calculate the energy terms.

    for (int i = 0; i < (int) threadData[0]->energyExpressions.size(); i++) {
-        gmx_atomic_set(&counter, 0);
+        atomicCounter = 0;
        threads.execute(task);
        threads.waitForThreads();
    }
@@ -229,7 +226,7 @@ void CpuCustomGBForce::calculateIxn(int numberOfAtoms, float* posq, vector<vecto
    
    // Apply the chain rule to evaluate forces.

-    gmx_atomic_set(&counter, 0);
+    atomicCounter = 0;
    threads.resumeThreads();
    threads.waitForThreads();

@@ -361,7 +358,7 @@ void CpuCustomGBForce::calculateParticlePairValue(int index, ThreadData& data, i
        // Loop over all pairs in the neighbor list.

        while (true) {
-            int blockIndex = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int blockIndex = atomicCounter++;
            if (blockIndex >= neighborList->getNumBlocks())
                break;
            const int blockSize = neighborList->getBlockSize();
@@ -386,7 +383,7 @@ void CpuCustomGBForce::calculateParticlePairValue(int index, ThreadData& data, i
        // Perform an O(N^2) loop over all atom pairs.

        while (true) {
-            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int i = atomicCounter++;
            if (i >= numAtoms)
                break;
            for (int j = i+1; j < numAtoms; j++) {
@@ -456,7 +453,7 @@ void CpuCustomGBForce::calculateParticlePairEnergyTerm(int index, ThreadData& da
        // Loop over all pairs in the neighbor list.

        while (true) {
-            int blockIndex = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int blockIndex = atomicCounter++;
            if (blockIndex >= neighborList->getNumBlocks())
                break;
            const int blockSize = neighborList->getBlockSize();
@@ -480,7 +477,7 @@ void CpuCustomGBForce::calculateParticlePairEnergyTerm(int index, ThreadData& da
        // Perform an O(N^2) loop over all atom pairs.

        while (true) {
-            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int i = atomicCounter++;
            if (i >= numAtoms)
                break;
            for (int j = i+1; j < numAtoms; j++) {
@@ -543,7 +540,7 @@ void CpuCustomGBForce::calculateChainRuleForces(ThreadData& data, int numAtoms,
        // Loop over all pairs in the neighbor list.

        while (true) {
-            int blockIndex = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int blockIndex = atomicCounter++;
            if (blockIndex >= neighborList->getNumBlocks())
                break;
            const int blockSize = neighborList->getBlockSize();
@@ -567,7 +564,7 @@ void CpuCustomGBForce::calculateChainRuleForces(ThreadData& data, int numAtoms,
        // Perform an O(N^2) loop over all atom pairs.

        while (true) {
-            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int i = atomicCounter++;
            if (i >= numAtoms)
                break;
            for (int j = i+1; j < numAtoms; j++) {

--- a/platforms/cpu/src/CpuCustomManyParticleForce.cpp
+++ b/platforms/cpu/src/CpuCustomManyParticleForce.cpp
@@ -32,7 +32,6 @@
 #include "ReferenceTabulatedFunction.h"
 #include "openmm/internal/CustomManyParticleForceImpl.h"
 #include "lepton/CustomFunction.h"
-#include "openmm/internal/gmx_atomic.h"

 using namespace OpenMM;
 using namespace std;
@@ -99,9 +98,7 @@ void CpuCustomManyParticleForce::calculateIxn(AlignedArray<float>& posq, vector<
    this->threadForce = &threadForce;
    this->includeForces = includeForces;
    this->includeEnergy = includeEnergy;
-    gmx_atomic_t counter;
-    gmx_atomic_set(&counter, 0);
-    this->atomicCounter = &counter;
+    atomicCounter = 0;
    if (useCutoff) {
        // Construct a neighbor list.  We use CpuNeighborList to do this, but then copy the result
        // into a new data structure.  This is needed because in UniqueCentralParticle mode, the
@@ -156,7 +153,7 @@ void CpuCustomManyParticleForce::threadComputeForce(ThreadPool& threads, int thr
        // Loop over interactions from the neighbor list.
        
        while (true) {
-            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int i = atomicCounter++;
            if (i >= numParticles)
                break;
            particleIndices[0] = i;
@@ -170,7 +167,7 @@ void CpuCustomManyParticleForce::threadComputeForce(ThreadPool& threads, int thr
        for (int i = 0; i < numParticles; i++)
            particles[i] = i;
        while (true) {
-            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int i = atomicCounter++;
            if (i >= numParticles)
                break;
            particleIndices[0] = i;

--- a/platforms/cpu/src/CpuCustomNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuCustomNonbondedForce.cpp

-/* Portions copyright (c) 2009-2017 Stanford University and Simbios.
+/* Portions copyright (c) 2009-2018 Stanford University and Simbios.
 * Contributors: Peter Eastman
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -28,7 +28,6 @@
 #include "SimTKOpenMMUtilities.h"
 #include "ReferenceForce.h"
 #include "CpuCustomNonbondedForce.h"
-#include "openmm/internal/gmx_atomic.h"

 using namespace OpenMM;
 using namespace std;
@@ -134,9 +133,7 @@ void CpuCustomNonbondedForce::calculatePairIxn(int numberOfAtoms, float* posq, v
    this->includeForce = includeForce;
    this->includeEnergy = includeEnergy;
    threadEnergy.resize(threads.getNumThreads());
-    gmx_atomic_t counter;
-    gmx_atomic_set(&counter, 0);
-    this->atomicCounter = &counter;
+    atomicCounter = 0;
    
    // Signal the threads to start running and wait for them to finish.
    
@@ -177,7 +174,7 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
        // The user has specified interaction groups, so compute only the requested interactions.
        
        while (true) {
-            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int i = atomicCounter++;
            if (i >= groupInteractions.size())
                break;
            int atom1 = groupInteractions[i].first;
@@ -193,7 +190,7 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
        // We are using a cutoff, so get the interactions from the neighbor list.

        while (true) {
-            int blockIndex = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int blockIndex = atomicCounter++;
            if (blockIndex >= neighborList->getNumBlocks())
                break;
            const int blockSize = neighborList->getBlockSize();
@@ -219,7 +216,7 @@ void CpuCustomNonbondedForce::threadComputeForce(ThreadPool& threads, int thread
        // Every particle interacts with every other one.
        
        while (true) {
-            int ii = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int ii = atomicCounter++;
            if (ii >= numberOfAtoms)
                break;
            for (int jj = ii+1; jj < numberOfAtoms; jj++) {

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
-/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2018 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -24,7 +24,6 @@
 #include "CpuGBSAOBCForce.h"
 #include "SimTKOpenMMRealType.h"
 #include "openmm/internal/vectorize.h"
-#include "openmm/internal/gmx_atomic.h"
 #include <algorithm>
 #include <cmath>
 #include <cstdlib>
@@ -95,21 +94,19 @@ void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<Align
    threadBornForces.resize(numThreads);
    for (int i = 0; i < numThreads; i++)
        threadBornForces[i].resize(particleParams.size()+3);
-    gmx_atomic_t counter;
-    this->atomicCounter = &counter;
    
    // Signal the threads to start running and wait for them to finish.
    
-    gmx_atomic_set(&counter, 0);
+    atomicCounter = 0;
    threads.execute([&] (ThreadPool& threads, int threadIndex) { threadComputeForce(threads, threadIndex); });
    threads.waitForThreads(); // Compute Born radii
-    gmx_atomic_set(&counter, 0);
+    atomicCounter = 0;
    threads.resumeThreads();
    threads.waitForThreads(); // Compute surface area term
-    gmx_atomic_set(&counter, 0);
+    atomicCounter = 0;
    threads.resumeThreads();
    threads.waitForThreads(); // First loop
-    gmx_atomic_set(&counter, 0);
+    atomicCounter = 0;
    threads.resumeThreads();
    threads.waitForThreads(); // Second loop
    
@@ -138,7 +135,7 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    // Calculate Born radii

    while (true) {
-        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        int blockStart = atomicCounter.fetch_add(4);
        if (blockStart >= numParticles)
            break;
        int numInBlock = min(4, numParticles-blockStart);
@@ -215,7 +212,7 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    for (int i = 0; i < numParticles; i++)
        bornForces[i] = 0.0f;
    while (true) {
-        int atomI = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+        int atomI = atomicCounter++;
        if (atomI >= numParticles)
            break;
        if (bornRadii[atomI] > 0) {
@@ -240,7 +237,7 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    else
        preFactor = 0.0f;
    while (true) {
-        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        int blockStart = atomicCounter.fetch_add(4);
        if (blockStart >= numParticles)
            break;
        int numInBlock = min(4, numParticles-blockStart);
@@ -318,7 +315,7 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
    // Second loop of Born energy computation.

    while (true) {
-        int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
+        int blockStart = atomicCounter.fetch_add(4);
        if (blockStart >= numParticles)
            break;
        fvec4 bornForce(0.0f);

--- a/platforms/cpu/src/CpuGayBerneForce.cpp
+++ b/platforms/cpu/src/CpuGayBerneForce.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2016-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2016-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -37,7 +37,6 @@
 #include "ReferenceForce.h"
 #include "openmm/OpenMMException.h"
 #include "openmm/GayBerneForce.h"
-#include "openmm/internal/gmx_atomic.h"
 #include <algorithm>
 #include <cmath>

@@ -120,9 +119,7 @@ double CpuGayBerneForce::calculateForce(const vector<Vec3>& positions, std::vect
    this->boxVectors = boxVectors;
    threadEnergy.resize(numThreads);
    threadTorque.resize(numThreads);
-    gmx_atomic_t counter;
-    gmx_atomic_set(&counter, 0);
-    this->atomicCounter = &counter;
+    atomicCounter = 0;
    
    // Signal the threads to compute the pairwise interactions.
    
@@ -131,7 +128,7 @@ double CpuGayBerneForce::calculateForce(const vector<Vec3>& positions, std::vect
    
    // Signal the threads to compute exceptions.
    
-    gmx_atomic_set(&counter, 0);
+    atomicCounter = 0;
    threads.resumeThreads();
    threads.waitForThreads();
    
@@ -162,7 +159,7 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
    
    if (neighborList == NULL) {
        while (true) {
-            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int i = atomicCounter++;
            if (i >= numParticles)
                break;
            if (particles[i].sqrtEpsilon == 0.0f)
@@ -180,7 +177,7 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
    }
    else {
        while (true) {
-            int blockIndex = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int blockIndex = atomicCounter++;
            if (blockIndex >= neighborList->getNumBlocks())
                break;
            const int blockSize = neighborList->getBlockSize();
@@ -211,7 +208,7 @@ void CpuGayBerneForce::threadComputeForce(ThreadPool& threads, int threadIndex,
    int numExceptions = exceptions.size();
    const int groupSize = max(1, numExceptions/(10*numThreads));
    while (true) {
-        int start = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), groupSize);
+        int start = atomicCounter.fetch_add(groupSize);
        if (start >= numExceptions)
            break;
        int end = min(start+groupSize, numExceptions);

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -476,7 +476,7 @@ void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float

    // Signal the threads to start running and wait for them to finish.
    
-    gmx_atomic_set(&atomicCounter, 0);
+    atomicCounter = 0;
    threads.resumeThreads();
    threads.waitForThreads();
    
@@ -538,7 +538,7 @@ void CpuNeighborList::threadComputeNeighborList(ThreadPool& threads, int threadI
    vector<float> blockAtomX(blockSize), blockAtomY(blockSize), blockAtomZ(blockSize);
    vector<VoxelIndex> atomVoxelIndex;
    while (true) {
-        int i = gmx_atomic_fetch_add(&atomicCounter, 1);
+        int i = atomicCounter++;
        if (i >= numBlocks)
            break;


--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp

-/* Portions copyright (c) 2006-2017 Stanford University and Simbios.
+/* Portions copyright (c) 2006-2018 Stanford University and Simbios.
 * Contributors: Pande Group
 *
 * Permission is hereby granted, free of charge, to any person obtaining
@@ -28,7 +28,6 @@
 #include "CpuNonbondedForce.h"
 #include "ReferenceForce.h"
 #include "ReferencePME.h"
-#include "openmm/internal/gmx_atomic.h"
 #include <algorithm>
 #include <iostream>

@@ -389,9 +388,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    this->threadForce = &threadForce;
    includeEnergy = (totalEnergy != NULL);
    threadEnergy.resize(threads.getNumThreads());
-    gmx_atomic_t counter;
-    gmx_atomic_set(&counter, 0);
-    this->atomicCounter = &counter;
+    atomicCounter = 0;
    
    // Signal the threads to start running and wait for them to finish.
    
@@ -401,7 +398,7 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    // Signal the threads to subtract the exclusions.
    
    if (ewald || pme) {
-        gmx_atomic_set(&counter, 0);
+        atomicCounter = 0;
        threads.resumeThreads();
        threads.waitForThreads();
    }
@@ -429,7 +426,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
    if (ewald || pme || ljpme) {
        // Compute the interactions from the neighbor list.
        while (true) {
-            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int nextBlock = atomicCounter++;
            if (nextBlock >= neighborList->getNumBlocks())
                break;
            calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
@@ -440,7 +437,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
        threads.syncThreads();
        const int groupSize = max(1, numberOfAtoms/(10*numThreads));
        while (true) {
-            int start = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), groupSize);
+            int start = atomicCounter.fetch_add(groupSize);
            if (start >= numberOfAtoms)
                break;
            int end = min(start+groupSize, numberOfAtoms);
@@ -490,7 +487,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
        // Compute the interactions from the neighbor list.

        while (true) {
-            int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int nextBlock = atomicCounter++;
            if (nextBlock >= neighborList->getNumBlocks())
                break;
            calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
@@ -500,7 +497,7 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
        // Loop over all atom pairs

        while (true) {
-            int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
+            int i = atomicCounter++;
            if (i >= numberOfAtoms)
                break;
            for (int j = i+1; j < numberOfAtoms; j++)

--- a/platforms/cpu/src/CpuSETTLE.cpp
+++ b/platforms/cpu/src/CpuSETTLE.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -30,7 +30,7 @@
 * -------------------------------------------------------------------------- */

 #include "CpuSETTLE.h"
-#include "openmm/internal/gmx_atomic.h"
+#include <atomic>

 using namespace OpenMM;
 using namespace std;
@@ -61,11 +61,11 @@ CpuSETTLE::~CpuSETTLE() {
 }

 void CpuSETTLE::apply(vector<OpenMM::Vec3>& atomCoordinates, vector<OpenMM::Vec3>& atomCoordinatesP, vector<double>& inverseMasses, double tolerance) {
-    gmx_atomic_t atomicCounter;
-    gmx_atomic_set(&atomicCounter, 0);
+    atomic<int> atomicCounter;
+    atomicCounter = 0;
    threads.execute([&] (ThreadPool& threads, int threadIndex) {
        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            int index = atomicCounter++;
            if (index >= threadSettle.size())
                break;
            threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
@@ -75,11 +75,11 @@ void CpuSETTLE::apply(vector<OpenMM::Vec3>& atomCoordinates, vector<OpenMM::Vec3
 }

 void CpuSETTLE::applyToVelocities(vector<OpenMM::Vec3>& atomCoordinates, vector<OpenMM::Vec3>& velocities, vector<double>& inverseMasses, double tolerance) {
-    gmx_atomic_t atomicCounter;
-    gmx_atomic_set(&atomicCounter, 0);
+    atomic<int> atomicCounter;
+    atomicCounter = 0;
    threads.execute([&] (ThreadPool& threads, int threadIndex) {
        while (true) {
-            int index = gmx_atomic_fetch_add(&atomicCounter, 1);
+            int index = atomicCounter++;
            if (index >= threadSettle.size())
                break;
            threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);

--- a/plugins/cpupme/src/CpuPmeKernels.cpp
+++ b/plugins/cpupme/src/CpuPmeKernels.cpp
@@ -6,7 +6,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -52,7 +52,7 @@ bool CpuCalcDispersionPmeReciprocalForceKernel::hasInitializedThreads = false;
 int CpuCalcDispersionPmeReciprocalForceKernel::numThreads = 0;

 static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors,
-        gmx_atomic_t& atomicCounter, const float epsilonFactor, int threadIndex, int numThreads, bool deterministic) {
+        atomic<int>& atomicCounter, const float epsilonFactor, int threadIndex, int numThreads, bool deterministic) {
    float temp[4];
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
@@ -69,7 +69,7 @@ static void spreadCharge(float* posq, float* grid, int gridx, int gridy, int gri
    int i = threadIndex;
    while (true) {
        if (!deterministic)
-            i = gmx_atomic_fetch_add(&atomicCounter, 1);
+            i = atomicCounter++;
        if (i >= numParticles)
            break;

@@ -310,7 +310,7 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, vecto
    }
 }

-static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, gmx_atomic_t& atomicCounter, const float epsilonFactor) {
+static void interpolateForces(float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3* periodicBoxVectors, Vec3* recipBoxVectors, atomic<int>& atomicCounter, const float epsilonFactor) {
    fvec4 boxSize((float) periodicBoxVectors[0][0], (float) periodicBoxVectors[1][1], (float) periodicBoxVectors[2][2], 0);
    fvec4 invBoxSize((float) recipBoxVectors[0][0], (float) recipBoxVectors[1][1], (float) recipBoxVectors[2][2], 0);
    fvec4 recipBoxVec0((float) recipBoxVectors[0][0], (float) recipBoxVectors[0][1], (float) recipBoxVectors[0][2], 0);
@@ -321,7 +321,7 @@ static void interpolateForces(float* posq, float* force, float* grid, int gridx,
    fvec4 one(1);
    fvec4 scale(1.0f/(PME_ORDER-1));
    while (true) {
-        int i = gmx_atomic_fetch_add(&atomicCounter, 1);
+        int i = atomicCounter++;
        if (i >= numParticles)
            break;

@@ -545,7 +545,7 @@ void CpuCalcPmeReciprocalForceKernel::runMainThread() {
        if (isDeleted)
            break;
        posq = io->getPosq();
-        gmx_atomic_set(&atomicCounter, 0);
+        atomicCounter = 0;
        threads.execute([&] (ThreadPool& threads, int threadIndex) { runWorkerThread(threads, threadIndex); }); // Signal threads to perform charge spreading.
        threads.waitForThreads();
        threads.resumeThreads(); // Signal threads to sum the charge grids.
@@ -564,7 +564,7 @@ void CpuCalcPmeReciprocalForceKernel::runMainThread() {
        threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
        threads.waitForThreads();
        fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
-        gmx_atomic_set(&atomicCounter, 0);
+        atomicCounter = 0;
        threads.resumeThreads(); // Signal threads to interpolate forces.
        threads.waitForThreads();
        isFinished = true;
@@ -837,7 +837,7 @@ void CpuCalcDispersionPmeReciprocalForceKernel::runMainThread() {
            break;
        posq = io->getPosq();
        ComputeTask task(*this);
-        gmx_atomic_set(&atomicCounter, 0);
+        atomicCounter = 0;
        threads.execute(task); // Signal threads to perform charge spreading.
        threads.waitForThreads();
        threads.resumeThreads(); // Signal threads to sum the charge grids.
@@ -856,7 +856,7 @@ void CpuCalcDispersionPmeReciprocalForceKernel::runMainThread() {
        threads.resumeThreads(); // Signal threads to perform reciprocal convolution.
        threads.waitForThreads();
        fftwf_execute_dft_c2r(backwardFFT, complexGrid, realGrid);
-        gmx_atomic_set(&atomicCounter, 0);
+        atomicCounter = 0;
        threads.resumeThreads(); // Signal threads to interpolate forces.
        threads.waitForThreads();
        isFinished = true;

--- a/plugins/cpupme/src/CpuPmeKernels.h
+++ b/plugins/cpupme/src/CpuPmeKernels.h
@@ -9,7 +9,7 @@
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
- * Portions copyright (c) 2013-2017 Stanford University and the Authors.      *
+ * Portions copyright (c) 2013-2018 Stanford University and the Authors.      *
 * Authors: Peter Eastman                                                     *
 * Contributors:                                                              *
 *                                                                            *
@@ -36,8 +36,8 @@
 #include "internal/windowsExportPme.h"
 #include "openmm/kernels.h"
 #include "openmm/Vec3.h"
-#include "openmm/internal/gmx_atomic.h"
 #include "openmm/internal/ThreadPool.h"
+#include <atomic>
 #include <fftw3.h>
 #include <pthread.h>
 #include <vector>
@@ -132,7 +132,7 @@ private:
    float* posq;
    Vec3 periodicBoxVectors[3], recipBoxVectors[3];
    bool includeEnergy;
-    gmx_atomic_t atomicCounter;
+    std::atomic<int> atomicCounter;
 };


@@ -226,7 +226,7 @@ private:
    float* posq;
    Vec3 periodicBoxVectors[3], recipBoxVectors[3];
    bool includeEnergy;
-    gmx_atomic_t atomicCounter;
+    std::atomic<int> atomicCounter;
 };

 } // namespace OpenMM