Final HIP Platform implementation for AMD GPUs on ROCm (#3338)

* Support kernel files with extensions of any length (like .hip) * Do not allow to replace symbols in single-line comments * Add OPENMM_BUILD_COMMON CMake option It allows to build and install common platform files even if CUDA or OpenCL platforms are not built. This is required for HIP platform (openmm-hip) if ROCm OpenCL packages are not installed. * Add an option for Python wrapper to install into user packages OPENMM_PYTHON_USER_INSTALL is OFF be default. * Support FFT backends in Amoeba plugin The HIP platform supports FFT backends, this commit moves findLegalFFTDimension to ComputeContext, so platforms can have their own implementations. * Compatibility for common platform w/ new HIP platform * Do not use volatile with private and local AtomData parameters on HIP The generated code is not optimal, for example, the compiler generates flat_load instructions instead of ds_read. * Tune launch bounds for PME grid-related kernels and add WA for RDNA Force the compiler to use all registers for gridSpreadCharge and gridInterpolateForce by limiting max waves per EU to 1 on CDNA GPUs, RDNA GPUs work better without it. * Optimize atom data structs in GBSA and Amoeba on HIP Manually rearrange fields, add paddings and force alignments to have faster accesses to shared memory: ds_read and ds_write may work slower if addresses are not aligned by 16 bytes. Co-authored-by: Anton Gorenko <anton@streamhpc.com> Co-authored-by: Nick Curtis <nicholas.curtis@amd.com>

Final HIP Platform implementation for AMD GPUs on ROCm (#3338)
* Support kernel files with extensions of any length (like .hip) * Do not allow to replace symbols in single-line comments * Add OPENMM_BUILD_COMMON CMake option It allows to build and install common platform files even if CUDA or OpenCL platforms are not built. This is required for HIP platform (openmm-hip) if ROCm OpenCL packages are not installed. * Add an option for Python wrapper to install into user packages OPENMM_PYTHON_USER_INSTALL is OFF be default. * Support FFT backends in Amoeba plugin The HIP platform supports FFT backends, this commit moves findLegalFFTDimension to ComputeContext, so platforms can have their own implementations. * Compatibility for common platform w/ new HIP platform * Do not use volatile with private and local AtomData parameters on HIP The generated code is not optimal, for example, the compiler generates flat_load instructions instead of ds_read. * Tune launch bounds for PME grid-related kernels and add WA for RDNA Force the compiler to use all registers for gridSpreadCharge and gridInterpolateForce by limiting max waves per EU to 1 on CDNA GPUs, RDNA GPUs work better without it. * Optimize atom data structs in GBSA and Amoeba on HIP Manually rearrange fields, add paddings and force alignments to have faster accesses to shared memory: ds_read and ds_write may work slower if addresses are not aligned by 16 bytes. Co-authored-by: Anton Gorenko <anton@streamhpc.com> Co-authored-by: Nick Curtis <nicholas.curtis@amd.com>
a39fa14a · Adel Johar · GitHub · 8d9a656d · a39fa14a · a39fa14a
Unverified Commit a39fa14a authored Jul 22, 2022 by Adel Johar Committed by GitHub Jul 22, 2022
18 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -352,7 +352,9 @@ ENDIF(OPENMM_BUILD_OPENCL_LIB)
 # Common compute files
-IF(CUDAToolkit_FOUND OR OPENCL_FOUND)
+SET(OPENMM_BUILD_COMMON OFF CACHE BOOL "Build common files even if CUDA or OpenCL platforms are not built")
+IF(OPENMM_BUILD_CUDA_LIB OR OPENMM_BUILD_OPENCL_LIB OR OPENMM_BUILD_COMMON)
    ADD_SUBDIRECTORY(platforms/common)
 ENDIF()

--- a/cmake_modules/EncodeKernelFiles.cmake
+++ b/cmake_modules/EncodeKernelFiles.cmake
 FILE(GLOB KERNEL_FILES ${KERNEL_SOURCE_DIR}/kernels/*.${KERNEL_FILE_EXTENSION})
 SET(KERNEL_FILE_DECLARATIONS)
 CONFIGURE_FILE(${KERNEL_SOURCE_DIR}/${KERNEL_SOURCE_CLASS}.cpp.in ${KERNELS_CPP})
+# Determine file extension length
+STRING(LENGTH ${KERNEL_FILE_EXTENSION} extension_length)
+# add one space for the dot
+MATH(EXPR extension_length ${extension_length}+1)
 FOREACH(file ${KERNEL_FILES})
    # Load the file contents and process it.
    FILE(STRINGS ${file} file_content NEWLINE_CONSUME)
@@ -16,7 +20,7 @@ FOREACH(file ${KERNEL_FILES})
    # Determine a name for the variable that will contain this file's contents
    FILE(RELATIVE_PATH filename ${KERNEL_SOURCE_DIR}/kernels ${file})
    STRING(LENGTH ${filename} filename_length)
-    MATH(EXPR filename_length ${filename_length}-3)
+    MATH(EXPR filename_length ${filename_length}-${extension_length})
    STRING(SUBSTRING ${filename} 0 ${filename_length} variable_name)
    # Record the variable declaration and definition.

--- a/openmmapi/src/LocalEnergyMinimizer.cpp
+++ b/openmmapi/src/LocalEnergyMinimizer.cpp
@@ -51,7 +51,7 @@ struct MinimizerData {
    Context* cpuContext;
    MinimizerData(Context& context, double k) : context(context), k(k), cpuIntegrator(1.0), cpuContext(NULL) {
        string platformName = context.getPlatform().getName();
-        checkLargeForces = (platformName == "CUDA" || platformName == "OpenCL");
+        checkLargeForces = (platformName == "CUDA" || platformName == "OpenCL" || platformName == "HIP");
    }
    ~MinimizerData() {
        if (cpuContext != NULL)
@@ -110,7 +110,7 @@ static lbfgsfloatval_t evaluate(void *instance, const lbfgsfloatval_t *x, lbfgsf
        positions[i] = Vec3(x[3*i], x[3*i+1], x[3*i+2]);
    double energy = computeForcesAndEnergy(context, positions, g);
    if (data->checkLargeForces) {
-        // The CUDA and OpenCL platforms accumulate forces in fixed point, so they
+        // The CUDA, OpenCL and HIP platforms accumulate forces in fixed point, so they
        // can't handle very large forces.  Check for problematic forces (very large,
        // infinite, or NaN) and if necessary recompute them on the CPU.

--- a/platforms/common/include/openmm/common/ComputeContext.h
+++ b/platforms/common/include/openmm/common/ComputeContext.h
@@ -439,6 +439,10 @@ public:
     * when it is no longer needed.
     */
    virtual NonbondedUtilities* createNonbondedUtilities() = 0;
+    /**
+     * Get the smallest legal size for a dimension of the grid.
+     */
+    virtual int findLegalFFTDimension(int minimum);
    /**
     * This should be called by the Integrator from its own initialize() method.
     * It ensures all contexts are fully initialized.

--- a/platforms/common/src/ComputeContext.cpp
+++ b/platforms/common/src/ComputeContext.cpp
@@ -78,6 +78,19 @@ string ComputeContext::replaceStrings(const string& input, const std::map<std::s
                if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
                    // We have found a complete symbol, not part of a longer symbol.
+                    // Do not allow to replace a symbol contained in single-line comments with a multi-line content
+                    // because only the first line will be commented
+                    // (the check is used to prevent incorrect commenting during development).
+                    if (pair.second.find('\n') != pair.second.npos) {
+                        int prevIndex = index;
+                        while (prevIndex > 1 && result[prevIndex] != '\n') {
+                            if (result[prevIndex] == '/' && result[prevIndex - 1] == '/') {
+                                throw OpenMMException("Symbol " + pair.first + " is contained in a single-line comment");
+                            }
+                            prevIndex--;
+                        }
+                    }
                    result.replace(index, size, pair.second);
                    index += pair.second.size();
                }
@@ -627,6 +640,23 @@ void ComputeContext::addPostComputation(ForcePostComputation* computation) {
    postComputations.push_back(computation);
 }
+int ComputeContext::findLegalFFTDimension(int minimum) {
+    if (minimum < 1)
+        return 1;
+    while (true) {
+        // Attempt to factor the current value.
+        int unfactored = minimum;
+        for (int factor = 2; factor < 8; factor++) {
+            while (unfactored > 1 && unfactored%factor == 0)
+                unfactored /= factor;
+        }
+        if (unfactored == 1)
+            return minimum;
+        minimum++;
+    }
+}
 struct ComputeContext::WorkThread::ThreadData {
    ThreadData(std::queue<ComputeContext::WorkTask*>& tasks, bool& waiting,  bool& finished, bool& threwException, OpenMMException& stashedException,
            pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :

--- a/platforms/common/src/kernels/customManyParticle.cc
+++ b/platforms/common/src/kernels/customManyParticle.cc
@@ -152,7 +152,7 @@ KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4
        ) {
    LOCAL real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
    int indexInWarp = LOCAL_ID%32;
-#ifndef __CUDA_ARCH__
+#if !(defined(__CUDA_ARCH__) || defined(USE_HIP))
    LOCAL bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE];
    int warpStart = LOCAL_ID-indexInWarp;
 #endif
@@ -191,7 +191,7 @@ KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4
            // Loop over any blocks we identified as potentially containing neighbors.
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(USE_HIP)
            int includeBlockFlags = BALLOT(includeBlock2);
            while (includeBlockFlags != 0) {
                int i = __ffs(includeBlockFlags)-1;

--- a/platforms/common/src/kernels/customNonbondedGroups.cc
+++ b/platforms/common/src/kernels/customNonbondedGroups.cc
@@ -18,6 +18,10 @@ DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
    for (int mask = 16; mask > 0; mask /= 2)
        val = max(val, __shfl_xor_sync(0xffffffff, val, mask));
    return val;
+#elif defined(USE_HIP)
+    for (int mask = 16; mask > 0; mask /= 2)
+        val = max(val, __shfl_xor(val, mask, 32));
+    return val;
 #else
    int indexInWarp = LOCAL_ID%32;
    temp[LOCAL_ID] = val;

--- a/platforms/common/src/kernels/gbsaObc.cc
+++ b/platforms/common/src/kernels/gbsaObc.cc
 #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
-typedef struct {
+#if defined(USE_HIP)
+    #define ALIGN alignas(16)
+#else
+    #define ALIGN
+#endif
+typedef struct ALIGN {
    real x, y, z;
    real q;
    float radius, scaledRadius;
@@ -367,7 +373,7 @@ KERNEL void computeBornSum(
    }
 }
-typedef struct {
+typedef struct ALIGN {
    real x, y, z;
    real q;
    real fx, fy, fz, fw;

--- a/platforms/common/src/kernels/pme.cc
+++ b/platforms/common/src/kernels/pme.cc
@@ -61,6 +61,9 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE
 #ifdef SUPPORTS_64_BIT_ATOMICS
 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#if defined(USE_HIP) && !defined(AMD_RDNA)
+LAUNCH_BOUNDS_EXACT(128, 1)
+#endif
 KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
 #ifdef USE_FIXED_POINT_CHARGE_SPREADING
        GLOBAL mm_ulong* RESTRICT pmeGrid,
@@ -75,6 +78,10 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
        GLOBAL const real* RESTRICT charges
 #endif
        ) {
+// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
+#if defined(USE_HIP)
+    (void)GLOBAL_ID;
+#endif
    // To improve memory efficiency, we divide indices along the z axis into
    // PME_ORDER blocks, where the data for each block is stored together.  We
    // can ensure that all threads write to the same block at the same time,
@@ -84,7 +91,7 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
    int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
    for (int i = LOCAL_ID; i < GRID_SIZE_Z+PME_ORDER; i += LOCAL_SIZE) {
        int zindex = i % GRID_SIZE_Z;
-	int block = zindex % PME_ORDER;
+        int block = zindex % PME_ORDER;
        zindexTable[i] = zindex/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
    }
    SYNC_THREADS;
@@ -137,7 +144,7 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
        // Spread the charge from this atom onto each grid point.
-	int izoffset = (PME_ORDER-(gridIndex.z%PME_ORDER)) % PME_ORDER;
+        int izoffset = (PME_ORDER-(gridIndex.z%PME_ORDER)) % PME_ORDER;
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xbase = gridIndex.x+ix;
            xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
@@ -149,7 +156,7 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
                ybase = (xbase+ybase)*blockSize;
                real dxdy = dx*data[iy].y;
                for (int i = 0; i < PME_ORDER; i++) {
-		    int iz = (i+izoffset) % PME_ORDER;
+                    int iz = (i+izoffset) % PME_ORDER;
                    int zindex = gridIndex.z+iz;
                    int index = ybase + zindexTable[zindex];
                    real add = dxdy*data[iz].z;
@@ -171,6 +178,10 @@ KERNEL void finishSpreadCharge(
        GLOBAL const real* RESTRICT grid1,
 #endif
        GLOBAL real* RESTRICT grid2) {
+// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
+#if defined(USE_HIP)
+    (void)GLOBAL_ID;
+#endif
    // During charge spreading, we shuffled the order of indices along the z
    // axis to make memory access more efficient.  We now need to unshuffle
    // them.  If the values were accumulated as fixed point, we also need to
@@ -179,7 +190,7 @@ KERNEL void finishSpreadCharge(
    LOCAL int zindexTable[GRID_SIZE_Z];
    int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
    for (int i = LOCAL_ID; i < GRID_SIZE_Z; i += LOCAL_SIZE) {
-	int block = i % PME_ORDER;
+        int block = i % PME_ORDER;
        zindexTable[i] = i/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
    }
    SYNC_THREADS;
@@ -503,6 +514,9 @@ KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RES
 #endif
 }
+#if defined(USE_HIP) && !defined(AMD_RDNA) && !defined(USE_DOUBLE_PRECISION)
+LAUNCH_BOUNDS_EXACT(128, 1)
+#endif
 KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const real* RESTRICT pmeGrid,
        real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex,

--- a/plugins/amoeba/platforms/common/src/AmoebaCommonKernels.cpp
+++ b/plugins/amoeba/platforms/common/src/AmoebaCommonKernels.cpp
@@ -50,23 +50,6 @@
 using namespace OpenMM;
 using namespace std;
-static int findLegalFFTDimension(int minimum) {
-    if (minimum < 1)
-        return 1;
-    while (true) {
-        // Attempt to factor the current value.
-        int unfactored = minimum;
-        for (int factor = 2; factor < 8; factor++) {
-            while (unfactored > 1 && unfactored%factor == 0)
-                unfactored /= factor;
-        }
-        if (unfactored == 1)
-            return minimum;
-        minimum++;
-    }
-}
 static void setPeriodicBoxArgs(ComputeContext& cc, ComputeKernel kernel, int index) {
    Vec3 a, b, c;
    cc.getPeriodicBoxVectors(a, b, c);
@@ -456,13 +439,13 @@ void CommonCalcAmoebaMultipoleForceKernel::initialize(const System& system, cons
            nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
            nb.setCutoffDistance(force.getCutoffDistance());
            NonbondedForceImpl::calcPMEParameters(system, nb, pmeAlpha, gridSizeX, gridSizeY, gridSizeZ, false);
-            gridSizeX = findLegalFFTDimension(gridSizeX);
+            gridSizeX = cc.findLegalFFTDimension(gridSizeX);
-            gridSizeY = findLegalFFTDimension(gridSizeY);
+            gridSizeY = cc.findLegalFFTDimension(gridSizeY);
-            gridSizeZ = findLegalFFTDimension(gridSizeZ);
+            gridSizeZ = cc.findLegalFFTDimension(gridSizeZ);
        } else {
-            gridSizeX = findLegalFFTDimension(nx);
+            gridSizeX = cc.findLegalFFTDimension(nx);
-            gridSizeY = findLegalFFTDimension(ny);
+            gridSizeY = cc.findLegalFFTDimension(ny);
-            gridSizeZ = findLegalFFTDimension(nz);
+            gridSizeZ = cc.findLegalFFTDimension(nz);
        }
        defines["EWALD_ALPHA"] = cc.doubleToString(pmeAlpha);
        defines["SQRT_PI"] = cc.doubleToString(sqrt(M_PI));
@@ -2550,13 +2533,13 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
            nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
            nb.setCutoffDistance(force.getCutoffDistance());
            NonbondedForceImpl::calcPMEParameters(system, nb, pmeAlpha, gridSizeX, gridSizeY, gridSizeZ, false);
-            gridSizeX = findLegalFFTDimension(gridSizeX);
+            gridSizeX = cc.findLegalFFTDimension(gridSizeX);
-            gridSizeY = findLegalFFTDimension(gridSizeY);
+            gridSizeY = cc.findLegalFFTDimension(gridSizeY);
-            gridSizeZ = findLegalFFTDimension(gridSizeZ);
+            gridSizeZ = cc.findLegalFFTDimension(gridSizeZ);
        } else {
-            gridSizeX = findLegalFFTDimension(nx);
+            gridSizeX = cc.findLegalFFTDimension(nx);
-            gridSizeY = findLegalFFTDimension(ny);
+            gridSizeY = cc.findLegalFFTDimension(ny);
-            gridSizeZ = findLegalFFTDimension(nz);
+            gridSizeZ = cc.findLegalFFTDimension(nz);
        }
        force.getDPMEParameters(dpmeAlpha, nx, ny, nz);
        if (nx == 0 || dpmeAlpha == 0) {
@@ -2564,13 +2547,13 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
            nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
            nb.setCutoffDistance(force.getCutoffDistance());
            NonbondedForceImpl::calcPMEParameters(system, nb, dpmeAlpha, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
-            dispersionGridSizeX = findLegalFFTDimension(dispersionGridSizeX);
+            dispersionGridSizeX = cc.findLegalFFTDimension(dispersionGridSizeX);
-            dispersionGridSizeY = findLegalFFTDimension(dispersionGridSizeY);
+            dispersionGridSizeY = cc.findLegalFFTDimension(dispersionGridSizeY);
-            dispersionGridSizeZ = findLegalFFTDimension(dispersionGridSizeZ);
+            dispersionGridSizeZ = cc.findLegalFFTDimension(dispersionGridSizeZ);
        } else {
-            dispersionGridSizeX = findLegalFFTDimension(nx);
+            dispersionGridSizeX = cc.findLegalFFTDimension(nx);
-            dispersionGridSizeY = findLegalFFTDimension(ny);
+            dispersionGridSizeY = cc.findLegalFFTDimension(ny);
-            dispersionGridSizeZ = findLegalFFTDimension(nz);
+            dispersionGridSizeZ = cc.findLegalFFTDimension(nz);
        }
        defines["EWALD_ALPHA"] = cc.doubleToString(pmeAlpha);
        defines["SQRT_PI"] = cc.doubleToString(sqrt(M_PI));

--- a/plugins/amoeba/platforms/common/src/kernels/amoebaGk.cc
+++ b/plugins/amoeba/platforms/common/src/kernels/amoebaGk.cc
@@ -190,11 +190,17 @@ typedef struct {
    real q, bornRadius, bornForce;
 } AtomData2;
-DEVICE void computeOneInteractionF1(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force);
+#if defined(USE_HIP)
-DEVICE void computeOneInteractionF2(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force);
+#define ATOM2_ARG_SPEC
-DEVICE void computeOneInteractionT1(AtomData2 atom1, volatile AtomData2 atom2, real3* torque);
+#else
-DEVICE void computeOneInteractionT2(AtomData2 atom1, volatile AtomData2 atom2, real3* torque);
+#define ATOM2_ARG_SPEC volatile
-DEVICE void computeOneInteractionB1B2(AtomData2 atom1, volatile AtomData2 atom2, real* bornForce1, real* bornForce2);
+#endif
+DEVICE void computeOneInteractionF1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force);
+DEVICE void computeOneInteractionF2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force);
+DEVICE void computeOneInteractionT1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque);
+DEVICE void computeOneInteractionT2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque);
+DEVICE void computeOneInteractionB1B2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* bornForce1, real* bornForce2);
 inline DEVICE AtomData2 loadAtomData2(int atom, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT labFrameDipole,
        GLOBAL const real* RESTRICT labFrameQuadrupole, GLOBAL const real* RESTRICT inducedDipole, GLOBAL const real* RESTRICT inducedDipolePolar, GLOBAL const real* RESTRICT bornRadius) {
@@ -585,16 +591,38 @@ KERNEL void computeChainRuleForce(
    } while (pos < end);
 }
-typedef struct {
+#if defined(USE_HIP)
-    real3 pos, force, dipole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS;
+    #define ALIGN alignas(16)
-    real q, quadrupoleXX, quadrupoleXY, quadrupoleXZ;
+#else
+    #define ALIGN
+#endif
+typedef struct ALIGN {
+    real3 pos;
+    real q;
+    real3 dipole;
+#if defined(USE_HIP)
+    real padding0;
+#endif
+    real3 inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS;
+    real quadrupoleXX, quadrupoleXY, quadrupoleXZ;
    real quadrupoleYY, quadrupoleYZ, quadrupoleZZ;
+    real3 force;
    float thole, damp;
+#if defined(USE_HIP) && !defined(USE_DOUBLE_PRECISION)
+    real padding1[2]; // Prevent bank conflicts because the aligned size is 128
+#endif
 } AtomData4;
-DEVICE void computeOneEDiffInteractionF1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce);
+#if defined(USE_HIP)
-DEVICE void computeOneEDiffInteractionT1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce);
+#define ATOM2_PTR_ARG_SPEC const
-DEVICE void computeOneEDiffInteractionT3(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce);
+#else
+#define ATOM2_PTR_ARG_SPEC volatile
+#endif
+DEVICE void computeOneEDiffInteractionF1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce);
+DEVICE void computeOneEDiffInteractionT1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce);
+DEVICE void computeOneEDiffInteractionT3(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce);
 inline DEVICE AtomData4 loadAtomData4(int atom, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT labFrameDipole,
        GLOBAL const real* RESTRICT labFrameQuadrupole, GLOBAL const real* RESTRICT inducedDipole, GLOBAL const real* RESTRICT inducedDipolePolar,

--- a/plugins/amoeba/platforms/common/src/kernels/gkEDiffPairForce.cc
+++ b/plugins/amoeba/platforms/common/src/kernels/gkEDiffPairForce.cc
 #if defined F1
-DEVICE void computeOneEDiffInteractionF1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce) {
+DEVICE void computeOneEDiffInteractionF1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce) {
 #elif defined T1
-DEVICE void computeOneEDiffInteractionT1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce) {
+DEVICE void computeOneEDiffInteractionT1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce) {
 #elif defined T3
-DEVICE void computeOneEDiffInteractionT3(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce) {
+DEVICE void computeOneEDiffInteractionT3(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce) {
 #endif
    const float uscale = 1;

--- a/plugins/amoeba/platforms/common/src/kernels/gkPairForce1.cc
+++ b/plugins/amoeba/platforms/common/src/kernels/gkPairForce1.cc
@@ -4,15 +4,15 @@
 */
 #if defined F1
-DEVICE void computeOneInteractionF1(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force) {
+DEVICE void computeOneInteractionF1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force) {
 #elif defined F2
-DEVICE void computeOneInteractionF2(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force) {
+DEVICE void computeOneInteractionF2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force) {
 #elif defined T1
-DEVICE void computeOneInteractionT1(AtomData2 atom1, volatile AtomData2 atom2, real3* torque) {
+DEVICE void computeOneInteractionT1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque) {
 #elif defined T2
-DEVICE void computeOneInteractionT2(AtomData2 atom1, volatile AtomData2 atom2, real3* torque) {
+DEVICE void computeOneInteractionT2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque) {
 #elif defined B1 && defined B2
-DEVICE void computeOneInteractionB1B2(AtomData2 atom1, volatile AtomData2 atom2, real* bornForce1, real* bornForce2) {
+DEVICE void computeOneInteractionB1B2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* bornForce1, real* bornForce2) {
 #endif
    const real fc = EPSILON_FACTOR*GK_FC;

--- a/plugins/amoeba/platforms/common/src/kernels/hippoNonbonded.cc
+++ b/plugins/amoeba/platforms/common/src/kernels/hippoNonbonded.cc
@@ -19,6 +19,12 @@ typedef struct {
 #endif
 #ifdef ENABLE_SHUFFLE
+#if defined(USE_HIP)
+#define real_shfl SHFL
+#else
 //support for 64 bit shuffles
 static __inline__ __device__ float real_shfl(float var, int srcLane) {
    return SHFL(var, srcLane);
@@ -41,6 +47,8 @@ static __inline__ __device__ mm_long real_shfl(mm_long var, int srcLane) {
    int2 fuse; fuse.x = lo; fuse.y = hi;
    return *reinterpret_cast<mm_long*>(&fuse);
 }
+#endif
 #endif
 KERNEL void computeNonbonded(
@@ -50,7 +58,7 @@ KERNEL void computeNonbonded(
        , GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, 
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
        GLOBAL const real4* RESTRICT blockSize, GLOBAL const unsigned int* RESTRICT interactingAtoms
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(USE_HIP)
        , unsigned int maxSinglePairs, GLOBAL const int2* RESTRICT singlePairs
 #endif
 #endif
@@ -478,4 +486,4 @@ KERNEL void computeNonbonded(
 #ifdef INCLUDE_ENERGY
    energyBuffer[GLOBAL_ID] += energy;
 #endif
 }
\ No newline at end of file
--- a/plugins/amoeba/platforms/common/src/kernels/multipoleElectrostatics.cc
+++ b/plugins/amoeba/platforms/common/src/kernels/multipoleElectrostatics.cc
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
-typedef struct {
+#if defined(USE_HIP)
-    real3 pos, force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
+    #define ALIGN alignas(16)
+#else
+    #define ALIGN
+#endif
+typedef struct ALIGN {
+    real3 pos;
    real q;
+    real3 force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
    float thole, damp;
 #ifdef INCLUDE_QUADRUPOLES
    real sphericalQuadrupole[5];

--- a/plugins/amoeba/platforms/common/src/kernels/multipoleInducedField.cc
+++ b/plugins/amoeba/platforms/common/src/kernels/multipoleInducedField.cc
 #ifndef HIPPO
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
-typedef struct {
+#if defined(USE_HIP)
+    #define ALIGN alignas(16)
+#else
+    #define ALIGN
+#endif
+typedef struct ALIGN {
    real3 pos;
+#if defined(USE_HIP)
+    real padding0;
+#endif
    real3 field, fieldPolar, inducedDipole, inducedDipolePolar;
 #ifdef EXTRAPOLATED_POLARIZATION
    real fieldGradient[6], fieldGradientPolar[6];
 #endif
 #ifdef USE_GK
    real3 fieldS, fieldPolarS, inducedDipoleS, inducedDipolePolarS;
-    real bornRadius;
    #ifdef EXTRAPOLATED_POLARIZATION
        real fieldGradientS[6], fieldGradientPolarS[6];
    #endif
+    real bornRadius;
+    #if defined(USE_HIP) && !defined(USE_DOUBLE_PRECISION)
+        real padding1[3]; // Prevent bank conflicts because the aligned size is 128
+    #endif
 #endif
    float thole, damp;
 } AtomData;

--- a/plugins/amoeba/platforms/common/src/kernels/pmeMultipoleElectrostatics.cc
+++ b/plugins/amoeba/platforms/common/src/kernels/pmeMultipoleElectrostatics.cc
 #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
-typedef struct {
+#if defined(USE_HIP)
-    real3 pos, force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
+    #define ALIGN alignas(16)
+#else
+    #define ALIGN
+#endif
+typedef struct ALIGN {
+    real3 pos;
    real q;
+    real3 force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
    float thole, damp;
 #ifdef INCLUDE_QUADRUPOLES
    real sphericalQuadrupole[5];

--- a/wrappers/python/CMakeLists.txt
+++ b/wrappers/python/CMakeLists.txt
@@ -277,8 +277,17 @@ add_custom_target(PythonSdist
    COMMENT "Packaging source distribution package (sdist)..."
 )
-# Install binary module (to system location)
+set(OPENMM_PYTHON_USER_INSTALL OFF CACHE BOOL
-set(PYTHON_SETUP_COMMAND "install --root=\$ENV{DESTDIR}/")
+    "Whether to install OpenMM Python binary module into the user site-packages directory")
+mark_as_advanced(OPENMM_PYTHON_USER_INSTALL)
+if(OPENMM_PYTHON_USER_INSTALL)
+    # Install binary module to user location
+    set(PYTHON_SETUP_COMMAND "install --user")
+else()
+    # Install binary module to system location
+    set(PYTHON_SETUP_COMMAND "install --root=\$ENV{DESTDIR}/")
+endif()
 configure_file(pysetup.cmake.in
    "${CMAKE_CURRENT_BINARY_DIR}/pysetupinstall.cmake" @ONLY)
 add_custom_target(PythonInstall