Unverified Commit a39fa14a authored by Adel Johar's avatar Adel Johar Committed by GitHub
Browse files

Final HIP Platform implementation for AMD GPUs on ROCm (#3338)



* Support kernel files with extensions of any length (like .hip)

* Do not allow to replace symbols in single-line comments

* Add OPENMM_BUILD_COMMON CMake option

It allows to build and install common platform files even if
CUDA or OpenCL platforms are not built.
This is required for HIP platform (openmm-hip) if ROCm OpenCL
packages are not installed.

* Add an option for Python wrapper to install into user packages

OPENMM_PYTHON_USER_INSTALL is OFF be default.

* Support FFT backends in Amoeba plugin

The HIP platform supports FFT backends, this commit moves
findLegalFFTDimension to ComputeContext, so platforms can have their own
implementations.

* Compatibility for common platform w/ new HIP platform

* Do not use volatile with private and local AtomData parameters on HIP

The generated code is not optimal, for example, the compiler generates
flat_load instructions instead of ds_read.

* Tune launch bounds for PME grid-related kernels and add WA for RDNA

Force the compiler to use all registers for gridSpreadCharge and
gridInterpolateForce by limiting max waves per EU to 1 on CDNA GPUs,
RDNA GPUs work better without it.

* Optimize atom data structs in GBSA and Amoeba on HIP

Manually rearrange fields, add paddings and force alignments to
have faster accesses to shared memory: ds_read and ds_write may
work slower if addresses are not aligned by 16 bytes.
Co-authored-by: default avatarAnton Gorenko <anton@streamhpc.com>
Co-authored-by: default avatarNick Curtis <nicholas.curtis@amd.com>
parent 8d9a656d
......@@ -352,7 +352,9 @@ ENDIF(OPENMM_BUILD_OPENCL_LIB)
# Common compute files
IF(CUDAToolkit_FOUND OR OPENCL_FOUND)
SET(OPENMM_BUILD_COMMON OFF CACHE BOOL "Build common files even if CUDA or OpenCL platforms are not built")
IF(OPENMM_BUILD_CUDA_LIB OR OPENMM_BUILD_OPENCL_LIB OR OPENMM_BUILD_COMMON)
ADD_SUBDIRECTORY(platforms/common)
ENDIF()
......
FILE(GLOB KERNEL_FILES ${KERNEL_SOURCE_DIR}/kernels/*.${KERNEL_FILE_EXTENSION})
SET(KERNEL_FILE_DECLARATIONS)
CONFIGURE_FILE(${KERNEL_SOURCE_DIR}/${KERNEL_SOURCE_CLASS}.cpp.in ${KERNELS_CPP})
# Determine file extension length
STRING(LENGTH ${KERNEL_FILE_EXTENSION} extension_length)
# add one space for the dot
MATH(EXPR extension_length ${extension_length}+1)
FOREACH(file ${KERNEL_FILES})
# Load the file contents and process it.
FILE(STRINGS ${file} file_content NEWLINE_CONSUME)
......@@ -16,7 +20,7 @@ FOREACH(file ${KERNEL_FILES})
# Determine a name for the variable that will contain this file's contents
FILE(RELATIVE_PATH filename ${KERNEL_SOURCE_DIR}/kernels ${file})
STRING(LENGTH ${filename} filename_length)
MATH(EXPR filename_length ${filename_length}-3)
MATH(EXPR filename_length ${filename_length}-${extension_length})
STRING(SUBSTRING ${filename} 0 ${filename_length} variable_name)
# Record the variable declaration and definition.
......
......@@ -51,7 +51,7 @@ struct MinimizerData {
Context* cpuContext;
MinimizerData(Context& context, double k) : context(context), k(k), cpuIntegrator(1.0), cpuContext(NULL) {
string platformName = context.getPlatform().getName();
checkLargeForces = (platformName == "CUDA" || platformName == "OpenCL");
checkLargeForces = (platformName == "CUDA" || platformName == "OpenCL" || platformName == "HIP");
}
~MinimizerData() {
if (cpuContext != NULL)
......@@ -110,7 +110,7 @@ static lbfgsfloatval_t evaluate(void *instance, const lbfgsfloatval_t *x, lbfgsf
positions[i] = Vec3(x[3*i], x[3*i+1], x[3*i+2]);
double energy = computeForcesAndEnergy(context, positions, g);
if (data->checkLargeForces) {
// The CUDA and OpenCL platforms accumulate forces in fixed point, so they
// The CUDA, OpenCL and HIP platforms accumulate forces in fixed point, so they
// can't handle very large forces. Check for problematic forces (very large,
// infinite, or NaN) and if necessary recompute them on the CPU.
......
......@@ -439,6 +439,10 @@ public:
* when it is no longer needed.
*/
virtual NonbondedUtilities* createNonbondedUtilities() = 0;
/**
* Get the smallest legal size for a dimension of the grid.
*/
virtual int findLegalFFTDimension(int minimum);
/**
* This should be called by the Integrator from its own initialize() method.
* It ensures all contexts are fully initialized.
......
......@@ -78,6 +78,19 @@ string ComputeContext::replaceStrings(const string& input, const std::map<std::s
if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
// We have found a complete symbol, not part of a longer symbol.
// Do not allow to replace a symbol contained in single-line comments with a multi-line content
// because only the first line will be commented
// (the check is used to prevent incorrect commenting during development).
if (pair.second.find('\n') != pair.second.npos) {
int prevIndex = index;
while (prevIndex > 1 && result[prevIndex] != '\n') {
if (result[prevIndex] == '/' && result[prevIndex - 1] == '/') {
throw OpenMMException("Symbol " + pair.first + " is contained in a single-line comment");
}
prevIndex--;
}
}
result.replace(index, size, pair.second);
index += pair.second.size();
}
......@@ -627,6 +640,23 @@ void ComputeContext::addPostComputation(ForcePostComputation* computation) {
postComputations.push_back(computation);
}
int ComputeContext::findLegalFFTDimension(int minimum) {
if (minimum < 1)
return 1;
while (true) {
// Attempt to factor the current value.
int unfactored = minimum;
for (int factor = 2; factor < 8; factor++) {
while (unfactored > 1 && unfactored%factor == 0)
unfactored /= factor;
}
if (unfactored == 1)
return minimum;
minimum++;
}
}
struct ComputeContext::WorkThread::ThreadData {
ThreadData(std::queue<ComputeContext::WorkTask*>& tasks, bool& waiting, bool& finished, bool& threwException, OpenMMException& stashedException,
pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :
......
......@@ -152,7 +152,7 @@ KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4
) {
LOCAL real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
int indexInWarp = LOCAL_ID%32;
#ifndef __CUDA_ARCH__
#if !(defined(__CUDA_ARCH__) || defined(USE_HIP))
LOCAL bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE];
int warpStart = LOCAL_ID-indexInWarp;
#endif
......@@ -191,7 +191,7 @@ KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4
// Loop over any blocks we identified as potentially containing neighbors.
#ifdef __CUDA_ARCH__
#if defined(__CUDA_ARCH__) || defined(USE_HIP)
int includeBlockFlags = BALLOT(includeBlock2);
while (includeBlockFlags != 0) {
int i = __ffs(includeBlockFlags)-1;
......
......@@ -18,6 +18,10 @@ DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
for (int mask = 16; mask > 0; mask /= 2)
val = max(val, __shfl_xor_sync(0xffffffff, val, mask));
return val;
#elif defined(USE_HIP)
for (int mask = 16; mask > 0; mask /= 2)
val = max(val, __shfl_xor(val, mask, 32));
return val;
#else
int indexInWarp = LOCAL_ID%32;
temp[LOCAL_ID] = val;
......
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct {
#if defined(USE_HIP)
#define ALIGN alignas(16)
#else
#define ALIGN
#endif
typedef struct ALIGN {
real x, y, z;
real q;
float radius, scaledRadius;
......@@ -367,7 +373,7 @@ KERNEL void computeBornSum(
}
}
typedef struct {
typedef struct ALIGN {
real x, y, z;
real q;
real fx, fy, fz, fw;
......
......@@ -61,6 +61,9 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#if defined(USE_HIP) && !defined(AMD_RDNA)
LAUNCH_BOUNDS_EXACT(128, 1)
#endif
KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
#ifdef USE_FIXED_POINT_CHARGE_SPREADING
GLOBAL mm_ulong* RESTRICT pmeGrid,
......@@ -75,6 +78,10 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
GLOBAL const real* RESTRICT charges
#endif
) {
// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
#if defined(USE_HIP)
(void)GLOBAL_ID;
#endif
// To improve memory efficiency, we divide indices along the z axis into
// PME_ORDER blocks, where the data for each block is stored together. We
// can ensure that all threads write to the same block at the same time,
......@@ -171,6 +178,10 @@ KERNEL void finishSpreadCharge(
GLOBAL const real* RESTRICT grid1,
#endif
GLOBAL real* RESTRICT grid2) {
// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
#if defined(USE_HIP)
(void)GLOBAL_ID;
#endif
// During charge spreading, we shuffled the order of indices along the z
// axis to make memory access more efficient. We now need to unshuffle
// them. If the values were accumulated as fixed point, we also need to
......@@ -503,6 +514,9 @@ KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RES
#endif
}
#if defined(USE_HIP) && !defined(AMD_RDNA) && !defined(USE_DOUBLE_PRECISION)
LAUNCH_BOUNDS_EXACT(128, 1)
#endif
KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const real* RESTRICT pmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex,
......
......@@ -50,23 +50,6 @@
using namespace OpenMM;
using namespace std;
static int findLegalFFTDimension(int minimum) {
if (minimum < 1)
return 1;
while (true) {
// Attempt to factor the current value.
int unfactored = minimum;
for (int factor = 2; factor < 8; factor++) {
while (unfactored > 1 && unfactored%factor == 0)
unfactored /= factor;
}
if (unfactored == 1)
return minimum;
minimum++;
}
}
static void setPeriodicBoxArgs(ComputeContext& cc, ComputeKernel kernel, int index) {
Vec3 a, b, c;
cc.getPeriodicBoxVectors(a, b, c);
......@@ -456,13 +439,13 @@ void CommonCalcAmoebaMultipoleForceKernel::initialize(const System& system, cons
nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
nb.setCutoffDistance(force.getCutoffDistance());
NonbondedForceImpl::calcPMEParameters(system, nb, pmeAlpha, gridSizeX, gridSizeY, gridSizeZ, false);
gridSizeX = findLegalFFTDimension(gridSizeX);
gridSizeY = findLegalFFTDimension(gridSizeY);
gridSizeZ = findLegalFFTDimension(gridSizeZ);
gridSizeX = cc.findLegalFFTDimension(gridSizeX);
gridSizeY = cc.findLegalFFTDimension(gridSizeY);
gridSizeZ = cc.findLegalFFTDimension(gridSizeZ);
} else {
gridSizeX = findLegalFFTDimension(nx);
gridSizeY = findLegalFFTDimension(ny);
gridSizeZ = findLegalFFTDimension(nz);
gridSizeX = cc.findLegalFFTDimension(nx);
gridSizeY = cc.findLegalFFTDimension(ny);
gridSizeZ = cc.findLegalFFTDimension(nz);
}
defines["EWALD_ALPHA"] = cc.doubleToString(pmeAlpha);
defines["SQRT_PI"] = cc.doubleToString(sqrt(M_PI));
......@@ -2550,13 +2533,13 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
nb.setCutoffDistance(force.getCutoffDistance());
NonbondedForceImpl::calcPMEParameters(system, nb, pmeAlpha, gridSizeX, gridSizeY, gridSizeZ, false);
gridSizeX = findLegalFFTDimension(gridSizeX);
gridSizeY = findLegalFFTDimension(gridSizeY);
gridSizeZ = findLegalFFTDimension(gridSizeZ);
gridSizeX = cc.findLegalFFTDimension(gridSizeX);
gridSizeY = cc.findLegalFFTDimension(gridSizeY);
gridSizeZ = cc.findLegalFFTDimension(gridSizeZ);
} else {
gridSizeX = findLegalFFTDimension(nx);
gridSizeY = findLegalFFTDimension(ny);
gridSizeZ = findLegalFFTDimension(nz);
gridSizeX = cc.findLegalFFTDimension(nx);
gridSizeY = cc.findLegalFFTDimension(ny);
gridSizeZ = cc.findLegalFFTDimension(nz);
}
force.getDPMEParameters(dpmeAlpha, nx, ny, nz);
if (nx == 0 || dpmeAlpha == 0) {
......@@ -2564,13 +2547,13 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
nb.setCutoffDistance(force.getCutoffDistance());
NonbondedForceImpl::calcPMEParameters(system, nb, dpmeAlpha, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
dispersionGridSizeX = findLegalFFTDimension(dispersionGridSizeX);
dispersionGridSizeY = findLegalFFTDimension(dispersionGridSizeY);
dispersionGridSizeZ = findLegalFFTDimension(dispersionGridSizeZ);
dispersionGridSizeX = cc.findLegalFFTDimension(dispersionGridSizeX);
dispersionGridSizeY = cc.findLegalFFTDimension(dispersionGridSizeY);
dispersionGridSizeZ = cc.findLegalFFTDimension(dispersionGridSizeZ);
} else {
dispersionGridSizeX = findLegalFFTDimension(nx);
dispersionGridSizeY = findLegalFFTDimension(ny);
dispersionGridSizeZ = findLegalFFTDimension(nz);
dispersionGridSizeX = cc.findLegalFFTDimension(nx);
dispersionGridSizeY = cc.findLegalFFTDimension(ny);
dispersionGridSizeZ = cc.findLegalFFTDimension(nz);
}
defines["EWALD_ALPHA"] = cc.doubleToString(pmeAlpha);
defines["SQRT_PI"] = cc.doubleToString(sqrt(M_PI));
......
......@@ -190,11 +190,17 @@ typedef struct {
real q, bornRadius, bornForce;
} AtomData2;
DEVICE void computeOneInteractionF1(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force);
DEVICE void computeOneInteractionF2(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force);
DEVICE void computeOneInteractionT1(AtomData2 atom1, volatile AtomData2 atom2, real3* torque);
DEVICE void computeOneInteractionT2(AtomData2 atom1, volatile AtomData2 atom2, real3* torque);
DEVICE void computeOneInteractionB1B2(AtomData2 atom1, volatile AtomData2 atom2, real* bornForce1, real* bornForce2);
#if defined(USE_HIP)
#define ATOM2_ARG_SPEC
#else
#define ATOM2_ARG_SPEC volatile
#endif
DEVICE void computeOneInteractionF1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force);
DEVICE void computeOneInteractionF2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force);
DEVICE void computeOneInteractionT1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque);
DEVICE void computeOneInteractionT2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque);
DEVICE void computeOneInteractionB1B2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* bornForce1, real* bornForce2);
inline DEVICE AtomData2 loadAtomData2(int atom, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT labFrameDipole,
GLOBAL const real* RESTRICT labFrameQuadrupole, GLOBAL const real* RESTRICT inducedDipole, GLOBAL const real* RESTRICT inducedDipolePolar, GLOBAL const real* RESTRICT bornRadius) {
......@@ -585,16 +591,38 @@ KERNEL void computeChainRuleForce(
} while (pos < end);
}
typedef struct {
real3 pos, force, dipole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS;
real q, quadrupoleXX, quadrupoleXY, quadrupoleXZ;
#if defined(USE_HIP)
#define ALIGN alignas(16)
#else
#define ALIGN
#endif
typedef struct ALIGN {
real3 pos;
real q;
real3 dipole;
#if defined(USE_HIP)
real padding0;
#endif
real3 inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS;
real quadrupoleXX, quadrupoleXY, quadrupoleXZ;
real quadrupoleYY, quadrupoleYZ, quadrupoleZZ;
real3 force;
float thole, damp;
#if defined(USE_HIP) && !defined(USE_DOUBLE_PRECISION)
real padding1[2]; // Prevent bank conflicts because the aligned size is 128
#endif
} AtomData4;
DEVICE void computeOneEDiffInteractionF1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce);
DEVICE void computeOneEDiffInteractionT1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce);
DEVICE void computeOneEDiffInteractionT3(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce);
#if defined(USE_HIP)
#define ATOM2_PTR_ARG_SPEC const
#else
#define ATOM2_PTR_ARG_SPEC volatile
#endif
DEVICE void computeOneEDiffInteractionF1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce);
DEVICE void computeOneEDiffInteractionT1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce);
DEVICE void computeOneEDiffInteractionT3(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce);
inline DEVICE AtomData4 loadAtomData4(int atom, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT labFrameDipole,
GLOBAL const real* RESTRICT labFrameQuadrupole, GLOBAL const real* RESTRICT inducedDipole, GLOBAL const real* RESTRICT inducedDipolePolar,
......
#if defined F1
DEVICE void computeOneEDiffInteractionF1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce) {
DEVICE void computeOneEDiffInteractionF1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce) {
#elif defined T1
DEVICE void computeOneEDiffInteractionT1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce) {
DEVICE void computeOneEDiffInteractionT1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce) {
#elif defined T3
DEVICE void computeOneEDiffInteractionT3(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce) {
DEVICE void computeOneEDiffInteractionT3(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce) {
#endif
const float uscale = 1;
......
......@@ -4,15 +4,15 @@
*/
#if defined F1
DEVICE void computeOneInteractionF1(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force) {
DEVICE void computeOneInteractionF1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force) {
#elif defined F2
DEVICE void computeOneInteractionF2(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force) {
DEVICE void computeOneInteractionF2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force) {
#elif defined T1
DEVICE void computeOneInteractionT1(AtomData2 atom1, volatile AtomData2 atom2, real3* torque) {
DEVICE void computeOneInteractionT1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque) {
#elif defined T2
DEVICE void computeOneInteractionT2(AtomData2 atom1, volatile AtomData2 atom2, real3* torque) {
DEVICE void computeOneInteractionT2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque) {
#elif defined B1 && defined B2
DEVICE void computeOneInteractionB1B2(AtomData2 atom1, volatile AtomData2 atom2, real* bornForce1, real* bornForce2) {
DEVICE void computeOneInteractionB1B2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* bornForce1, real* bornForce2) {
#endif
const real fc = EPSILON_FACTOR*GK_FC;
......
......@@ -19,6 +19,12 @@ typedef struct {
#endif
#ifdef ENABLE_SHUFFLE
#if defined(USE_HIP)
#define real_shfl SHFL
#else
//support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) {
return SHFL(var, srcLane);
......@@ -41,6 +47,8 @@ static __inline__ __device__ mm_long real_shfl(mm_long var, int srcLane) {
int2 fuse; fuse.x = lo; fuse.y = hi;
return *reinterpret_cast<mm_long*>(&fuse);
}
#endif
#endif
KERNEL void computeNonbonded(
......@@ -50,7 +58,7 @@ KERNEL void computeNonbonded(
, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
GLOBAL const real4* RESTRICT blockSize, GLOBAL const unsigned int* RESTRICT interactingAtoms
#ifdef __CUDA_ARCH__
#if defined(__CUDA_ARCH__) || defined(USE_HIP)
, unsigned int maxSinglePairs, GLOBAL const int2* RESTRICT singlePairs
#endif
#endif
......
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct {
real3 pos, force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
#if defined(USE_HIP)
#define ALIGN alignas(16)
#else
#define ALIGN
#endif
typedef struct ALIGN {
real3 pos;
real q;
real3 force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
float thole, damp;
#ifdef INCLUDE_QUADRUPOLES
real sphericalQuadrupole[5];
......
#ifndef HIPPO
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct {
#if defined(USE_HIP)
#define ALIGN alignas(16)
#else
#define ALIGN
#endif
typedef struct ALIGN {
real3 pos;
#if defined(USE_HIP)
real padding0;
#endif
real3 field, fieldPolar, inducedDipole, inducedDipolePolar;
#ifdef EXTRAPOLATED_POLARIZATION
real fieldGradient[6], fieldGradientPolar[6];
#endif
#ifdef USE_GK
real3 fieldS, fieldPolarS, inducedDipoleS, inducedDipolePolarS;
real bornRadius;
#ifdef EXTRAPOLATED_POLARIZATION
real fieldGradientS[6], fieldGradientPolarS[6];
#endif
real bornRadius;
#if defined(USE_HIP) && !defined(USE_DOUBLE_PRECISION)
real padding1[3]; // Prevent bank conflicts because the aligned size is 128
#endif
#endif
float thole, damp;
} AtomData;
......
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct {
real3 pos, force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
#if defined(USE_HIP)
#define ALIGN alignas(16)
#else
#define ALIGN
#endif
typedef struct ALIGN {
real3 pos;
real q;
real3 force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
float thole, damp;
#ifdef INCLUDE_QUADRUPOLES
real sphericalQuadrupole[5];
......
......@@ -277,8 +277,17 @@ add_custom_target(PythonSdist
COMMENT "Packaging source distribution package (sdist)..."
)
# Install binary module (to system location)
set(PYTHON_SETUP_COMMAND "install --root=\$ENV{DESTDIR}/")
set(OPENMM_PYTHON_USER_INSTALL OFF CACHE BOOL
"Whether to install OpenMM Python binary module into the user site-packages directory")
mark_as_advanced(OPENMM_PYTHON_USER_INSTALL)
if(OPENMM_PYTHON_USER_INSTALL)
# Install binary module to user location
set(PYTHON_SETUP_COMMAND "install --user")
else()
# Install binary module to system location
set(PYTHON_SETUP_COMMAND "install --root=\$ENV{DESTDIR}/")
endif()
configure_file(pysetup.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/pysetupinstall.cmake" @ONLY)
add_custom_target(PythonInstall
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment