Unverified Commit a39fa14a authored by Adel Johar's avatar Adel Johar Committed by GitHub
Browse files

Final HIP Platform implementation for AMD GPUs on ROCm (#3338)



* Support kernel files with extensions of any length (like .hip)

* Do not allow to replace symbols in single-line comments

* Add OPENMM_BUILD_COMMON CMake option

It allows to build and install common platform files even if
CUDA or OpenCL platforms are not built.
This is required for HIP platform (openmm-hip) if ROCm OpenCL
packages are not installed.

* Add an option for Python wrapper to install into user packages

OPENMM_PYTHON_USER_INSTALL is OFF be default.

* Support FFT backends in Amoeba plugin

The HIP platform supports FFT backends, this commit moves
findLegalFFTDimension to ComputeContext, so platforms can have their own
implementations.

* Compatibility for common platform w/ new HIP platform

* Do not use volatile with private and local AtomData parameters on HIP

The generated code is not optimal, for example, the compiler generates
flat_load instructions instead of ds_read.

* Tune launch bounds for PME grid-related kernels and add WA for RDNA

Force the compiler to use all registers for gridSpreadCharge and
gridInterpolateForce by limiting max waves per EU to 1 on CDNA GPUs,
RDNA GPUs work better without it.

* Optimize atom data structs in GBSA and Amoeba on HIP

Manually rearrange fields, add paddings and force alignments to
have faster accesses to shared memory: ds_read and ds_write may
work slower if addresses are not aligned by 16 bytes.
Co-authored-by: default avatarAnton Gorenko <anton@streamhpc.com>
Co-authored-by: default avatarNick Curtis <nicholas.curtis@amd.com>
parent 8d9a656d
...@@ -352,7 +352,9 @@ ENDIF(OPENMM_BUILD_OPENCL_LIB) ...@@ -352,7 +352,9 @@ ENDIF(OPENMM_BUILD_OPENCL_LIB)
# Common compute files # Common compute files
IF(CUDAToolkit_FOUND OR OPENCL_FOUND) SET(OPENMM_BUILD_COMMON OFF CACHE BOOL "Build common files even if CUDA or OpenCL platforms are not built")
IF(OPENMM_BUILD_CUDA_LIB OR OPENMM_BUILD_OPENCL_LIB OR OPENMM_BUILD_COMMON)
ADD_SUBDIRECTORY(platforms/common) ADD_SUBDIRECTORY(platforms/common)
ENDIF() ENDIF()
......
FILE(GLOB KERNEL_FILES ${KERNEL_SOURCE_DIR}/kernels/*.${KERNEL_FILE_EXTENSION}) FILE(GLOB KERNEL_FILES ${KERNEL_SOURCE_DIR}/kernels/*.${KERNEL_FILE_EXTENSION})
SET(KERNEL_FILE_DECLARATIONS) SET(KERNEL_FILE_DECLARATIONS)
CONFIGURE_FILE(${KERNEL_SOURCE_DIR}/${KERNEL_SOURCE_CLASS}.cpp.in ${KERNELS_CPP}) CONFIGURE_FILE(${KERNEL_SOURCE_DIR}/${KERNEL_SOURCE_CLASS}.cpp.in ${KERNELS_CPP})
# Determine file extension length
STRING(LENGTH ${KERNEL_FILE_EXTENSION} extension_length)
# add one space for the dot
MATH(EXPR extension_length ${extension_length}+1)
FOREACH(file ${KERNEL_FILES}) FOREACH(file ${KERNEL_FILES})
# Load the file contents and process it. # Load the file contents and process it.
FILE(STRINGS ${file} file_content NEWLINE_CONSUME) FILE(STRINGS ${file} file_content NEWLINE_CONSUME)
...@@ -16,7 +20,7 @@ FOREACH(file ${KERNEL_FILES}) ...@@ -16,7 +20,7 @@ FOREACH(file ${KERNEL_FILES})
# Determine a name for the variable that will contain this file's contents # Determine a name for the variable that will contain this file's contents
FILE(RELATIVE_PATH filename ${KERNEL_SOURCE_DIR}/kernels ${file}) FILE(RELATIVE_PATH filename ${KERNEL_SOURCE_DIR}/kernels ${file})
STRING(LENGTH ${filename} filename_length) STRING(LENGTH ${filename} filename_length)
MATH(EXPR filename_length ${filename_length}-3) MATH(EXPR filename_length ${filename_length}-${extension_length})
STRING(SUBSTRING ${filename} 0 ${filename_length} variable_name) STRING(SUBSTRING ${filename} 0 ${filename_length} variable_name)
# Record the variable declaration and definition. # Record the variable declaration and definition.
......
...@@ -51,7 +51,7 @@ struct MinimizerData { ...@@ -51,7 +51,7 @@ struct MinimizerData {
Context* cpuContext; Context* cpuContext;
MinimizerData(Context& context, double k) : context(context), k(k), cpuIntegrator(1.0), cpuContext(NULL) { MinimizerData(Context& context, double k) : context(context), k(k), cpuIntegrator(1.0), cpuContext(NULL) {
string platformName = context.getPlatform().getName(); string platformName = context.getPlatform().getName();
checkLargeForces = (platformName == "CUDA" || platformName == "OpenCL"); checkLargeForces = (platformName == "CUDA" || platformName == "OpenCL" || platformName == "HIP");
} }
~MinimizerData() { ~MinimizerData() {
if (cpuContext != NULL) if (cpuContext != NULL)
...@@ -110,7 +110,7 @@ static lbfgsfloatval_t evaluate(void *instance, const lbfgsfloatval_t *x, lbfgsf ...@@ -110,7 +110,7 @@ static lbfgsfloatval_t evaluate(void *instance, const lbfgsfloatval_t *x, lbfgsf
positions[i] = Vec3(x[3*i], x[3*i+1], x[3*i+2]); positions[i] = Vec3(x[3*i], x[3*i+1], x[3*i+2]);
double energy = computeForcesAndEnergy(context, positions, g); double energy = computeForcesAndEnergy(context, positions, g);
if (data->checkLargeForces) { if (data->checkLargeForces) {
// The CUDA and OpenCL platforms accumulate forces in fixed point, so they // The CUDA, OpenCL and HIP platforms accumulate forces in fixed point, so they
// can't handle very large forces. Check for problematic forces (very large, // can't handle very large forces. Check for problematic forces (very large,
// infinite, or NaN) and if necessary recompute them on the CPU. // infinite, or NaN) and if necessary recompute them on the CPU.
......
...@@ -439,6 +439,10 @@ public: ...@@ -439,6 +439,10 @@ public:
* when it is no longer needed. * when it is no longer needed.
*/ */
virtual NonbondedUtilities* createNonbondedUtilities() = 0; virtual NonbondedUtilities* createNonbondedUtilities() = 0;
/**
* Get the smallest legal size for a dimension of the grid.
*/
virtual int findLegalFFTDimension(int minimum);
/** /**
* This should be called by the Integrator from its own initialize() method. * This should be called by the Integrator from its own initialize() method.
* It ensures all contexts are fully initialized. * It ensures all contexts are fully initialized.
......
...@@ -78,6 +78,19 @@ string ComputeContext::replaceStrings(const string& input, const std::map<std::s ...@@ -78,6 +78,19 @@ string ComputeContext::replaceStrings(const string& input, const std::map<std::s
if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) { if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
// We have found a complete symbol, not part of a longer symbol. // We have found a complete symbol, not part of a longer symbol.
// Do not allow to replace a symbol contained in single-line comments with a multi-line content
// because only the first line will be commented
// (the check is used to prevent incorrect commenting during development).
if (pair.second.find('\n') != pair.second.npos) {
int prevIndex = index;
while (prevIndex > 1 && result[prevIndex] != '\n') {
if (result[prevIndex] == '/' && result[prevIndex - 1] == '/') {
throw OpenMMException("Symbol " + pair.first + " is contained in a single-line comment");
}
prevIndex--;
}
}
result.replace(index, size, pair.second); result.replace(index, size, pair.second);
index += pair.second.size(); index += pair.second.size();
} }
...@@ -627,6 +640,23 @@ void ComputeContext::addPostComputation(ForcePostComputation* computation) { ...@@ -627,6 +640,23 @@ void ComputeContext::addPostComputation(ForcePostComputation* computation) {
postComputations.push_back(computation); postComputations.push_back(computation);
} }
int ComputeContext::findLegalFFTDimension(int minimum) {
if (minimum < 1)
return 1;
while (true) {
// Attempt to factor the current value.
int unfactored = minimum;
for (int factor = 2; factor < 8; factor++) {
while (unfactored > 1 && unfactored%factor == 0)
unfactored /= factor;
}
if (unfactored == 1)
return minimum;
minimum++;
}
}
struct ComputeContext::WorkThread::ThreadData { struct ComputeContext::WorkThread::ThreadData {
ThreadData(std::queue<ComputeContext::WorkTask*>& tasks, bool& waiting, bool& finished, bool& threwException, OpenMMException& stashedException, ThreadData(std::queue<ComputeContext::WorkTask*>& tasks, bool& waiting, bool& finished, bool& threwException, OpenMMException& stashedException,
pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) : pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :
......
...@@ -152,7 +152,7 @@ KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 ...@@ -152,7 +152,7 @@ KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4
) { ) {
LOCAL real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE]; LOCAL real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
int indexInWarp = LOCAL_ID%32; int indexInWarp = LOCAL_ID%32;
#ifndef __CUDA_ARCH__ #if !(defined(__CUDA_ARCH__) || defined(USE_HIP))
LOCAL bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE]; LOCAL bool includeBlockFlags[FIND_NEIGHBORS_WORKGROUP_SIZE];
int warpStart = LOCAL_ID-indexInWarp; int warpStart = LOCAL_ID-indexInWarp;
#endif #endif
...@@ -191,7 +191,7 @@ KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 ...@@ -191,7 +191,7 @@ KERNEL void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4
// Loop over any blocks we identified as potentially containing neighbors. // Loop over any blocks we identified as potentially containing neighbors.
#ifdef __CUDA_ARCH__ #if defined(__CUDA_ARCH__) || defined(USE_HIP)
int includeBlockFlags = BALLOT(includeBlock2); int includeBlockFlags = BALLOT(includeBlock2);
while (includeBlockFlags != 0) { while (includeBlockFlags != 0) {
int i = __ffs(includeBlockFlags)-1; int i = __ffs(includeBlockFlags)-1;
......
...@@ -18,6 +18,10 @@ DEVICE int reduceMax(int val, LOCAL_ARG int* temp) { ...@@ -18,6 +18,10 @@ DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
for (int mask = 16; mask > 0; mask /= 2) for (int mask = 16; mask > 0; mask /= 2)
val = max(val, __shfl_xor_sync(0xffffffff, val, mask)); val = max(val, __shfl_xor_sync(0xffffffff, val, mask));
return val; return val;
#elif defined(USE_HIP)
for (int mask = 16; mask > 0; mask /= 2)
val = max(val, __shfl_xor(val, mask, 32));
return val;
#else #else
int indexInWarp = LOCAL_ID%32; int indexInWarp = LOCAL_ID%32;
temp[LOCAL_ID] = val; temp[LOCAL_ID] = val;
......
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct { #if defined(USE_HIP)
#define ALIGN alignas(16)
#else
#define ALIGN
#endif
typedef struct ALIGN {
real x, y, z; real x, y, z;
real q; real q;
float radius, scaledRadius; float radius, scaledRadius;
...@@ -367,7 +373,7 @@ KERNEL void computeBornSum( ...@@ -367,7 +373,7 @@ KERNEL void computeBornSum(
} }
} }
typedef struct { typedef struct ALIGN {
real x, y, z; real x, y, z;
real q; real q;
real fx, fy, fz, fw; real fx, fy, fz, fw;
......
...@@ -61,6 +61,9 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE ...@@ -61,6 +61,9 @@ KERNEL void findAtomGridIndex(GLOBAL const real4* RESTRICT posq, GLOBAL int2* RE
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#if defined(USE_HIP) && !defined(AMD_RDNA)
LAUNCH_BOUNDS_EXACT(128, 1)
#endif
KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
#ifdef USE_FIXED_POINT_CHARGE_SPREADING #ifdef USE_FIXED_POINT_CHARGE_SPREADING
GLOBAL mm_ulong* RESTRICT pmeGrid, GLOBAL mm_ulong* RESTRICT pmeGrid,
...@@ -75,6 +78,10 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, ...@@ -75,6 +78,10 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
GLOBAL const real* RESTRICT charges GLOBAL const real* RESTRICT charges
#endif #endif
) { ) {
// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
#if defined(USE_HIP)
(void)GLOBAL_ID;
#endif
// To improve memory efficiency, we divide indices along the z axis into // To improve memory efficiency, we divide indices along the z axis into
// PME_ORDER blocks, where the data for each block is stored together. We // PME_ORDER blocks, where the data for each block is stored together. We
// can ensure that all threads write to the same block at the same time, // can ensure that all threads write to the same block at the same time,
...@@ -84,7 +91,7 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, ...@@ -84,7 +91,7 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER); int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = LOCAL_ID; i < GRID_SIZE_Z+PME_ORDER; i += LOCAL_SIZE) { for (int i = LOCAL_ID; i < GRID_SIZE_Z+PME_ORDER; i += LOCAL_SIZE) {
int zindex = i % GRID_SIZE_Z; int zindex = i % GRID_SIZE_Z;
int block = zindex % PME_ORDER; int block = zindex % PME_ORDER;
zindexTable[i] = zindex/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize; zindexTable[i] = zindex/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
} }
SYNC_THREADS; SYNC_THREADS;
...@@ -137,7 +144,7 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, ...@@ -137,7 +144,7 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
// Spread the charge from this atom onto each grid point. // Spread the charge from this atom onto each grid point.
int izoffset = (PME_ORDER-(gridIndex.z%PME_ORDER)) % PME_ORDER; int izoffset = (PME_ORDER-(gridIndex.z%PME_ORDER)) % PME_ORDER;
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix; int xbase = gridIndex.x+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0); xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
...@@ -149,7 +156,7 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq, ...@@ -149,7 +156,7 @@ KERNEL void gridSpreadCharge(GLOBAL const real4* RESTRICT posq,
ybase = (xbase+ybase)*blockSize; ybase = (xbase+ybase)*blockSize;
real dxdy = dx*data[iy].y; real dxdy = dx*data[iy].y;
for (int i = 0; i < PME_ORDER; i++) { for (int i = 0; i < PME_ORDER; i++) {
int iz = (i+izoffset) % PME_ORDER; int iz = (i+izoffset) % PME_ORDER;
int zindex = gridIndex.z+iz; int zindex = gridIndex.z+iz;
int index = ybase + zindexTable[zindex]; int index = ybase + zindexTable[zindex];
real add = dxdy*data[iz].z; real add = dxdy*data[iz].z;
...@@ -171,6 +178,10 @@ KERNEL void finishSpreadCharge( ...@@ -171,6 +178,10 @@ KERNEL void finishSpreadCharge(
GLOBAL const real* RESTRICT grid1, GLOBAL const real* RESTRICT grid1,
#endif #endif
GLOBAL real* RESTRICT grid2) { GLOBAL real* RESTRICT grid2) {
// HIP-TODO: Workaround for RDNA, remove it when the compiler issue is fixed
#if defined(USE_HIP)
(void)GLOBAL_ID;
#endif
// During charge spreading, we shuffled the order of indices along the z // During charge spreading, we shuffled the order of indices along the z
// axis to make memory access more efficient. We now need to unshuffle // axis to make memory access more efficient. We now need to unshuffle
// them. If the values were accumulated as fixed point, we also need to // them. If the values were accumulated as fixed point, we also need to
...@@ -179,7 +190,7 @@ KERNEL void finishSpreadCharge( ...@@ -179,7 +190,7 @@ KERNEL void finishSpreadCharge(
LOCAL int zindexTable[GRID_SIZE_Z]; LOCAL int zindexTable[GRID_SIZE_Z];
int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER); int blockSize = (int) ceil(GRID_SIZE_Z/(real) PME_ORDER);
for (int i = LOCAL_ID; i < GRID_SIZE_Z; i += LOCAL_SIZE) { for (int i = LOCAL_ID; i < GRID_SIZE_Z; i += LOCAL_SIZE) {
int block = i % PME_ORDER; int block = i % PME_ORDER;
zindexTable[i] = i/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize; zindexTable[i] = i/PME_ORDER + block*GRID_SIZE_X*GRID_SIZE_Y*blockSize;
} }
SYNC_THREADS; SYNC_THREADS;
...@@ -503,6 +514,9 @@ KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RES ...@@ -503,6 +514,9 @@ KERNEL void gridEvaluateEnergy(GLOBAL real2* RESTRICT pmeGrid, GLOBAL mixed* RES
#endif #endif
} }
#if defined(USE_HIP) && !defined(AMD_RDNA) && !defined(USE_DOUBLE_PRECISION)
LAUNCH_BOUNDS_EXACT(128, 1)
#endif
KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const real* RESTRICT pmeGrid, KERNEL void gridInterpolateForce(GLOBAL const real4* RESTRICT posq, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const real* RESTRICT pmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex, real4 recipBoxVecX, real4 recipBoxVecY, real4 recipBoxVecZ, GLOBAL const int2* RESTRICT pmeAtomGridIndex,
......
...@@ -50,23 +50,6 @@ ...@@ -50,23 +50,6 @@
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
static int findLegalFFTDimension(int minimum) {
if (minimum < 1)
return 1;
while (true) {
// Attempt to factor the current value.
int unfactored = minimum;
for (int factor = 2; factor < 8; factor++) {
while (unfactored > 1 && unfactored%factor == 0)
unfactored /= factor;
}
if (unfactored == 1)
return minimum;
minimum++;
}
}
static void setPeriodicBoxArgs(ComputeContext& cc, ComputeKernel kernel, int index) { static void setPeriodicBoxArgs(ComputeContext& cc, ComputeKernel kernel, int index) {
Vec3 a, b, c; Vec3 a, b, c;
cc.getPeriodicBoxVectors(a, b, c); cc.getPeriodicBoxVectors(a, b, c);
...@@ -456,13 +439,13 @@ void CommonCalcAmoebaMultipoleForceKernel::initialize(const System& system, cons ...@@ -456,13 +439,13 @@ void CommonCalcAmoebaMultipoleForceKernel::initialize(const System& system, cons
nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance()); nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
nb.setCutoffDistance(force.getCutoffDistance()); nb.setCutoffDistance(force.getCutoffDistance());
NonbondedForceImpl::calcPMEParameters(system, nb, pmeAlpha, gridSizeX, gridSizeY, gridSizeZ, false); NonbondedForceImpl::calcPMEParameters(system, nb, pmeAlpha, gridSizeX, gridSizeY, gridSizeZ, false);
gridSizeX = findLegalFFTDimension(gridSizeX); gridSizeX = cc.findLegalFFTDimension(gridSizeX);
gridSizeY = findLegalFFTDimension(gridSizeY); gridSizeY = cc.findLegalFFTDimension(gridSizeY);
gridSizeZ = findLegalFFTDimension(gridSizeZ); gridSizeZ = cc.findLegalFFTDimension(gridSizeZ);
} else { } else {
gridSizeX = findLegalFFTDimension(nx); gridSizeX = cc.findLegalFFTDimension(nx);
gridSizeY = findLegalFFTDimension(ny); gridSizeY = cc.findLegalFFTDimension(ny);
gridSizeZ = findLegalFFTDimension(nz); gridSizeZ = cc.findLegalFFTDimension(nz);
} }
defines["EWALD_ALPHA"] = cc.doubleToString(pmeAlpha); defines["EWALD_ALPHA"] = cc.doubleToString(pmeAlpha);
defines["SQRT_PI"] = cc.doubleToString(sqrt(M_PI)); defines["SQRT_PI"] = cc.doubleToString(sqrt(M_PI));
...@@ -2550,13 +2533,13 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const ...@@ -2550,13 +2533,13 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance()); nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
nb.setCutoffDistance(force.getCutoffDistance()); nb.setCutoffDistance(force.getCutoffDistance());
NonbondedForceImpl::calcPMEParameters(system, nb, pmeAlpha, gridSizeX, gridSizeY, gridSizeZ, false); NonbondedForceImpl::calcPMEParameters(system, nb, pmeAlpha, gridSizeX, gridSizeY, gridSizeZ, false);
gridSizeX = findLegalFFTDimension(gridSizeX); gridSizeX = cc.findLegalFFTDimension(gridSizeX);
gridSizeY = findLegalFFTDimension(gridSizeY); gridSizeY = cc.findLegalFFTDimension(gridSizeY);
gridSizeZ = findLegalFFTDimension(gridSizeZ); gridSizeZ = cc.findLegalFFTDimension(gridSizeZ);
} else { } else {
gridSizeX = findLegalFFTDimension(nx); gridSizeX = cc.findLegalFFTDimension(nx);
gridSizeY = findLegalFFTDimension(ny); gridSizeY = cc.findLegalFFTDimension(ny);
gridSizeZ = findLegalFFTDimension(nz); gridSizeZ = cc.findLegalFFTDimension(nz);
} }
force.getDPMEParameters(dpmeAlpha, nx, ny, nz); force.getDPMEParameters(dpmeAlpha, nx, ny, nz);
if (nx == 0 || dpmeAlpha == 0) { if (nx == 0 || dpmeAlpha == 0) {
...@@ -2564,13 +2547,13 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const ...@@ -2564,13 +2547,13 @@ void CommonCalcHippoNonbondedForceKernel::initialize(const System& system, const
nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance()); nb.setEwaldErrorTolerance(force.getEwaldErrorTolerance());
nb.setCutoffDistance(force.getCutoffDistance()); nb.setCutoffDistance(force.getCutoffDistance());
NonbondedForceImpl::calcPMEParameters(system, nb, dpmeAlpha, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true); NonbondedForceImpl::calcPMEParameters(system, nb, dpmeAlpha, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
dispersionGridSizeX = findLegalFFTDimension(dispersionGridSizeX); dispersionGridSizeX = cc.findLegalFFTDimension(dispersionGridSizeX);
dispersionGridSizeY = findLegalFFTDimension(dispersionGridSizeY); dispersionGridSizeY = cc.findLegalFFTDimension(dispersionGridSizeY);
dispersionGridSizeZ = findLegalFFTDimension(dispersionGridSizeZ); dispersionGridSizeZ = cc.findLegalFFTDimension(dispersionGridSizeZ);
} else { } else {
dispersionGridSizeX = findLegalFFTDimension(nx); dispersionGridSizeX = cc.findLegalFFTDimension(nx);
dispersionGridSizeY = findLegalFFTDimension(ny); dispersionGridSizeY = cc.findLegalFFTDimension(ny);
dispersionGridSizeZ = findLegalFFTDimension(nz); dispersionGridSizeZ = cc.findLegalFFTDimension(nz);
} }
defines["EWALD_ALPHA"] = cc.doubleToString(pmeAlpha); defines["EWALD_ALPHA"] = cc.doubleToString(pmeAlpha);
defines["SQRT_PI"] = cc.doubleToString(sqrt(M_PI)); defines["SQRT_PI"] = cc.doubleToString(sqrt(M_PI));
......
...@@ -190,11 +190,17 @@ typedef struct { ...@@ -190,11 +190,17 @@ typedef struct {
real q, bornRadius, bornForce; real q, bornRadius, bornForce;
} AtomData2; } AtomData2;
DEVICE void computeOneInteractionF1(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force); #if defined(USE_HIP)
DEVICE void computeOneInteractionF2(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force); #define ATOM2_ARG_SPEC
DEVICE void computeOneInteractionT1(AtomData2 atom1, volatile AtomData2 atom2, real3* torque); #else
DEVICE void computeOneInteractionT2(AtomData2 atom1, volatile AtomData2 atom2, real3* torque); #define ATOM2_ARG_SPEC volatile
DEVICE void computeOneInteractionB1B2(AtomData2 atom1, volatile AtomData2 atom2, real* bornForce1, real* bornForce2); #endif
DEVICE void computeOneInteractionF1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force);
DEVICE void computeOneInteractionF2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force);
DEVICE void computeOneInteractionT1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque);
DEVICE void computeOneInteractionT2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque);
DEVICE void computeOneInteractionB1B2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* bornForce1, real* bornForce2);
inline DEVICE AtomData2 loadAtomData2(int atom, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT labFrameDipole, inline DEVICE AtomData2 loadAtomData2(int atom, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT labFrameDipole,
GLOBAL const real* RESTRICT labFrameQuadrupole, GLOBAL const real* RESTRICT inducedDipole, GLOBAL const real* RESTRICT inducedDipolePolar, GLOBAL const real* RESTRICT bornRadius) { GLOBAL const real* RESTRICT labFrameQuadrupole, GLOBAL const real* RESTRICT inducedDipole, GLOBAL const real* RESTRICT inducedDipolePolar, GLOBAL const real* RESTRICT bornRadius) {
...@@ -585,16 +591,38 @@ KERNEL void computeChainRuleForce( ...@@ -585,16 +591,38 @@ KERNEL void computeChainRuleForce(
} while (pos < end); } while (pos < end);
} }
typedef struct { #if defined(USE_HIP)
real3 pos, force, dipole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS; #define ALIGN alignas(16)
real q, quadrupoleXX, quadrupoleXY, quadrupoleXZ; #else
#define ALIGN
#endif
typedef struct ALIGN {
real3 pos;
real q;
real3 dipole;
#if defined(USE_HIP)
real padding0;
#endif
real3 inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS;
real quadrupoleXX, quadrupoleXY, quadrupoleXZ;
real quadrupoleYY, quadrupoleYZ, quadrupoleZZ; real quadrupoleYY, quadrupoleYZ, quadrupoleZZ;
real3 force;
float thole, damp; float thole, damp;
#if defined(USE_HIP) && !defined(USE_DOUBLE_PRECISION)
real padding1[2]; // Prevent bank conflicts because the aligned size is 128
#endif
} AtomData4; } AtomData4;
DEVICE void computeOneEDiffInteractionF1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce); #if defined(USE_HIP)
DEVICE void computeOneEDiffInteractionT1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce); #define ATOM2_PTR_ARG_SPEC const
DEVICE void computeOneEDiffInteractionT3(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce); #else
#define ATOM2_PTR_ARG_SPEC volatile
#endif
DEVICE void computeOneEDiffInteractionF1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce);
DEVICE void computeOneEDiffInteractionT1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce);
DEVICE void computeOneEDiffInteractionT3(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce);
inline DEVICE AtomData4 loadAtomData4(int atom, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT labFrameDipole, inline DEVICE AtomData4 loadAtomData4(int atom, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT labFrameDipole,
GLOBAL const real* RESTRICT labFrameQuadrupole, GLOBAL const real* RESTRICT inducedDipole, GLOBAL const real* RESTRICT inducedDipolePolar, GLOBAL const real* RESTRICT labFrameQuadrupole, GLOBAL const real* RESTRICT inducedDipole, GLOBAL const real* RESTRICT inducedDipolePolar,
......
#if defined F1 #if defined F1
DEVICE void computeOneEDiffInteractionF1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce) { DEVICE void computeOneEDiffInteractionF1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real* outputEnergy, real3* outputForce) {
#elif defined T1 #elif defined T1
DEVICE void computeOneEDiffInteractionT1(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce) { DEVICE void computeOneEDiffInteractionT1(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce) {
#elif defined T3 #elif defined T3
DEVICE void computeOneEDiffInteractionT3(AtomData4* atom1, LOCAL_ARG volatile AtomData4* atom2, float dScale, float pScale, real3* outputForce) { DEVICE void computeOneEDiffInteractionT3(const AtomData4* atom1, LOCAL_ARG ATOM2_PTR_ARG_SPEC AtomData4* atom2, float dScale, float pScale, real3* outputForce) {
#endif #endif
const float uscale = 1; const float uscale = 1;
......
...@@ -4,15 +4,15 @@ ...@@ -4,15 +4,15 @@
*/ */
#if defined F1 #if defined F1
DEVICE void computeOneInteractionF1(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force) { DEVICE void computeOneInteractionF1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force) {
#elif defined F2 #elif defined F2
DEVICE void computeOneInteractionF2(AtomData2 atom1, volatile AtomData2 atom2, real* outputEnergy, real3* force) { DEVICE void computeOneInteractionF2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* outputEnergy, real3* force) {
#elif defined T1 #elif defined T1
DEVICE void computeOneInteractionT1(AtomData2 atom1, volatile AtomData2 atom2, real3* torque) { DEVICE void computeOneInteractionT1(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque) {
#elif defined T2 #elif defined T2
DEVICE void computeOneInteractionT2(AtomData2 atom1, volatile AtomData2 atom2, real3* torque) { DEVICE void computeOneInteractionT2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real3* torque) {
#elif defined B1 && defined B2 #elif defined B1 && defined B2
DEVICE void computeOneInteractionB1B2(AtomData2 atom1, volatile AtomData2 atom2, real* bornForce1, real* bornForce2) { DEVICE void computeOneInteractionB1B2(AtomData2 atom1, ATOM2_ARG_SPEC AtomData2 atom2, real* bornForce1, real* bornForce2) {
#endif #endif
const real fc = EPSILON_FACTOR*GK_FC; const real fc = EPSILON_FACTOR*GK_FC;
......
...@@ -19,6 +19,12 @@ typedef struct { ...@@ -19,6 +19,12 @@ typedef struct {
#endif #endif
#ifdef ENABLE_SHUFFLE #ifdef ENABLE_SHUFFLE
#if defined(USE_HIP)
#define real_shfl SHFL
#else
//support for 64 bit shuffles //support for 64 bit shuffles
static __inline__ __device__ float real_shfl(float var, int srcLane) { static __inline__ __device__ float real_shfl(float var, int srcLane) {
return SHFL(var, srcLane); return SHFL(var, srcLane);
...@@ -41,6 +47,8 @@ static __inline__ __device__ mm_long real_shfl(mm_long var, int srcLane) { ...@@ -41,6 +47,8 @@ static __inline__ __device__ mm_long real_shfl(mm_long var, int srcLane) {
int2 fuse; fuse.x = lo; fuse.y = hi; int2 fuse; fuse.x = lo; fuse.y = hi;
return *reinterpret_cast<mm_long*>(&fuse); return *reinterpret_cast<mm_long*>(&fuse);
} }
#endif
#endif #endif
KERNEL void computeNonbonded( KERNEL void computeNonbonded(
...@@ -50,7 +58,7 @@ KERNEL void computeNonbonded( ...@@ -50,7 +58,7 @@ KERNEL void computeNonbonded(
, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, , GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
GLOBAL const real4* RESTRICT blockSize, GLOBAL const unsigned int* RESTRICT interactingAtoms GLOBAL const real4* RESTRICT blockSize, GLOBAL const unsigned int* RESTRICT interactingAtoms
#ifdef __CUDA_ARCH__ #if defined(__CUDA_ARCH__) || defined(USE_HIP)
, unsigned int maxSinglePairs, GLOBAL const int2* RESTRICT singlePairs , unsigned int maxSinglePairs, GLOBAL const int2* RESTRICT singlePairs
#endif #endif
#endif #endif
...@@ -478,4 +486,4 @@ KERNEL void computeNonbonded( ...@@ -478,4 +486,4 @@ KERNEL void computeNonbonded(
#ifdef INCLUDE_ENERGY #ifdef INCLUDE_ENERGY
energyBuffer[GLOBAL_ID] += energy; energyBuffer[GLOBAL_ID] += energy;
#endif #endif
} }
\ No newline at end of file
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct { #if defined(USE_HIP)
real3 pos, force, torque, inducedDipole, inducedDipolePolar, sphericalDipole; #define ALIGN alignas(16)
#else
#define ALIGN
#endif
typedef struct ALIGN {
real3 pos;
real q; real q;
real3 force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
float thole, damp; float thole, damp;
#ifdef INCLUDE_QUADRUPOLES #ifdef INCLUDE_QUADRUPOLES
real sphericalQuadrupole[5]; real sphericalQuadrupole[5];
......
#ifndef HIPPO #ifndef HIPPO
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct { #if defined(USE_HIP)
#define ALIGN alignas(16)
#else
#define ALIGN
#endif
typedef struct ALIGN {
real3 pos; real3 pos;
#if defined(USE_HIP)
real padding0;
#endif
real3 field, fieldPolar, inducedDipole, inducedDipolePolar; real3 field, fieldPolar, inducedDipole, inducedDipolePolar;
#ifdef EXTRAPOLATED_POLARIZATION #ifdef EXTRAPOLATED_POLARIZATION
real fieldGradient[6], fieldGradientPolar[6]; real fieldGradient[6], fieldGradientPolar[6];
#endif #endif
#ifdef USE_GK #ifdef USE_GK
real3 fieldS, fieldPolarS, inducedDipoleS, inducedDipolePolarS; real3 fieldS, fieldPolarS, inducedDipoleS, inducedDipolePolarS;
real bornRadius;
#ifdef EXTRAPOLATED_POLARIZATION #ifdef EXTRAPOLATED_POLARIZATION
real fieldGradientS[6], fieldGradientPolarS[6]; real fieldGradientS[6], fieldGradientPolarS[6];
#endif #endif
real bornRadius;
#if defined(USE_HIP) && !defined(USE_DOUBLE_PRECISION)
real padding1[3]; // Prevent bank conflicts because the aligned size is 128
#endif
#endif #endif
float thole, damp; float thole, damp;
} AtomData; } AtomData;
......
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct { #if defined(USE_HIP)
real3 pos, force, torque, inducedDipole, inducedDipolePolar, sphericalDipole; #define ALIGN alignas(16)
#else
#define ALIGN
#endif
typedef struct ALIGN {
real3 pos;
real q; real q;
real3 force, torque, inducedDipole, inducedDipolePolar, sphericalDipole;
float thole, damp; float thole, damp;
#ifdef INCLUDE_QUADRUPOLES #ifdef INCLUDE_QUADRUPOLES
real sphericalQuadrupole[5]; real sphericalQuadrupole[5];
......
...@@ -277,8 +277,17 @@ add_custom_target(PythonSdist ...@@ -277,8 +277,17 @@ add_custom_target(PythonSdist
COMMENT "Packaging source distribution package (sdist)..." COMMENT "Packaging source distribution package (sdist)..."
) )
# Install binary module (to system location) set(OPENMM_PYTHON_USER_INSTALL OFF CACHE BOOL
set(PYTHON_SETUP_COMMAND "install --root=\$ENV{DESTDIR}/") "Whether to install OpenMM Python binary module into the user site-packages directory")
mark_as_advanced(OPENMM_PYTHON_USER_INSTALL)
if(OPENMM_PYTHON_USER_INSTALL)
# Install binary module to user location
set(PYTHON_SETUP_COMMAND "install --user")
else()
# Install binary module to system location
set(PYTHON_SETUP_COMMAND "install --root=\$ENV{DESTDIR}/")
endif()
configure_file(pysetup.cmake.in configure_file(pysetup.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/pysetupinstall.cmake" @ONLY) "${CMAKE_CURRENT_BINARY_DIR}/pysetupinstall.cmake" @ONLY)
add_custom_target(PythonInstall add_custom_target(PythonInstall
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment