Unverified Commit a0acfbc9 authored by Anton Gorenko's avatar Anton Gorenko
Browse files

Optimize PME kernels

* Compile with -munsafe-fp-atomics to enable fast hardware f32 atomic
  add on global memory on pre-MI100 GPUs;
* Use fixed point charge spreading on other GPUs, otherwise float atomic
  add will be compiled as a slow CAS loop;
* Tune block sizes, use executeKernelFlat;
* Tune launch bounds of PME grid-related kernels: force the compiler to
  use all registers by limiting max waves per EU to 1.
parent a96534c1
......@@ -375,6 +375,13 @@ public:
bool getSupports64BitGlobalAtomics() const {
return true;
}
/**
* Get whether the device being used supports 32 bit floating point atomic operations
* on global memory (fast hardware instructions, not a compare-and-swap loop implementation).
*/
bool getSupportsHardwareFloatGlobalAtomicAdd() const {
return supportsHardwareFloatGlobalAtomicAdd;
}
/**
* Get whether the device being used supports double precision math.
*/
......@@ -563,6 +570,7 @@ private:
int simdWidth;
int multiprocessors;
int sharedMemPerBlock;
bool supportsHardwareFloatGlobalAtomicAdd;
bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, boxIsTriclinic, hasCompilerKernel, isHipccAvailable, hasAssignedPosqCharges;
bool isLinkedContext;
std::string compiler, tempDir, cacheDir, gpuArchitecture;
......
......@@ -76,7 +76,8 @@ bool HipContext::hasInitializedHip = false;
HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, const std::string& hostCompiler, HipPlatform::PlatformData& platformData, HipContext* originalContext) : ComputeContext(system), currentStream(0),
platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
hasCompilerKernel(false), isHipccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL) {
hasCompilerKernel(false), isHipccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL),
supportsHardwareFloatGlobalAtomicAdd(false) {
// Determine what compiler to use.
this->compiler = "\""+compiler+"\"";
......@@ -175,6 +176,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
// HIP-TODO: find a good value here
int numThreadBlocksPerComputeUnit = 6;
// GPUs starting from CDNA1 and RDNA3 support atomic add for floats (global_atomic_add_f32),
// which can be used in PME. Older GPUs use fixed point charge spreading instead.
this->supportsHardwareFloatGlobalAtomicAdd = true;
if (gpuArchitecture.find("gfx900") == 0 ||
gpuArchitecture.find("gfx906") == 0 ||
gpuArchitecture.find("gfx10") == 0) {
this->supportsHardwareFloatGlobalAtomicAdd = false;
}
contextIsValid = true;
if (contextIndex > 0) {
int canAccess;
......@@ -421,7 +431,7 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
const char* saveTempsEnv = getenv("OPENMM_SAVE_TEMPS");
bool saveTemps = saveTempsEnv != nullptr;
string bits = intToString(8*sizeof(void*));
string options = "-ffast-math -Wall";
string options = "-ffast-math -munsafe-fp-atomics -Wall";
if (gpuArchitecture.find("gfx90a") == 0 ||
gpuArchitecture.find("gfx94") == 0) {
// HIP-TODO: Remove it when the compiler does a better job
......
......@@ -737,7 +737,7 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cu.doubleToString(M_PI);
if (cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces)
if (cu.getUseDoublePrecision() || !cu.getSupportsHardwareFloatGlobalAtomicAdd() || cu.getPlatformData().deterministicForces)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
if (usePmeStream)
pmeDefines["USE_PME_STREAM"] = "1";
......@@ -776,8 +776,6 @@ void HipCalcNonbondedForceKernel::initialize(const System& system, const Nonbond
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
pmeDefines["USE_LJPME"] = "1";
pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
if (cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
double invRCut6 = pow(force.getCutoffDistance(), -6);
double dalphaR = dispersionAlpha * force.getCutoffDistance();
double dar2 = dalphaR*dalphaR;
......@@ -1204,7 +1202,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
cu.executeKernelFlat(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex);
......@@ -1212,10 +1210,10 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
cu.executeKernelFlat(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
cu.executeKernelFlat(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useHipFFT) {
if (cu.getUseDoublePrecision()) {
......@@ -1271,7 +1269,7 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
cu.executeKernelFlat(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex);
cu.clearBuffer(pmeEnergyBuffer);
......@@ -1282,10 +1280,10 @@ double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
cu.executeKernelFlat(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
cu.executeKernelFlat(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useHipFFT) {
if (cu.getUseDoublePrecision()) {
......
......@@ -25,6 +25,9 @@ typedef unsigned long long mm_ulong;
#define SUPPORTS_DOUBLE_PRECISION 1
#define LAUNCH_BOUNDS_EXACT(WORK_GROUP_SIZE, WAVES_PER_EU) \
__attribute__((amdgpu_flat_work_group_size(WORK_GROUP_SIZE, WORK_GROUP_SIZE), amdgpu_waves_per_eu(WAVES_PER_EU, WAVES_PER_EU)))
#ifdef USE_DOUBLE_PRECISION
__device__ inline long long realToFixedPoint(double x) {
......
......@@ -63,7 +63,7 @@ public:
* Get whether charge spreading should be done in fixed point.
*/
bool useFixedPointChargeSpreading() const {
return cc.getUseDoublePrecision();
return cc.getUseDoublePrecision() || !dynamic_cast<HipContext&>(cc).getSupportsHardwareFloatGlobalAtomicAdd();
}
private:
bool hasInitializedFFT;
......@@ -94,7 +94,7 @@ public:
* Get whether charge spreading should be done in fixed point.
*/
bool useFixedPointChargeSpreading() const {
return cc.getUseDoublePrecision();
return cc.getUseDoublePrecision() || !dynamic_cast<HipContext&>(cc).getSupportsHardwareFloatGlobalAtomicAdd();
}
/**
* Sort the atom grid indices.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment