Unverified Commit 89d2ff0e authored by Anton Gorenko's avatar Anton Gorenko
Browse files

Add hipification of CUDA platform

Port changes in CUDA backend to HIP

Fix a warning about arithmetic operations on void* in HipArray::uploadSubArray

Fix "Error Initializing context ROCm 5.3.0"

    https://github.com/StreamHPC/openmm-hip/issues/3


    hipDeviceSetCacheConfig returns hipErrorNotSupported on 5.3
Co-authored-by: default avatarNick Curtis <nicholas.curtis@amd.com>
parent 8defca2d
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipBondedUtilities.h"
#include "HipContext.h"
#include "HipExpressionUtilities.h"
#include "HipKernelSources.h"
#include "openmm/OpenMMException.h"
#include "HipNonbondedUtilities.h"
#include <iostream>
using namespace OpenMM;
using namespace std;
HipBondedUtilities::HipBondedUtilities(HipContext& context) : context(context), numForceBuffers(0), maxBonds(0), allGroups(0), hasInitializedKernels(false) {
}
void HipBondedUtilities::addInteraction(const vector<vector<int> >& atoms, const string& source, int group) {
if (atoms.size() > 0) {
forceAtoms.push_back(atoms);
forceSource.push_back(source);
forceGroup.push_back(group);
allGroups |= 1<<group;
}
}
string HipBondedUtilities::addArgument(hipDeviceptr_t data, const string& type) {
arguments.push_back(data);
argTypes.push_back(type);
return "customArg"+context.intToString(arguments.size());
}
string HipBondedUtilities::addArgument(ArrayInterface& data, const string& type) {
return addArgument(context.unwrap(data).getDevicePointer(), type);
}
string HipBondedUtilities::addEnergyParameterDerivative(const string& param) {
// See if the parameter has already been added.
int index;
for (index = 0; index < energyParameterDerivatives.size(); index++)
if (param == energyParameterDerivatives[index])
break;
if (index == energyParameterDerivatives.size())
energyParameterDerivatives.push_back(param);
context.addEnergyParameterDerivative(param);
return string("energyParamDeriv")+context.intToString(index);
}
void HipBondedUtilities::addPrefixCode(const string& source) {
for (int i = 0; i < (int) prefixCode.size(); i++)
if (prefixCode[i] == source)
return;
prefixCode.push_back(source);
}
void HipBondedUtilities::initialize(const System& system) {
int numForces = forceAtoms.size();
hasInteractions = (numForces > 0);
if (!hasInteractions)
return;
// Build the lists of atom indices.
atomIndices.resize(numForces);
for (int i = 0; i < numForces; i++) {
int numBonds = forceAtoms[i].size();
int numAtoms = forceAtoms[i][0].size();
int numArrays = (numAtoms+3)/4;
int startAtom = 0;
atomIndices[i].resize(numArrays);
for (int j = 0; j < numArrays; j++) {
int width = min(numAtoms-startAtom, 4);
int paddedWidth = (width == 3 ? 4 : width);
vector<unsigned int> indexVec(paddedWidth*numBonds);
for (int bond = 0; bond < numBonds; bond++) {
for (int atom = 0; atom < width; atom++)
indexVec[bond*paddedWidth+atom] = forceAtoms[i][bond][startAtom+atom];
}
atomIndices[i][j].initialize(context, numBonds, 4*paddedWidth, "bondedIndices");
atomIndices[i][j].upload(&indexVec[0]);
startAtom += width;
}
}
// Create the kernel.
stringstream s;
s<<HipKernelSources::vectorOps;
for (int i = 0; i < (int) prefixCode.size(); i++)
s<<prefixCode[i];
s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
for (int force = 0; force < numForces; force++) {
for (int i = 0; i < (int) atomIndices[force].size(); i++) {
int indexWidth = atomIndices[force][i].getElementSize()/4;
string indexType = "uint"+context.intToString(indexWidth);
s<<", const "<<indexType<<"* __restrict__ atomIndices"<<force<<"_"<<i;
}
}
for (int i = 0; i < (int) arguments.size(); i++)
s<<", "<<argTypes[i]<<"* customArg"<<(i+1);
if (energyParameterDerivatives.size() > 0)
s<<", mixed* __restrict__ energyParamDerivs";
s<<") {\n";
s<<"mixed energy = 0;\n";
for (int i = 0; i < energyParameterDerivatives.size(); i++)
s<<"mixed energyParamDeriv"<<i<<" = 0;\n";
for (int force = 0; force < numForces; force++)
s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n";
const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
int numDerivs = allParamDerivNames.size();
for (int i = 0; i < energyParameterDerivatives.size(); i++)
for (int index = 0; index < numDerivs; index++)
if (allParamDerivNames[index] == energyParameterDerivatives[i])
s<<"energyParamDerivs[(blockIdx.x*blockDim.x+threadIdx.x)*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
s<<"}\n";
map<string, string> defines;
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
hipModule_t module = context.createModule(s.str(), defines);
kernel = context.getKernel(module, "computeBondedForces");
forceAtoms.clear();
forceSource.clear();
}
string HipBondedUtilities::createForceSource(int forceIndex, int numBonds, int numAtoms, int group, const string& computeForce) {
maxBonds = max(maxBonds, numBonds);
string suffix[] = {".x", ".y", ".z", ".w"};
stringstream s;
s<<"if ((groups&"<<(1<<group)<<") != 0)\n";
s<<"for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < "<<numBonds<<"; index += blockDim.x*gridDim.x) {\n";
int startAtom = 0;
for (int i = 0; i < (int) atomIndices[forceIndex].size(); i++) {
int indexWidth = atomIndices[forceIndex][i].getElementSize()/4;
string indexType = "uint"+context.intToString(indexWidth);
s<<" "<<indexType<<" atoms"<<i<<" = atomIndices"<<forceIndex<<"_"<<i<<"[index];\n";
int atomsToLoad = min(indexWidth, numAtoms-startAtom);
for (int j = 0; j < atomsToLoad; j++) {
s<<" unsigned int atom"<<(startAtom+j+1)<<" = atoms"<<i<<suffix[j]<<";\n";
s<<" real4 pos"<<(startAtom+j+1)<<" = posq[atom"<<(startAtom+j+1)<<"];\n";
}
startAtom += indexWidth;
}
s<<computeForce<<"\n";
for (int i = 0; i < numAtoms; i++) {
s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".x*0x100000000)));\n";
s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".y*0x100000000)));\n";
s<<" atomicAdd(&forceBuffer[atom"<<(i+1)<<"+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (force"<<(i+1)<<".z*0x100000000)));\n";
s<<" __threadfence_block();\n";
}
s<<"}\n";
return s.str();
}
void HipBondedUtilities::computeInteractions(int groups) {
if ((groups&allGroups) == 0)
return;
if (!hasInitializedKernels) {
hasInitializedKernels = true;
kernelArgs.push_back(&context.getForce().getDevicePointer());
kernelArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
kernelArgs.push_back(&context.getPosq().getDevicePointer());
kernelArgs.push_back(NULL);
kernelArgs.push_back(context.getPeriodicBoxSizePointer());
kernelArgs.push_back(context.getInvPeriodicBoxSizePointer());
kernelArgs.push_back(context.getPeriodicBoxVecXPointer());
kernelArgs.push_back(context.getPeriodicBoxVecYPointer());
kernelArgs.push_back(context.getPeriodicBoxVecZPointer());
for (int i = 0; i < (int) atomIndices.size(); i++)
for (int j = 0; j < (int) atomIndices[i].size(); j++)
kernelArgs.push_back(&atomIndices[i][j].getDevicePointer());
for (int i = 0; i < (int) arguments.size(); i++)
kernelArgs.push_back(&arguments[i]);
if (energyParameterDerivatives.size() > 0)
kernelArgs.push_back(&context.getEnergyParamDerivBuffer().getDevicePointer());
}
if (!hasInteractions)
return;
kernelArgs[3] = &groups;
context.executeKernel(kernel, &kernelArgs[0], maxBonds);
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#ifdef WIN32
#error "Windows unsupported for HIP platform"
#endif
#include <cmath>
#include "HipContext.h"
#include "HipArray.h"
#include "HipBondedUtilities.h"
#include "HipEvent.h"
#include "HipIntegrationUtilities.h"
#include "HipKernels.h"
#include "HipKernelSources.h"
#include "HipNonbondedUtilities.h"
#include "HipProgram.h"
#include "openmm/common/ComputeArray.h"
#include "SHA1.h"
#include "openmm/Platform.h"
#include "openmm/System.h"
#include "openmm/VirtualSite.h"
#include "HipExpressionUtilities.h"
#include "openmm/internal/ContextImpl.h"
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <set>
#include <sstream>
#include <typeinfo>
#include <sys/stat.h>
#include <unistd.h>
#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
#define CHECK_RESULT2(result, prefix) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<prefix<<": "<<getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
using namespace OpenMM;
using namespace std;
const int HipContext::ThreadBlockSize = 64;
const int HipContext::TileSize = sizeof(tileflags)*8;
static_assert(HipContext::ThreadBlockSize == HipContext::TileSize);
bool HipContext::hasInitializedHip = false;
HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, const std::string& hostCompiler, HipPlatform::PlatformData& platformData, HipContext* originalContext) : ComputeContext(system), currentStream(0),
platformData(platformData), contextIsValid(false), hasAssignedPosqCharges(false),
hasCompilerKernel(false), isHipccAvailable(false), pinnedBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL) {
// Determine what compiler to use.
this->compiler = "\""+compiler+"\"";
if (platformData.context != NULL) {
try {
compilerKernel = platformData.context->getPlatform().createKernel(HipCompilerKernel::Name(), *platformData.context);
hasCompilerKernel = true;
}
catch (...) {
// The runtime compiler plugin isn't available.
}
}
string testCompilerCommand = this->compiler+" --version > /dev/null 2> /dev/null";
int res = std::system(testCompilerCommand.c_str());
struct stat info;
isHipccAvailable = (res == 0 && stat(tempDir.c_str(), &info) == 0);
if (!hasInitializedHip) {
CHECK_RESULT2(hipInit(0), "Error initializing HIP");
hasInitializedHip = true;
}
if (precision == "single") {
useDoublePrecision = false;
useMixedPrecision = false;
}
else if (precision == "mixed") {
useDoublePrecision = false;
useMixedPrecision = true;
}
else if (precision == "double") {
useDoublePrecision = true;
useMixedPrecision = false;
}
else
throw OpenMMException("Illegal value for Precision: "+precision);
char* cacheVariable = getenv("OPENMM_CACHE_DIR");
cacheDir = (cacheVariable == NULL ? tempDir : string(cacheVariable));
this->tempDir = tempDir+"/";
cacheDir = cacheDir+"/";
contextIndex = platformData.contexts.size();
string errorMessage = "Error initializing Context";
if (originalContext == NULL) {
isLinkedContext = false;
int numDevices;
CHECK_RESULT(hipGetDeviceCount(&numDevices));
if (deviceIndex < -1 || deviceIndex >= numDevices)
throw OpenMMException("Illegal value for DeviceIndex: "+intToString(deviceIndex));
vector<int> devicePrecedence;
if (deviceIndex == -1) {
devicePrecedence = getDevicePrecedence();
} else {
devicePrecedence.push_back(deviceIndex);
}
this->deviceIndex = -1;
for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
int trialDeviceIndex = devicePrecedence[i];
CHECK_RESULT(hipDeviceGet(&device, trialDeviceIndex));
defaultOptimizationOptions = "-ffast-math -Wall";
// try setting device
if (hipSetDevice(device) == hipSuccess) {
// and set flags
unsigned int flags = hipDeviceMapHost;
if (useBlockingSync)
flags += hipDeviceScheduleBlockingSync;
else
flags += hipDeviceScheduleSpin;
if (hipSetDeviceFlags(flags) == hipSuccess) {
this->deviceIndex = trialDeviceIndex;
break;
}
}
}
if (this->deviceIndex == -1) {
if (deviceIndex != -1)
throw OpenMMException("The requested HIP device could not be loaded");
else
throw OpenMMException("No compatible HIP device is available");
}
}
else {
isLinkedContext = true;
this->deviceIndex = originalContext->deviceIndex;
this->device = originalContext->device;
}
hipDeviceProp_t props;
CHECK_RESULT(hipGetDeviceProperties(&props, device));
// set device properties
this->simdWidth = props.warpSize;
this->sharedMemPerBlock = props.sharedMemPerBlock;
this->hasGlobalInt64Atomics = props.arch.hasGlobalInt64Atomics;
this->hasDoubles = props.arch.hasDoubles;
// HIP-TODO: remove hasWarpShuffle==false paths completely?
this->hasWarpShuffle = true;
gpuArchitecture = props.gcnArchName;
// HIP-TODO: find a good value here
int numThreadBlocksPerComputeUnit = 6;
contextIsValid = true;
if (contextIndex > 0) {
int canAccess;
CHECK_RESULT(hipDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice()));
if (canAccess) {
platformData.contexts[0]->setAsCurrent();
CHECK_RESULT(hipDeviceEnablePeerAccess(getDevice(), 0));
setAsCurrent();
CHECK_RESULT(hipDeviceEnablePeerAccess(platformData.contexts[0]->getDevice(), 0));
}
}
numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
int multiprocessors;
CHECK_RESULT(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device));
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
// GCN hardware is more like CUDA-8, w/ no independent forward progress
// or *_sync primatives
compilationDefines["SYNC_WARPS"] = "";
compilationDefines["SHFL(var, srcLane)"] = "__shfl(var, srcLane);";
// Important: the predicate for ballot is defined as an integer, hence
// it is important that we convert the variable field to a true/false value
// before running ballot, such that we do not discard the top half when
// running on a long int
compilationDefines["BALLOT(var)"] = "__ballot((var) != 0);";
if (useDoublePrecision) {
posq.initialize<double4>(*this, paddedNumAtoms, "posq");
velm.initialize<double4>(*this, paddedNumAtoms, "velm");
compilationDefines["USE_DOUBLE_PRECISION"] = "1";
compilationDefines["make_real2"] = "make_double2";
compilationDefines["make_real3"] = "make_double3";
compilationDefines["make_real4"] = "make_double4";
compilationDefines["make_mixed2"] = "make_double2";
compilationDefines["make_mixed3"] = "make_double3";
compilationDefines["make_mixed4"] = "make_double4";
}
else if (useMixedPrecision) {
posq.initialize<float4>(*this, paddedNumAtoms, "posq");
posqCorrection.initialize<float4>(*this, paddedNumAtoms, "posqCorrection");
velm.initialize<double4>(*this, paddedNumAtoms, "velm");
compilationDefines["USE_MIXED_PRECISION"] = "1";
compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4";
compilationDefines["make_mixed2"] = "make_double2";
compilationDefines["make_mixed3"] = "make_double3";
compilationDefines["make_mixed4"] = "make_double4";
}
else {
posq.initialize<float4>(*this, paddedNumAtoms, "posq");
velm.initialize<float4>(*this, paddedNumAtoms, "velm");
compilationDefines["make_real2"] = "make_float2";
compilationDefines["make_real3"] = "make_float3";
compilationDefines["make_real4"] = "make_float4";
compilationDefines["make_mixed2"] = "make_float2";
compilationDefines["make_mixed3"] = "make_float3";
compilationDefines["make_mixed4"] = "make_float4";
}
force.initialize<long long>(*this, paddedNumAtoms*3, "force");
posCellOffsets.resize(paddedNumAtoms, mm_int4(0, 0, 0, 0));
atomIndexDevice.initialize<int>(*this, paddedNumAtoms, "atomIndex");
atomIndex.resize(paddedNumAtoms);
for (int i = 0; i < paddedNumAtoms; ++i)
atomIndex[i] = i;
atomIndexDevice.upload(atomIndex);
// Create utility kernels that are used in multiple places.
hipModule_t utilities = createModule(HipKernelSources::vectorOps+HipKernelSources::utilities);
clearBufferKernel = getKernel(utilities, "clearBuffer");
clearTwoBuffersKernel = getKernel(utilities, "clearTwoBuffers");
clearThreeBuffersKernel = getKernel(utilities, "clearThreeBuffers");
clearFourBuffersKernel = getKernel(utilities, "clearFourBuffers");
clearFiveBuffersKernel = getKernel(utilities, "clearFiveBuffers");
clearSixBuffersKernel = getKernel(utilities, "clearSixBuffers");
reduceEnergyKernel = getKernel(utilities, "reduceEnergy");
setChargesKernel = getKernel(utilities, "setCharges");
// Set defines based on the requested precision.
compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "sqrtf";
compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "rsqrtf";
compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf";
compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf";
compilationDefines["ACOS"] = useDoublePrecision ? "acos" : "acosf";
compilationDefines["ASIN"] = useDoublePrecision ? "asin" : "asinf";
compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
// Set defines for applying periodic boundary conditions.
Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 ||
boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0);
if (boxIsTriclinic) {
compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
"{"
"real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f); \\\n"
"delta.x -= scale3*periodicBoxVecZ.x; \\\n"
"delta.y -= scale3*periodicBoxVecZ.y; \\\n"
"delta.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f); \\\n"
"delta.x -= scale2*periodicBoxVecY.x; \\\n"
"delta.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f); \\\n"
"delta.x -= scale1*periodicBoxVecX.x;}";
compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
"{"
"real scale3 = floor(pos.z*invPeriodicBoxSize.z); \\\n"
"pos.x -= scale3*periodicBoxVecZ.x; \\\n"
"pos.y -= scale3*periodicBoxVecZ.y; \\\n"
"pos.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor(pos.y*invPeriodicBoxSize.y); \\\n"
"pos.x -= scale2*periodicBoxVecY.x; \\\n"
"pos.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor(pos.x*invPeriodicBoxSize.x); \\\n"
"pos.x -= scale1*periodicBoxVecX.x;}";
compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
"{"
"real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f); \\\n"
"pos.x -= scale3*periodicBoxVecZ.x; \\\n"
"pos.y -= scale3*periodicBoxVecZ.y; \\\n"
"pos.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f); \\\n"
"pos.x -= scale2*periodicBoxVecY.x; \\\n"
"pos.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f); \\\n"
"pos.x -= scale1*periodicBoxVecX.x;}";
}
else {
compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
"{"
"delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
"delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
"delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
"{"
"pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; \\\n"
"pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; \\\n"
"pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;}";
compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
"{"
"pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
"pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
"pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
}
// Create utilities objects.
bonded = new HipBondedUtilities(*this);
nonbonded = new HipNonbondedUtilities(*this);
integration = new HipIntegrationUtilities(*this, system);
expression = new HipExpressionUtilities(*this);
}
HipContext::~HipContext() {
setAsCurrent();
for (auto force : forces)
delete force;
for (auto listener : reorderListeners)
delete listener;
for (auto computation : preComputations)
delete computation;
for (auto computation : postComputations)
delete computation;
if (pinnedBuffer != NULL)
hipHostFree(pinnedBuffer);
if (integration != NULL)
delete integration;
if (expression != NULL)
delete expression;
if (bonded != NULL)
delete bonded;
if (nonbonded != NULL)
delete nonbonded;
contextIsValid = false;
}
void HipContext::initialize() {
hipSetDevice(device);
string errorMessage = "Error initializing Context";
int numEnergyBuffers = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) {
energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<double>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
}
else if (useMixedPrecision) {
energyBuffer.initialize<double>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<double>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
}
else {
energyBuffer.initialize<float>(*this, numEnergyBuffers, "energyBuffer");
energySum.initialize<float>(*this, 1, "energySum");
int pinnedBufferSize = max(paddedNumAtoms*6, numEnergyBuffers);
CHECK_RESULT(hipHostMalloc(&pinnedBuffer, pinnedBufferSize*sizeof(float), 0));
}
for (int i = 0; i < numAtoms; i++) {
double mass = system.getParticleMass(i);
if (useDoublePrecision || useMixedPrecision)
((double4*) pinnedBuffer)[i] = make_double4(0.0, 0.0, 0.0, mass == 0.0 ? 0.0 : 1.0/mass);
else
((float4*) pinnedBuffer)[i] = make_float4(0.0f, 0.0f, 0.0f, mass == 0.0 ? 0.0f : (float) (1.0/mass));
}
velm.upload(pinnedBuffer);
bonded->initialize(system);
addAutoclearBuffer(force.getDevicePointer(), force.getSize()*force.getElementSize());
addAutoclearBuffer(energyBuffer.getDevicePointer(), energyBuffer.getSize()*energyBuffer.getElementSize());
int numEnergyParamDerivs = energyParamDerivNames.size();
if (numEnergyParamDerivs > 0) {
if (useDoublePrecision || useMixedPrecision)
energyParamDerivBuffer.initialize<double>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
else
energyParamDerivBuffer.initialize<float>(*this, numEnergyParamDerivs*numEnergyBuffers, "energyParamDerivBuffer");
addAutoclearBuffer(energyParamDerivBuffer);
}
findMoleculeGroups();
nonbonded->initialize(system);
}
void HipContext::initializeContexts() {
getPlatformData().initializeContexts(system);
}
void HipContext::setAsCurrent() {
if (contextIsValid)
hipSetDevice(device);
}
hipModule_t HipContext::createModule(const string source, const char* optimizationFlags) {
return createModule(source, map<string, string>(), optimizationFlags);
}
hipModule_t HipContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) {
static_assert(8*sizeof(void*) == HipContext::TileSize);
string bits = intToString(8*sizeof(void*));
string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags));
stringstream src;
if (!options.empty())
src << "// Compilation Options: " << options << endl << endl;
for (auto& pair : compilationDefines) {
// Query defines to avoid duplicate variables
if (defines.find(pair.first) == defines.end()) {
src << "#define " << pair.first;
if (!pair.second.empty())
src << " " << pair.second;
src << endl;
}
}
if (!compilationDefines.empty())
src << endl;
// include the main header for built-in variables (threadIdx etc.) and functions
src << "#include \"hip/hip_runtime.h\"\n";
// include the vector types
src << "#include \"hip/hip_vector_types.h\"\n";
if (useDoublePrecision) {
src << "typedef double real;\n";
src << "typedef double2 real2;\n";
src << "typedef double3 real3;\n";
src << "typedef double4 real4;\n";
}
else {
src << "typedef float real;\n";
src << "typedef float2 real2;\n";
src << "typedef float3 real3;\n";
src << "typedef float4 real4;\n";
}
if (useDoublePrecision || useMixedPrecision) {
src << "typedef double mixed;\n";
src << "typedef double2 mixed2;\n";
src << "typedef double3 mixed3;\n";
src << "typedef double4 mixed4;\n";
}
else {
src << "typedef float mixed;\n";
src << "typedef float2 mixed2;\n";
src << "typedef float3 mixed3;\n";
src << "typedef float4 mixed4;\n";
}
src << "typedef unsigned long tileflags;\n";
src << "static_assert(sizeof(tileflags)*8==" << HipContext::TileSize << ",\"tileflags size does not match TILE_SIZE\");\n";
src << HipKernelSources::common << endl;
for (auto& pair : defines) {
src << "#define " << pair.first;
if (!pair.second.empty())
src << " " << pair.second;
src << endl;
}
if (!defines.empty())
src << endl;
src << source << endl;
// See whether we already have PTX for this kernel cached.
CSHA1 sha1;
sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
sha1.Final();
UINT_8 hash[20];
sha1.GetHash(hash);
stringstream cacheFile;
cacheFile << cacheDir;
cacheFile.flags(ios::hex);
for (int i = 0; i < 20; i++)
cacheFile << setw(2) << setfill('0') << (int) hash[i];
cacheFile << '_' << gpuArchitecture << '_' << bits;
hipModule_t module;
if (hipModuleLoad(&module, cacheFile.str().c_str()) == hipSuccess)
return module;
// Select names for the various temporary files.
stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
tempFileName << "_" << getpid();
string inputFile = (tempDir+tempFileName.str()+".hip.cpp");
string outputFile = (tempDir+tempFileName.str()+".hsaco");
string logFile = (tempDir+tempFileName.str()+".log");
int res = 0;
// If the runtime compiler plugin is available, use it.
if (hasCompilerKernel) {
string ptx = compilerKernel.getAs<HipCompilerKernel>().createModule(src.str(), options, *this);
// If possible, write the PTX out to a temporary file so we can cache it for later use.
bool wroteCache = false;
try {
ofstream out(outputFile.c_str());
out << ptx;
out.close();
if (!out.fail())
wroteCache = true;
}
catch (...) {
// Ignore.
}
if (!wroteCache) {
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2(hipModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading HIP module");
return module;
}
}
else {
// Write out the source to a temporary file.
ofstream out(inputFile.c_str());
out << src.str();
out.close();
string command = compiler + " --genco --amdgpu-target=" + gpuArchitecture + " " + options + " -o \""+outputFile+"\" " + " \""+inputFile+"\" 2> \""+logFile+"\"";
res = std::system(command.c_str());
}
try {
if (res != 0) {
// Load the error log.
stringstream error;
error << "Error launching HIP compiler: " << res;
ifstream log(logFile.c_str());
if (log.is_open()) {
string line;
while (!log.eof()) {
getline(log, line);
error << '\n' << line;
}
log.close();
}
throw OpenMMException(error.str());
}
hipError_t result = hipModuleLoad(&module, outputFile.c_str());
if (result != hipSuccess) {
std::stringstream m;
m<<"Error loading HIP module: "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(m.str());
}
const char* save = getenv("OPENMM_SAVE_TEMPS");
bool savetemps = save != nullptr;
if (!savetemps) {
remove(inputFile.c_str());
remove(logFile.c_str());
}
if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0 && !savetemps)
remove(outputFile.c_str());
return module;
}
catch (...) {
const char* save = getenv("OPENMM_SAVE_TEMPS");
bool savetemps = save != nullptr;
if (!savetemps) {
remove(inputFile.c_str());
remove(outputFile.c_str());
remove(logFile.c_str());
}
throw;
}
}
hipFunction_t HipContext::getKernel(hipModule_t& module, const string& name) {
hipFunction_t function;
hipError_t result = hipModuleGetFunction(&function, module, name.c_str());
if (result != hipSuccess) {
std::stringstream m;
m<<"Error creating kernel "<<name<<": "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(m.str());
}
return function;
}
hipStream_t HipContext::getCurrentStream() {
return currentStream;
}
void HipContext::setCurrentStream(hipStream_t stream) {
currentStream = stream;
}
void HipContext::restoreDefaultStream() {
setCurrentStream(0);
}
HipArray* HipContext::createArray() {
return new HipArray();
}
ComputeEvent HipContext::createEvent() {
return shared_ptr<ComputeEventImpl>(new HipEvent(*this));
}
ComputeProgram HipContext::compileProgram(const std::string source, const std::map<std::string, std::string>& defines) {
hipModule_t module = createModule(HipKernelSources::vectorOps+source, defines);
return shared_ptr<ComputeProgramImpl>(new HipProgram(*this, module));
}
HipArray& HipContext::unwrap(ArrayInterface& array) const {
HipArray* cuarray;
ComputeArray* wrapper = dynamic_cast<ComputeArray*>(&array);
if (wrapper != NULL)
cuarray = dynamic_cast<HipArray*>(&wrapper->getArray());
else
cuarray = dynamic_cast<HipArray*>(&array);
if (cuarray == NULL)
throw OpenMMException("Array argument is not an HipArray");
return *cuarray;
}
std::string HipContext::getErrorString(hipError_t result) {
return string(hipGetErrorName(result));
}
void HipContext::executeKernel(hipFunction_t kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
if (blockSize == -1)
blockSize = ThreadBlockSize;
int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks);
hipError_t result = hipModuleLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
if (result != hipSuccess) {
stringstream str;
str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
int HipContext::computeThreadBlockSize(double memory) const {
int maxShared = this->sharedMemPerBlock;
int max = (int) (maxShared/memory);
if (max < HipContext::ThreadBlockSize) {
throw OpenMMException("Too much shared memory requested!");
}
int threads = this->simdWidth;
while (threads+this->simdWidth < max)
threads += this->simdWidth;
return threads;
}
void HipContext::clearBuffer(ArrayInterface& array) {
clearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
}
void HipContext::clearBuffer(hipDeviceptr_t memory, int size) {
int words = size/4;
void* args[] = {&memory, &words};
executeKernel(clearBufferKernel, args, words, 4 * this->simdWidth);
}
void HipContext::addAutoclearBuffer(ArrayInterface& array) {
addAutoclearBuffer(unwrap(array).getDevicePointer(), array.getSize()*array.getElementSize());
}
void HipContext::addAutoclearBuffer(hipDeviceptr_t memory, int size) {
autoclearBuffers.push_back(memory);
autoclearBufferSizes.push_back(size/4);
}
void HipContext::clearAutoclearBuffers() {
int preferredTBSize = this->simdWidth * 4;
int base = 0;
int total = autoclearBufferSizes.size();
while (total-base >= 6) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
&autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
&autoclearBuffers[base+4], &autoclearBufferSizes[base+4],
&autoclearBuffers[base+5], &autoclearBufferSizes[base+5]};
executeKernel(clearSixBuffersKernel, args, max(max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), autoclearBufferSizes[base+5]), preferredTBSize);
base += 6;
}
if (total-base == 5) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
&autoclearBuffers[base+3], &autoclearBufferSizes[base+3],
&autoclearBuffers[base+4], &autoclearBufferSizes[base+4]};
executeKernel(clearFiveBuffersKernel, args, max(max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), autoclearBufferSizes[base+4]), preferredTBSize);
}
else if (total-base == 4) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2],
&autoclearBuffers[base+3], &autoclearBufferSizes[base+3]};
executeKernel(clearFourBuffersKernel, args, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), preferredTBSize);
}
else if (total-base == 3) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1],
&autoclearBuffers[base+2], &autoclearBufferSizes[base+2]};
executeKernel(clearThreeBuffersKernel, args, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), preferredTBSize);
}
else if (total-base == 2) {
void* args[] = {&autoclearBuffers[base], &autoclearBufferSizes[base],
&autoclearBuffers[base+1], &autoclearBufferSizes[base+1]};
executeKernel(clearTwoBuffersKernel, args, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), preferredTBSize);
}
else if (total-base == 1) {
clearBuffer(autoclearBuffers[base], autoclearBufferSizes[base]*4);
}
}
double HipContext::reduceEnergy() {
int bufferSize = energyBuffer.getSize();
int workGroupSize = 512;
void* args[] = {&energyBuffer.getDevicePointer(), &energySum.getDevicePointer(), &bufferSize, &workGroupSize};
executeKernel(reduceEnergyKernel, args, workGroupSize, workGroupSize, workGroupSize*energyBuffer.getElementSize());
energySum.download(pinnedBuffer);
if (getUseDoublePrecision() || getUseMixedPrecision())
return *((double*) pinnedBuffer);
else
return *((float*) pinnedBuffer);
}
void HipContext::setCharges(const vector<double>& charges) {
if (!chargeBuffer.isInitialized())
chargeBuffer.initialize(*this, numAtoms, useDoublePrecision ? sizeof(double) : sizeof(float), "chargeBuffer");
vector<double> c(numAtoms);
for (int i = 0; i < numAtoms; i++)
c[i] = charges[i];
chargeBuffer.upload(c, true);
void* args[] = {&chargeBuffer.getDevicePointer(), &posq.getDevicePointer(), &atomIndexDevice.getDevicePointer(), &numAtoms};
executeKernel(setChargesKernel, args, numAtoms);
}
bool HipContext::requestPosqCharges() {
bool allow = !hasAssignedPosqCharges;
hasAssignedPosqCharges = true;
return allow;
}
void HipContext::addEnergyParameterDerivative(const string& param) {
// See if this parameter has already been registered.
for (int i = 0; i < energyParamDerivNames.size(); i++)
if (param == energyParamDerivNames[i])
return;
energyParamDerivNames.push_back(param);
}
void HipContext::flushQueue() {
hipStreamSynchronize(getCurrentStream());
}
vector<int> HipContext::getDevicePrecedence() {
int numDevices;
hipDeviceProp_t thisDevice;
string errorMessage = "Error initializing Context";
vector<pair<int, int> > devices;
CHECK_RESULT(hipGetDeviceCount(&numDevices));
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(hipGetDeviceProperties(&thisDevice, i));
int clock, multiprocessors, speed;
clock = thisDevice.clockRate;
multiprocessors = thisDevice.multiProcessorCount;
speed = clock*multiprocessors;
devices.push_back(std::make_pair(speed, -i));
}
// sort first by speed (higher is better), and finally device index (lower is better)
std::sort(devices.begin(), devices.end());
std::reverse(devices.begin(), devices.end());
vector<int> precedence;
for (int i = 0; i < static_cast<int>(devices.size()); i++) {
precedence.push_back(-devices[i].second);
}
return precedence;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipEvent.h"
#include "openmm/OpenMMException.h"
using namespace OpenMM;
HipEvent::HipEvent(HipContext& context) : context(context), eventCreated(false) {
hipError_t result = hipEventCreateWithFlags(&event, hipEventDisableTiming);
if (result != hipSuccess)
throw OpenMMException("Error creating HIP event:"+HipContext::getErrorString(result));
eventCreated = true;
}
HipEvent::~HipEvent() {
if (eventCreated)
hipEventDestroy(event);
}
void HipEvent::enqueue() {
hipEventRecord(event, 0);
}
void HipEvent::wait() {
hipEventSynchronize(event);
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipFFT3D.h"
#include "HipContext.h"
#include "HipKernelSources.h"
#include "SimTKOpenMMRealType.h"
#include <map>
#include <sstream>
#include <string>
using namespace OpenMM;
using namespace std;
HipFFT3D::HipFFT3D(HipContext& context, int xsize, int ysize, int zsize, bool realToComplex) :
context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
packRealAsComplex = false;
int packedXSize = xsize;
int packedYSize = ysize;
int packedZSize = zsize;
if (realToComplex) {
// If any axis size is even, we can pack the real values into a complex grid that is only half as large.
// Look for an appropriate axis.
packRealAsComplex = true;
int packedAxis, bufferSize;
if (xsize%2 == 0) {
packedAxis = 0;
packedXSize /= 2;
bufferSize = packedXSize;
}
else if (ysize%2 == 0) {
packedAxis = 1;
packedYSize /= 2;
bufferSize = packedYSize;
}
else if (zsize%2 == 0) {
packedAxis = 2;
packedZSize /= 2;
bufferSize = packedZSize;
}
else
packRealAsComplex = false;
if (packRealAsComplex) {
// Build the kernels for packing and unpacking the data.
map<string, string> defines;
defines["XSIZE"] = context.intToString(xsize);
defines["YSIZE"] = context.intToString(ysize);
defines["ZSIZE"] = context.intToString(zsize);
defines["PACKED_AXIS"] = context.intToString(packedAxis);
defines["PACKED_XSIZE"] = context.intToString(packedXSize);
defines["PACKED_YSIZE"] = context.intToString(packedYSize);
defines["PACKED_ZSIZE"] = context.intToString(packedZSize);
defines["M_PI"] = context.doubleToString(M_PI);
hipModule_t module = context.createModule(HipKernelSources::vectorOps+HipKernelSources::fftR2C, defines);
packForwardKernel = context.getKernel(module, "packForwardData");
unpackForwardKernel = context.getKernel(module, "unpackForwardData");
packBackwardKernel = context.getKernel(module, "packBackwardData");
unpackBackwardKernel = context.getKernel(module, "unpackBackwardData");
}
}
bool inputIsReal = (realToComplex && !packRealAsComplex);
zkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, true, inputIsReal);
xkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, true, inputIsReal);
ykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, true, inputIsReal);
invzkernel = createKernel(packedXSize, packedYSize, packedZSize, zthreads, 0, false, inputIsReal);
invxkernel = createKernel(packedYSize, packedZSize, packedXSize, xthreads, 1, false, inputIsReal);
invykernel = createKernel(packedZSize, packedXSize, packedYSize, ythreads, 2, false, inputIsReal);
}
void HipFFT3D::execFFT(HipArray& in, HipArray& out, bool forward) {
hipFunction_t kernel1 = (forward ? zkernel : invzkernel);
hipFunction_t kernel2 = (forward ? xkernel : invxkernel);
hipFunction_t kernel3 = (forward ? ykernel : invykernel);
void* args1[] = {&in.getDevicePointer(), &out.getDevicePointer()};
void* args2[] = {&out.getDevicePointer(), &in.getDevicePointer()};
if (packRealAsComplex) {
hipFunction_t packKernel = (forward ? packForwardKernel : packBackwardKernel);
hipFunction_t unpackKernel = (forward ? unpackForwardKernel : unpackBackwardKernel);
int gridSize = xsize*ysize*zsize/2;
// Pack the data into a half sized grid.
context.executeKernel(packKernel, args1, gridSize, 128);
// Perform the FFT.
context.executeKernel(kernel1, args2, gridSize, zthreads);
context.executeKernel(kernel2, args1, gridSize, xthreads);
context.executeKernel(kernel3, args2, gridSize, ythreads);
// Unpack the data.
context.executeKernel(unpackKernel, args1, gridSize, 128);
}
else {
context.executeKernel(kernel1, args1, xsize*ysize*zsize, zthreads);
context.executeKernel(kernel2, args2, xsize*ysize*zsize, xthreads);
context.executeKernel(kernel3, args1, xsize*ysize*zsize, ythreads);
}
}
int HipFFT3D::findLegalDimension(int minimum) {
if (minimum < 1)
return 1;
while (true) {
// Attempt to factor the current value.
int unfactored = minimum;
for (int factor = 2; factor < 8; factor++) {
while (unfactored > 1 && unfactored%factor == 0)
unfactored /= factor;
}
if (unfactored == 1)
return minimum;
minimum++;
}
}
static int getSmallestRadix(int size) {
int minRadix = 1;
int unfactored = size;
while (unfactored%7 == 0) {
minRadix = 7;
unfactored /= 7;
}
while (unfactored%5 == 0) {
minRadix = 5;
unfactored /= 5;
}
while (unfactored%4 == 0) {
minRadix = 4;
unfactored /= 4;
}
while (unfactored%3 == 0) {
minRadix = 3;
unfactored /= 3;
}
while (unfactored%2 == 0) {
minRadix = 2;
unfactored /= 2;
}
return minRadix;
}
hipFunction_t HipFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads, int axis, bool forward, bool inputIsReal) {
int maxThreads = (context.getUseDoublePrecision() ? 128 : 256);
// while (maxThreads > 128 && maxThreads-64 >= zsize)
// maxThreads -= 64;
int threadsPerBlock = zsize/getSmallestRadix(zsize);
stringstream source;
int blocksPerGroup = max(1, maxThreads/threadsPerBlock);
int stage = 0;
int L = zsize;
int m = 1;
// Factor zsize, generating an appropriate block of code for each factor.
while (L > 1) {
int input = stage%2;
int output = 1-input;
int radix;
if (L%7 == 0)
radix = 7;
else if (L%5 == 0)
radix = 5;
else if (L%4 == 0)
radix = 4;
else if (L%3 == 0)
radix = 3;
else if (L%2 == 0)
radix = 2;
else
throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
source<<"{\n";
L = L/radix;
source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
if (L*m < threadsPerBlock)
source<<"if (threadIdx.x < "<<(blocksPerGroup*L*m)<<") {\n";
else
source<<"{\n";
source<<"int block = threadIdx.x/"<<(L*m)<<";\n";
source<<"int i = threadIdx.x-block*"<<(L*m)<<";\n";
source<<"int base = i+block*"<<zsize<<";\n";
source<<"int j = i/"<<m<<";\n";
if (radix == 7) {
source<<"real2 c0 = data"<<input<<"[base];\n";
source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
source<<"real2 c5 = data"<<input<<"[base+"<<(5*L*m)<<"];\n";
source<<"real2 c6 = data"<<input<<"[base+"<<(6*L*m)<<"];\n";
source<<"real2 d0 = c1+c6;\n";
source<<"real2 d1 = c1-c6;\n";
source<<"real2 d2 = c2+c5;\n";
source<<"real2 d3 = c2-c5;\n";
source<<"real2 d4 = c4+c3;\n";
source<<"real2 d5 = c4-c3;\n";
source<<"real2 d6 = d2+d0;\n";
source<<"real2 d7 = d5+d3;\n";
source<<"real2 b0 = c0+d6+d4;\n";
source<<"real2 b1 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)+cos(6*M_PI/7))/3-1)<<"*(d6+d4);\n";
source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
source<<"real2 b5 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
source<<"real2 b6 = -(SIGN)*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
source<<"real2 b7 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
source<<"real2 b8 = -(SIGN)*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
source<<"real2 t0 = b0+b1;\n";
source<<"real2 t1 = b2+b3;\n";
source<<"real2 t2 = b4-b3;\n";
source<<"real2 t3 = -b2-b4;\n";
source<<"real2 t4 = b6+b7;\n";
source<<"real2 t5 = b8-b7;\n";
source<<"real2 t6 = -b8-b6;\n";
source<<"real2 t7 = t0+t1;\n";
source<<"real2 t8 = t0+t2;\n";
source<<"real2 t9 = t0+t3;\n";
source<<"real2 t10 = make_real2(t4.y+b5.y, -(t4.x+b5.x));\n";
source<<"real2 t11 = make_real2(t5.y+b5.y, -(t5.x+b5.x));\n";
source<<"real2 t12 = make_real2(t6.y+b5.y, -(t6.x+b5.x));\n";
source<<"data"<<output<<"[base+6*j*"<<m<<"] = b0;\n";
source<<"data"<<output<<"[base+(6*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(7*L)<<"], t7-t10);\n";
source<<"data"<<output<<"[base+(6*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(7*L)<<"], t9-t12);\n";
source<<"data"<<output<<"[base+(6*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(7*L)<<"], t8+t11);\n";
source<<"data"<<output<<"[base+(6*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(7*L)<<"], t8-t11);\n";
source<<"data"<<output<<"[base+(6*j+5)*"<<m<<"] = multiplyComplex(w[j*"<<(5*zsize)<<"/"<<(7*L)<<"], t9+t12);\n";
source<<"data"<<output<<"[base+(6*j+6)*"<<m<<"] = multiplyComplex(w[j*"<<(6*zsize)<<"/"<<(7*L)<<"], t7+t10);\n";
}
else if (radix == 5) {
source<<"real2 c0 = data"<<input<<"[base];\n";
source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
source<<"real2 d0 = c1+c4;\n";
source<<"real2 d1 = c2+c3;\n";
source<<"real2 d2 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n";
source<<"real2 d3 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c2-c3);\n";
source<<"real2 d4 = d0+d1;\n";
source<<"real2 d5 = "<<context.doubleToString(0.25*sqrt(5.0))<<"*(d0-d1);\n";
source<<"real2 d6 = c0-0.25f*d4;\n";
source<<"real2 d7 = d6+d5;\n";
source<<"real2 d8 = d6-d5;\n";
string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
source<<"real2 d9 = (SIGN)*make_real2(d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
source<<"real2 d10 = (SIGN)*make_real2("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
source<<"data"<<output<<"[base+(4*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(5*L)<<"], d8-d10);\n";
source<<"data"<<output<<"[base+(4*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(5*L)<<"], d7-d9);\n";
}
else if (radix == 4) {
source<<"real2 c0 = data"<<input<<"[base];\n";
source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
source<<"real2 d0 = c0+c2;\n";
source<<"real2 d1 = c0-c2;\n";
source<<"real2 d2 = c1+c3;\n";
source<<"real2 d3 = (SIGN)*make_real2(c1.y-c3.y, c3.x-c1.x);\n";
source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
source<<"data"<<output<<"[base+(3*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(4*L)<<"], d1-d3);\n";
}
else if (radix == 3) {
source<<"real2 c0 = data"<<input<<"[base];\n";
source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
source<<"real2 d0 = c1+c2;\n";
source<<"real2 d1 = c0-0.5f*d0;\n";
source<<"real2 d2 = (SIGN)*"<<context.doubleToString(sin(M_PI/3.0))<<"*make_real2(c1.y-c2.y, c2.x-c1.x);\n";
source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
}
else if (radix == 2) {
source<<"real2 c0 = data"<<input<<"[base];\n";
source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
source<<"data"<<output<<"[base+j*"<<m<<"] = c0+c1;\n";
source<<"data"<<output<<"[base+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
}
source<<"}\n";
m = m*radix;
source<<"__syncthreads();\n";
source<<"}\n";
++stage;
}
// Create the kernel.
bool outputIsReal = (inputIsReal && axis == 2 && !forward);
bool outputIsPacked = (inputIsReal && axis == 2 && forward);
string outputSuffix = (outputIsReal ? ".x" : "");
if (outputIsPacked)
source<<"if (index < XSIZE*YSIZE && x < XSIZE/2+1)\n";
else
source<<"if (index < XSIZE*YSIZE)\n";
source<<"for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)\n";
if (outputIsPacked)
source<<"out[y*(ZSIZE*(XSIZE/2+1))+i*(XSIZE/2+1)+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n";
else
source<<"out[y*(ZSIZE*XSIZE)+i*XSIZE+x] = data"<<(stage%2)<<"[i+block*ZSIZE]"<<outputSuffix<<";\n";
map<string, string> replacements;
replacements["XSIZE"] = context.intToString(xsize);
replacements["YSIZE"] = context.intToString(ysize);
replacements["ZSIZE"] = context.intToString(zsize);
replacements["BLOCKS_PER_GROUP"] = context.intToString(blocksPerGroup);
replacements["THREADS_PER_BLOCK"] = context.intToString(threadsPerBlock);
replacements["M_PI"] = context.doubleToString(M_PI);
replacements["COMPUTE_FFT"] = source.str();
replacements["SIGN"] = (forward ? "1" : "-1");
replacements["INPUT_TYPE"] = (inputIsReal && axis == 0 && forward ? "real" : "real2");
replacements["OUTPUT_TYPE"] = (outputIsReal ? "real" : "real2");
replacements["INPUT_IS_REAL"] = (inputIsReal && axis == 0 && forward ? "1" : "0");
replacements["INPUT_IS_PACKED"] = (inputIsReal && axis == 0 && !forward ? "1" : "0");
replacements["OUTPUT_IS_PACKED"] = (outputIsPacked ? "1" : "0");
hipModule_t module = context.createModule(HipKernelSources::vectorOps+context.replaceStrings(HipKernelSources::fft, replacements));
hipFunction_t kernel = context.getKernel(module, "execFFT");
threads = blocksPerGroup*threadsPerBlock;
return kernel;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipIntegrationUtilities.h"
#include "HipContext.h"
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result) CHECK_RESULT2(result, errorMessage);
#define CHECK_RESULT2(result, prefix) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<prefix<<": "<<dynamic_cast<HipContext&>(context).getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
HipIntegrationUtilities::HipIntegrationUtilities(HipContext& context, const System& system) : IntegrationUtilities(context, system),
ccmaConvergedMemory(NULL) {
CHECK_RESULT2(hipEventCreateWithFlags(&ccmaEvent, hipEventDisableTiming), "Error creating event for CCMA");
CHECK_RESULT2(hipHostMalloc((void**) &ccmaConvergedMemory, sizeof(int), hipHostMallocMapped), "Error allocating pinned memory");
CHECK_RESULT2(hipHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
}
HipIntegrationUtilities::~HipIntegrationUtilities() {
context.setAsCurrent();
if (ccmaConvergedMemory != NULL) {
hipHostFree(ccmaConvergedMemory);
hipEventDestroy(ccmaEvent);
}
}
HipArray& HipIntegrationUtilities::getPosDelta() {
return dynamic_cast<HipContext&>(context).unwrap(posDelta);
}
HipArray& HipIntegrationUtilities::getRandom() {
return dynamic_cast<HipContext&>(context).unwrap(random);
}
HipArray& HipIntegrationUtilities::getStepSize() {
return dynamic_cast<HipContext&>(context).unwrap(stepSize);
}
void HipIntegrationUtilities::applyConstraintsImpl(bool constrainVelocities, double tol) {
ComputeKernel settleKernel, shakeKernel, ccmaForceKernel;
if (constrainVelocities) {
settleKernel = settleVelKernel;
shakeKernel = shakeVelKernel;
ccmaForceKernel = ccmaVelForceKernel;
}
else {
settleKernel = settlePosKernel;
shakeKernel = shakePosKernel;
ccmaForceKernel = ccmaPosForceKernel;
}
if (settleAtoms.isInitialized()) {
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
settleKernel->setArg(1, tol);
else
settleKernel->setArg(1, (float) tol);
settleKernel->execute(settleAtoms.getSize());
}
if (shakeAtoms.isInitialized()) {
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
shakeKernel->setArg(1, tol);
else
shakeKernel->setArg(1, (float) tol);
shakeKernel->execute(shakeAtoms.getSize());
}
if (ccmaConstraintAtoms.isInitialized()) {
if (ccmaConstraintAtoms.getSize() <= 1024) {
// Use the version of CCMA that runs in a single kernel with one workgroup.
ccmaFullKernel->setArg(0, (int) constrainVelocities);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
ccmaFullKernel->setArg(14, tol);
else
ccmaFullKernel->setArg(14, (float) tol);
ccmaFullKernel->execute(128, 128);
}
else {
ccmaForceKernel->setArg(6, ccmaConvergedDeviceMemory);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
ccmaForceKernel->setArg(7, tol);
else
ccmaForceKernel->setArg(7, (float) tol);
ccmaDirectionsKernel->execute(ccmaConstraintAtoms.getSize());
const int checkInterval = 4;
ccmaConvergedMemory[0] = 0;
ccmaUpdateKernel->setArg(4, constrainVelocities ? context.getVelm() : posDelta);
for (int i = 0; i < 150; i++) {
ccmaForceKernel->setArg(8, i);
ccmaForceKernel->execute(ccmaConstraintAtoms.getSize());
if ((i+1)%checkInterval == 0)
CHECK_RESULT2(hipEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
ccmaMultiplyKernel->setArg(5, i);
ccmaMultiplyKernel->execute(ccmaConstraintAtoms.getSize());
ccmaUpdateKernel->setArg(9, i);
ccmaUpdateKernel->execute(context.getNumAtoms());
if ((i+1)%checkInterval == 0) {
CHECK_RESULT2(hipEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
if (ccmaConvergedMemory[0])
break;
}
}
}
}
}
void HipIntegrationUtilities::distributeForcesFromVirtualSites() {
if (numVsites > 0) {
vsiteForceKernel->setArg(2, context.getLongForceBuffer());
vsiteForceKernel->execute(numVsites);
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipKernel.h"
#include "openmm/common/ComputeArray.h"
#include "openmm/internal/AssertionUtilities.h"
#include <cstring>
#include <vector>
using namespace OpenMM;
using namespace std;
HipKernel::HipKernel(HipContext& context, hipFunction_t kernel, const string& name) : context(context), kernel(kernel), name(name) {
}
string HipKernel::getName() const {
return name;
}
int HipKernel::getMaxBlockSize() const {
int size;
hipError_t result = hipFuncGetAttribute(&size, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
if (result != hipSuccess)
throw OpenMMException("Error querying max thread block size: "+context.getErrorString(result));
return size;
}
void HipKernel::execute(int threads, int blockSize) {
int numArgs = arrayArgs.size();
argPointers.resize(numArgs);
for (int i = 0; i < numArgs; i++) {
if (arrayArgs[i] != NULL)
argPointers[i] = &arrayArgs[i]->getDevicePointer();
else
argPointers[i] = &primitiveArgs[i];
}
context.executeKernel(kernel, argPointers.data(), threads, blockSize);
}
void HipKernel::addArrayArg(ArrayInterface& value) {
int index = arrayArgs.size();
addEmptyArg();
setArrayArg(index, value);
}
void HipKernel::addPrimitiveArg(const void* value, int size) {
int index = arrayArgs.size();
addEmptyArg();
setPrimitiveArg(index, value, size);
}
void HipKernel::addEmptyArg() {
primitiveArgs.push_back(make_double4(0, 0, 0, 0));
arrayArgs.push_back(NULL);
}
void HipKernel::setArrayArg(int index, ArrayInterface& value) {
ASSERT_VALID_INDEX(index, arrayArgs);
arrayArgs[index] = &context.unwrap(value);
}
void HipKernel::setPrimitiveArg(int index, const void* value, int size) {
ASSERT_VALID_INDEX(index, primitiveArgs);
if (size > sizeof(double4))
throw OpenMMException("Unsupported value type for kernel argument");
memcpy(&primitiveArgs[index], value, size);
arrayArgs[index] = NULL;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipKernelFactory.h"
#include "HipKernels.h"
#include "HipParallelKernels.h"
#include "HipPlatform.h"
#include "openmm/common/CommonKernels.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
using namespace OpenMM;
KernelImpl* HipKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
HipPlatform::PlatformData& data = *static_cast<HipPlatform::PlatformData*>(context.getPlatformData());
if (data.contexts.size() > 1) {
// We are running in parallel on multiple devices, so we may want to create a parallel kernel.
if (name == CalcForcesAndEnergyKernel::Name())
return new HipParallelCalcForcesAndEnergyKernel(name, platform, data);
if (name == CalcHarmonicBondForceKernel::Name())
return new HipParallelCalcHarmonicBondForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomBondForceKernel::Name())
return new HipParallelCalcCustomBondForceKernel(name, platform, data, context.getSystem());
if (name == CalcHarmonicAngleForceKernel::Name())
return new HipParallelCalcHarmonicAngleForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomAngleForceKernel::Name())
return new HipParallelCalcCustomAngleForceKernel(name, platform, data, context.getSystem());
if (name == CalcPeriodicTorsionForceKernel::Name())
return new HipParallelCalcPeriodicTorsionForceKernel(name, platform, data, context.getSystem());
if (name == CalcRBTorsionForceKernel::Name())
return new HipParallelCalcRBTorsionForceKernel(name, platform, data, context.getSystem());
if (name == CalcCMAPTorsionForceKernel::Name())
return new HipParallelCalcCMAPTorsionForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomTorsionForceKernel::Name())
return new HipParallelCalcCustomTorsionForceKernel(name, platform, data, context.getSystem());
if (name == CalcNonbondedForceKernel::Name())
return new HipParallelCalcNonbondedForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomNonbondedForceKernel::Name())
return new HipParallelCalcCustomNonbondedForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomExternalForceKernel::Name())
return new HipParallelCalcCustomExternalForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomHbondForceKernel::Name())
return new HipParallelCalcCustomHbondForceKernel(name, platform, data, context.getSystem());
if (name == CalcCustomCompoundBondForceKernel::Name())
return new HipParallelCalcCustomCompoundBondForceKernel(name, platform, data, context.getSystem());
}
HipContext& cu = *data.contexts[0];
if (name == CalcForcesAndEnergyKernel::Name())
return new HipCalcForcesAndEnergyKernel(name, platform, cu);
if (name == UpdateStateDataKernel::Name())
return new HipUpdateStateDataKernel(name, platform, cu);
if (name == ApplyConstraintsKernel::Name())
return new CommonApplyConstraintsKernel(name, platform, cu);
if (name == VirtualSitesKernel::Name())
return new CommonVirtualSitesKernel(name, platform, cu);
if (name == CalcHarmonicBondForceKernel::Name())
return new CommonCalcHarmonicBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomBondForceKernel::Name())
return new CommonCalcCustomBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcHarmonicAngleForceKernel::Name())
return new CommonCalcHarmonicAngleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomAngleForceKernel::Name())
return new CommonCalcCustomAngleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcPeriodicTorsionForceKernel::Name())
return new CommonCalcPeriodicTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcRBTorsionForceKernel::Name())
return new CommonCalcRBTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCMAPTorsionForceKernel::Name())
return new CommonCalcCMAPTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomTorsionForceKernel::Name())
return new CommonCalcCustomTorsionForceKernel(name, platform, cu, context.getSystem());
if (name == CalcNonbondedForceKernel::Name())
return new HipCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomNonbondedForceKernel::Name())
return new CommonCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
if (name == CalcGBSAOBCForceKernel::Name())
return new CommonCalcGBSAOBCForceKernel(name, platform, cu);
if (name == CalcCustomGBForceKernel::Name())
return new CommonCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomExternalForceKernel::Name())
return new CommonCalcCustomExternalForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomHbondForceKernel::Name())
return new CommonCalcCustomHbondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCentroidBondForceKernel::Name())
return new CommonCalcCustomCentroidBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCompoundBondForceKernel::Name())
return new CommonCalcCustomCompoundBondForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomCVForceKernel::Name())
return new HipCalcCustomCVForceKernel(name, platform, cu);
if (name == CalcRMSDForceKernel::Name())
return new CommonCalcRMSDForceKernel(name, platform, cu);
if (name == CalcCustomManyParticleForceKernel::Name())
return new CommonCalcCustomManyParticleForceKernel(name, platform, cu, context.getSystem());
if (name == CalcGayBerneForceKernel::Name())
return new CommonCalcGayBerneForceKernel(name, platform, cu);
if (name == IntegrateVerletStepKernel::Name())
return new CommonIntegrateVerletStepKernel(name, platform, cu);
if (name == IntegrateLangevinStepKernel::Name())
return new CommonIntegrateLangevinStepKernel(name, platform, cu);
if (name == IntegrateLangevinMiddleStepKernel::Name())
return new CommonIntegrateLangevinMiddleStepKernel(name, platform, cu);
if (name == IntegrateBrownianStepKernel::Name())
return new CommonIntegrateBrownianStepKernel(name, platform, cu);
if (name == IntegrateVariableVerletStepKernel::Name())
return new CommonIntegrateVariableVerletStepKernel(name, platform, cu);
if (name == IntegrateVariableLangevinStepKernel::Name())
return new CommonIntegrateVariableLangevinStepKernel(name, platform, cu);
if (name == IntegrateCustomStepKernel::Name())
return new CommonIntegrateCustomStepKernel(name, platform, cu);
if (name == ApplyAndersenThermostatKernel::Name())
return new CommonApplyAndersenThermostatKernel(name, platform, cu);
if (name == IntegrateNoseHooverStepKernel::Name())
return new CommonIntegrateNoseHooverStepKernel(name, platform, cu);
if (name == ApplyMonteCarloBarostatKernel::Name())
return new CommonApplyMonteCarloBarostatKernel(name, platform, cu);
if (name == RemoveCMMotionKernel::Name())
return new CommonRemoveCMMotionKernel(name, platform, cu);
throw OpenMMException((std::string("Tried to create kernel with illegal kernel name '")+name+"'").c_str());
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2012 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipKernelSources.h"
using namespace OpenMM;
using namespace std;
#ifndef OPENMM_HIPKERNELSOURCES_H_
#define OPENMM_HIPKERNELSOURCES_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2012 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#ifdef _MSC_VER
#error "Windows unsupported for HIP platform"
#endif
#include "openmm/common/windowsExportCommon.h"
#include <string>
namespace OpenMM {
/**
* This class is a central holding place for the source code of HIP kernels.
* The CMake build script inserts declarations into it based on the .hip files in the
* kernels subfolder.
*/
class OPENMM_EXPORT_COMMON HipKernelSources {
public:
@KERNEL_FILE_DECLARATIONS@
};
} // namespace OpenMM
#endif /*OPENMM_HIPKERNELSOURCES_H_*/
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipKernels.h"
#include "HipForceInfo.h"
#include "openmm/Context.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/NonbondedForceImpl.h"
#include "CommonKernelSources.h"
#include "HipBondedUtilities.h"
#include "HipExpressionUtilities.h"
#include "HipIntegrationUtilities.h"
#include "HipNonbondedUtilities.h"
#include "HipKernelSources.h"
#include "SimTKOpenMMRealType.h"
#include "SimTKOpenMMUtilities.h"
#include <algorithm>
#include <cmath>
#include <iterator>
#include <set>
#include <assert.h>
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result, prefix) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<prefix<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
void HipCalcForcesAndEnergyKernel::initialize(const System& system) {
}
void HipCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
cu.setForcesValid(true);
cu.setAsCurrent();
cu.clearAutoclearBuffers();
for (auto computation : cu.getPreComputations())
computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
HipNonbondedUtilities& nb = cu.getNonbondedUtilities();
cu.setComputeForceCount(cu.getComputeForceCount()+1);
nb.prepareInteractions(groups);
map<string, double>& derivs = cu.getEnergyParamDerivWorkspace();
for (auto& param : context.getParameters())
derivs[param.first] = 0;
}
double HipCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups, bool& valid) {
cu.setAsCurrent();
cu.getBondedUtilities().computeInteractions(groups);
cu.getNonbondedUtilities().computeInteractions(groups, includeForces, includeEnergy);
double sum = 0.0;
for (auto computation : cu.getPostComputations())
sum += computation->computeForceAndEnergy(includeForces, includeEnergy, groups);
cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
if (includeEnergy)
sum += cu.reduceEnergy();
if (!cu.getForcesValid())
valid = false;
return sum;
}
void HipUpdateStateDataKernel::initialize(const System& system) {
}
double HipUpdateStateDataKernel::getTime(const ContextImpl& context) const {
return cu.getTime();
}
void HipUpdateStateDataKernel::setTime(ContextImpl& context, double time) {
vector<HipContext*>& contexts = cu.getPlatformData().contexts;
for (auto ctx : contexts)
ctx->setTime(time);
}
void HipUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>& positions) {
cu.setAsCurrent();
int numParticles = context.getSystem().getNumParticles();
positions.resize(numParticles);
vector<float4> posCorrection;
if (cu.getUseDoublePrecision()) {
double4* posq = (double4*) cu.getPinnedBuffer();
cu.getPosq().download(posq);
}
else if (cu.getUseMixedPrecision()) {
float4* posq = (float4*) cu.getPinnedBuffer();
cu.getPosq().download(posq, false);
posCorrection.resize(numParticles);
cu.getPosqCorrection().download(posCorrection);
}
else {
float4* posq = (float4*) cu.getPinnedBuffer();
cu.getPosq().download(posq);
}
// Filling in the output array is done in parallel for speed.
cu.getPlatformData().threads.execute([&] (ThreadPool& threads, int threadIndex) {
// Compute the position of each particle to return to the user. This is done in parallel for speed.
const vector<int>& order = cu.getAtomIndex();
int numParticles = cu.getNumAtoms();
Vec3 boxVectors[3];
cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
int numThreads = threads.getNumThreads();
int start = threadIndex*numParticles/numThreads;
int end = (threadIndex+1)*numParticles/numThreads;
if (cu.getUseDoublePrecision()) {
double4* posq = (double4*) cu.getPinnedBuffer();
for (int i = start; i < end; ++i) {
double4 pos = posq[i];
mm_int4 offset = cu.getPosCellOffsets()[i];
positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
}
}
else if (cu.getUseMixedPrecision()) {
float4* posq = (float4*) cu.getPinnedBuffer();
for (int i = start; i < end; ++i) {
float4 pos1 = posq[i];
float4 pos2 = posCorrection[i];
mm_int4 offset = cu.getPosCellOffsets()[i];
positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x, (double)pos1.y+(double)pos2.y, (double)pos1.z+(double)pos2.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
}
}
else {
float4* posq = (float4*) cu.getPinnedBuffer();
for (int i = start; i < end; ++i) {
float4 pos = posq[i];
mm_int4 offset = cu.getPosCellOffsets()[i];
positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
}
}
});
cu.getPlatformData().threads.waitForThreads();
}
void HipUpdateStateDataKernel::setPositions(ContextImpl& context, const vector<Vec3>& positions) {
cu.setAsCurrent();
const vector<int>& order = cu.getAtomIndex();
int numParticles = context.getSystem().getNumParticles();
if (cu.getUseDoublePrecision()) {
double4* posq = (double4*) cu.getPinnedBuffer();
cu.getPosq().download(posq);
for (int i = 0; i < numParticles; ++i) {
double4& pos = posq[i];
const Vec3& p = positions[order[i]];
pos.x = p[0];
pos.y = p[1];
pos.z = p[2];
}
for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
posq[i] = make_double4(0.0, 0.0, 0.0, 0.0);
cu.getPosq().upload(posq);
}
else {
float4* posq = (float4*) cu.getPinnedBuffer();
cu.getPosq().download(posq);
for (int i = 0; i < numParticles; ++i) {
float4& pos = posq[i];
const Vec3& p = positions[order[i]];
pos.x = (float) p[0];
pos.y = (float) p[1];
pos.z = (float) p[2];
}
for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
posq[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
cu.getPosq().upload(posq);
}
if (cu.getUseMixedPrecision()) {
float4* posCorrection = (float4*) cu.getPinnedBuffer();
for (int i = 0; i < numParticles; ++i) {
float4& c = posCorrection[i];
const Vec3& p = positions[order[i]];
c.x = (float) (p[0]-(float)p[0]);
c.y = (float) (p[1]-(float)p[1]);
c.z = (float) (p[2]-(float)p[2]);
c.w = 0;
}
for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
posCorrection[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
cu.getPosqCorrection().upload(posCorrection);
}
for (auto& offset : cu.getPosCellOffsets())
offset = mm_int4(0, 0, 0, 0);
cu.reorderAtoms();
}
void HipUpdateStateDataKernel::getVelocities(ContextImpl& context, vector<Vec3>& velocities) {
cu.setAsCurrent();
const vector<int>& order = cu.getAtomIndex();
int numParticles = context.getSystem().getNumParticles();
velocities.resize(numParticles);
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double4* velm = (double4*) cu.getPinnedBuffer();
cu.getVelm().download(velm);
for (int i = 0; i < numParticles; ++i) {
double4 vel = velm[i];
mm_int4 offset = cu.getPosCellOffsets()[i];
velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
}
}
else {
float4* velm = (float4*) cu.getPinnedBuffer();
cu.getVelm().download(velm);
for (int i = 0; i < numParticles; ++i) {
float4 vel = velm[i];
mm_int4 offset = cu.getPosCellOffsets()[i];
velocities[order[i]] = Vec3(vel.x, vel.y, vel.z);
}
}
}
void HipUpdateStateDataKernel::setVelocities(ContextImpl& context, const vector<Vec3>& velocities) {
cu.setAsCurrent();
const vector<int>& order = cu.getAtomIndex();
int numParticles = context.getSystem().getNumParticles();
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double4* velm = (double4*) cu.getPinnedBuffer();
cu.getVelm().download(velm);
for (int i = 0; i < numParticles; ++i) {
double4& vel = velm[i];
const Vec3& p = velocities[order[i]];
vel.x = p[0];
vel.y = p[1];
vel.z = p[2];
}
for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
velm[i] = make_double4(0.0, 0.0, 0.0, 0.0);
cu.getVelm().upload(velm);
}
else {
float4* velm = (float4*) cu.getPinnedBuffer();
cu.getVelm().download(velm);
for (int i = 0; i < numParticles; ++i) {
float4& vel = velm[i];
const Vec3& p = velocities[order[i]];
vel.x = p[0];
vel.y = p[1];
vel.z = p[2];
}
for (int i = numParticles; i < cu.getPaddedNumAtoms(); i++)
velm[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
cu.getVelm().upload(velm);
}
}
void HipUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& forces) {
cu.setAsCurrent();
long long* force = (long long*) cu.getPinnedBuffer();
cu.getForce().download(force);
const vector<int>& order = cu.getAtomIndex();
int numParticles = context.getSystem().getNumParticles();
int paddedNumParticles = cu.getPaddedNumAtoms();
forces.resize(numParticles);
double scale = 1.0/(double) 0x100000000LL;
for (int i = 0; i < numParticles; ++i)
forces[order[i]] = Vec3(scale*force[i], scale*force[i+paddedNumParticles], scale*force[i+paddedNumParticles*2]);
}
void HipUpdateStateDataKernel::getEnergyParameterDerivatives(ContextImpl& context, map<string, double>& derivs) {
const vector<string>& paramDerivNames = cu.getEnergyParamDerivNames();
int numDerivs = paramDerivNames.size();
if (numDerivs == 0)
return;
derivs = cu.getEnergyParamDerivWorkspace();
HipArray& derivArray = cu.getEnergyParamDerivBuffer();
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
vector<double> derivBuffers;
derivArray.download(derivBuffers);
for (int i = numDerivs; i < derivArray.getSize(); i += numDerivs)
for (int j = 0; j < numDerivs; j++)
derivBuffers[j] += derivBuffers[i+j];
for (int i = 0; i < numDerivs; i++)
derivs[paramDerivNames[i]] += derivBuffers[i];
}
else {
vector<float> derivBuffers;
derivArray.download(derivBuffers);
for (int i = numDerivs; i < derivArray.getSize(); i += numDerivs)
for (int j = 0; j < numDerivs; j++)
derivBuffers[j] += derivBuffers[i+j];
for (int i = 0; i < numDerivs; i++)
derivs[paramDerivNames[i]] += derivBuffers[i];
}
}
void HipUpdateStateDataKernel::getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const {
cu.getPeriodicBoxVectors(a, b, c);
}
void HipUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) {
vector<HipContext*>& contexts = cu.getPlatformData().contexts;
// If any particles have been wrapped to the first periodic box, we need to unwrap them
// to avoid changing their positions.
vector<Vec3> positions;
for (auto& offset : cu.getPosCellOffsets()) {
if (offset.x != 0 || offset.y != 0 || offset.z != 0) {
getPositions(context, positions);
break;
}
}
// Update the vectors.
for (auto ctx : contexts)
ctx->setPeriodicBoxVectors(a, b, c);
if (positions.size() > 0)
setPositions(context, positions);
}
void HipUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
cu.setAsCurrent();
int version = 3;
stream.write((char*) &version, sizeof(int));
int precision = (cu.getUseDoublePrecision() ? 2 : cu.getUseMixedPrecision() ? 1 : 0);
stream.write((char*) &precision, sizeof(int));
double time = cu.getTime();
stream.write((char*) &time, sizeof(double));
int stepCount = cu.getStepCount();
stream.write((char*) &stepCount, sizeof(int));
int stepsSinceReorder = cu.getStepsSinceReorder();
stream.write((char*) &stepsSinceReorder, sizeof(int));
char* buffer = (char*) cu.getPinnedBuffer();
cu.getPosq().download(buffer);
stream.write(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
if (cu.getUseMixedPrecision()) {
cu.getPosqCorrection().download(buffer);
stream.write(buffer, cu.getPosqCorrection().getSize()*cu.getPosqCorrection().getElementSize());
}
cu.getVelm().download(buffer);
stream.write(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
stream.write((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
stream.write((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
Vec3 boxVectors[3];
cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
stream.write((char*) boxVectors, 3*sizeof(Vec3));
cu.getIntegrationUtilities().createCheckpoint(stream);
SimTKOpenMMUtilities::createCheckpoint(stream);
}
void HipUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& stream) {
cu.setAsCurrent();
int version;
stream.read((char*) &version, sizeof(int));
if (version != 3)
throw OpenMMException("Checkpoint was created with a different version of OpenMM");
int precision;
stream.read((char*) &precision, sizeof(int));
int expectedPrecision = (cu.getUseDoublePrecision() ? 2 : cu.getUseMixedPrecision() ? 1 : 0);
if (precision != expectedPrecision)
throw OpenMMException("Checkpoint was created with a different numeric precision");
double time;
stream.read((char*) &time, sizeof(double));
int stepCount, stepsSinceReorder;
stream.read((char*) &stepCount, sizeof(int));
stream.read((char*) &stepsSinceReorder, sizeof(int));
vector<HipContext*>& contexts = cu.getPlatformData().contexts;
for (auto ctx : contexts) {
ctx->setTime(time);
ctx->setStepCount(stepCount);
ctx->setStepsSinceReorder(stepsSinceReorder);
}
char* buffer = (char*) cu.getPinnedBuffer();
stream.read(buffer, cu.getPosq().getSize()*cu.getPosq().getElementSize());
cu.getPosq().upload(buffer);
if (cu.getUseMixedPrecision()) {
stream.read(buffer, cu.getPosqCorrection().getSize()*cu.getPosqCorrection().getElementSize());
cu.getPosqCorrection().upload(buffer);
}
stream.read(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
cu.getVelm().upload(buffer);
stream.read((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
cu.getAtomIndexArray().upload(cu.getAtomIndex());
stream.read((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
Vec3 boxVectors[3];
stream.read((char*) &boxVectors, 3*sizeof(Vec3));
for (auto ctx : contexts)
ctx->setPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
cu.getIntegrationUtilities().loadCheckpoint(stream);
SimTKOpenMMUtilities::loadCheckpoint(stream);
for (auto listener : cu.getReorderListeners())
listener->execute();
}
class HipCalcNonbondedForceKernel::ForceInfo : public HipForceInfo {
public:
ForceInfo(const NonbondedForce& force) : force(force) {
}
bool areParticlesIdentical(int particle1, int particle2) {
double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2;
force.getParticleParameters(particle1, charge1, sigma1, epsilon1);
force.getParticleParameters(particle2, charge2, sigma2, epsilon2);
return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2);
}
int getNumParticleGroups() {
return force.getNumExceptions();
}
void getParticlesInGroup(int index, vector<int>& particles) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon);
particles.resize(2);
particles[0] = particle1;
particles[1] = particle2;
}
bool areGroupsIdentical(int group1, int group2) {
int particle1, particle2;
double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2;
force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1);
force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2);
return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2);
}
private:
const NonbondedForce& force;
};
class HipCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
public:
PmeIO(HipContext& cu, hipFunction_t addForcesKernel) : cu(cu), addForcesKernel(addForcesKernel) {
forceTemp.initialize<float4>(cu, cu.getNumAtoms(), "PmeForce");
}
float* getPosq() {
cu.setAsCurrent();
cu.getPosq().download(posq);
return (float*) &posq[0];
}
void setForce(float* force) {
forceTemp.upload(force);
void* args[] = {&forceTemp.getDevicePointer(), &cu.getForce().getDevicePointer()};
cu.executeKernel(addForcesKernel, args, cu.getNumAtoms());
}
private:
HipContext& cu;
vector<float4> posq;
HipArray forceTemp;
hipFunction_t addForcesKernel;
};
class HipCalcNonbondedForceKernel::PmePreComputation : public HipContext::ForcePreComputation {
public:
PmePreComputation(HipContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
Vec3 boxVectors[3] = {Vec3(cu.getPeriodicBoxSize().x, 0, 0), Vec3(0, cu.getPeriodicBoxSize().y, 0), Vec3(0, 0, cu.getPeriodicBoxSize().z)};
pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxVectors, includeEnergy);
}
private:
HipContext& cu;
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
class HipCalcNonbondedForceKernel::PmePostComputation : public HipContext::ForcePostComputation {
public:
PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
private:
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
class HipCalcNonbondedForceKernel::SyncStreamPreComputation : public HipContext::ForcePreComputation {
public:
SyncStreamPreComputation(HipContext& cu, hipStream_t stream, hipEvent_t event, int forceGroup) : cu(cu), stream(stream), event(event), forceGroup(forceGroup) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
hipEventRecord(event, cu.getCurrentStream());
hipStreamWaitEvent(stream, event, 0);
}
}
private:
HipContext& cu;
hipStream_t stream;
hipEvent_t event;
int forceGroup;
};
class HipCalcNonbondedForceKernel::SyncStreamPostComputation : public HipContext::ForcePostComputation {
public:
SyncStreamPostComputation(HipContext& cu, hipEvent_t event, hipFunction_t addEnergyKernel, HipArray& pmeEnergyBuffer, int forceGroup) : cu(cu), event(event),
addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
hipStreamWaitEvent(cu.getCurrentStream(), event, 0);
if (includeEnergy) {
int bufferSize = pmeEnergyBuffer.getSize();
void* args[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
cu.executeKernel(addEnergyKernel, args, bufferSize);
}
}
return 0.0;
}
private:
HipContext& cu;
hipEvent_t event;
hipFunction_t addEnergyKernel;
HipArray& pmeEnergyBuffer;
int forceGroup;
};
HipCalcNonbondedForceKernel::~HipCalcNonbondedForceKernel() {
cu.setAsCurrent();
if (sort != NULL)
delete sort;
if (fft != NULL)
delete fft;
if (dispersionFft != NULL)
delete dispersionFft;
if (pmeio != NULL)
delete pmeio;
if (hasInitializedFFT) {
if (useHipFFT) {
hipfftDestroy(fftForward);
hipfftDestroy(fftBackward);
if (doLJPME) {
hipfftDestroy(dispersionFftForward);
hipfftDestroy(dispersionFftBackward);
}
}
if (usePmeStream) {
hipStreamDestroy(pmeStream);
hipEventDestroy(pmeSyncEvent);
hipEventDestroy(paramsSyncEvent);
}
}
}
void HipCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
cu.setAsCurrent();
int forceIndex;
for (forceIndex = 0; forceIndex < system.getNumForces() && &system.getForce(forceIndex) != &force; ++forceIndex)
;
string prefix = "nonbonded"+cu.intToString(forceIndex)+"_";
// Identify which exceptions are 1-4 interactions.
set<int> exceptionsWithOffsets;
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
exceptionsWithOffsets.insert(exception);
}
vector<pair<int, int> > exclusions;
vector<int> exceptions;
map<int, int> exceptionIndex;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
exclusions.push_back(pair<int, int>(particle1, particle2));
if (chargeProd != 0.0 || epsilon != 0.0 || exceptionsWithOffsets.find(i) != exceptionsWithOffsets.end()) {
exceptionIndex[i] = exceptions.size();
exceptions.push_back(i);
}
}
// Initialize nonbonded interactions.
int numParticles = force.getNumParticles();
vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
vector<vector<int> > exclusionList(numParticles);
hasCoulomb = false;
hasLJ = false;
for (int i = 0; i < numParticles; i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
exclusionList[i].push_back(i);
if (charge != 0.0)
hasCoulomb = true;
if (epsilon != 0.0)
hasLJ = true;
}
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string param;
int particle;
double charge, sigma, epsilon;
force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
if (charge != 0.0)
hasCoulomb = true;
if (epsilon != 0.0)
hasLJ = true;
}
for (auto exclusion : exclusions) {
exclusionList[exclusion.first].push_back(exclusion.second);
exclusionList[exclusion.second].push_back(exclusion.first);
}
nonbondedMethod = CalcNonbondedForceKernel::NonbondedMethod(force.getNonbondedMethod());
bool useCutoff = (nonbondedMethod != NoCutoff);
bool usePeriodic = (nonbondedMethod != NoCutoff && nonbondedMethod != CutoffNonPeriodic);
doLJPME = (nonbondedMethod == LJPME && hasLJ);
usePosqCharges = hasCoulomb ? cu.requestPosqCharges() : false;
map<string, string> defines;
defines["HAS_COULOMB"] = (hasCoulomb ? "1" : "0");
defines["HAS_LENNARD_JONES"] = (hasLJ ? "1" : "0");
defines["USE_LJ_SWITCH"] = (useCutoff && force.getUseSwitchingFunction() ? "1" : "0");
if (useCutoff) {
// Compute the reaction field constants.
double reactionFieldK = pow(force.getCutoffDistance(), -3.0)*(force.getReactionFieldDielectric()-1.0)/(2.0*force.getReactionFieldDielectric()+1.0);
double reactionFieldC = (1.0 / force.getCutoffDistance())*(3.0*force.getReactionFieldDielectric())/(2.0*force.getReactionFieldDielectric()+1.0);
defines["REACTION_FIELD_K"] = cu.doubleToString(reactionFieldK);
defines["REACTION_FIELD_C"] = cu.doubleToString(reactionFieldC);
// Compute the switching coefficients.
if (force.getUseSwitchingFunction()) {
defines["LJ_SWITCH_CUTOFF"] = cu.doubleToString(force.getSwitchingDistance());
defines["LJ_SWITCH_C3"] = cu.doubleToString(10/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 3.0));
defines["LJ_SWITCH_C4"] = cu.doubleToString(15/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 4.0));
defines["LJ_SWITCH_C5"] = cu.doubleToString(6/pow(force.getSwitchingDistance()-force.getCutoffDistance(), 5.0));
}
}
if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && !doLJPME)
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(system, force);
else
dispersionCoefficient = 0.0;
alpha = 0;
ewaldSelfEnergy = 0.0;
map<string, string> paramsDefines;
paramsDefines["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
hasOffsets = (force.getNumParticleParameterOffsets() > 0 || force.getNumExceptionParameterOffsets() > 0);
if (hasOffsets)
paramsDefines["HAS_OFFSETS"] = "1";
if (usePosqCharges)
paramsDefines["USE_POSQ_CHARGES"] = "1";
if (nonbondedMethod == Ewald) {
// Compute the Ewald parameters.
int kmaxx, kmaxy, kmaxz;
NonbondedForceImpl::calcEwaldParameters(system, force, alpha, kmaxx, kmaxy, kmaxz);
defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1";
if (cu.getContextIndex() == 0) {
paramsDefines["INCLUDE_EWALD"] = "1";
paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
for (int i = 0; i < numParticles; i++)
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
// Create the reciprocal space kernels.
map<string, string> replacements;
replacements["NUM_ATOMS"] = cu.intToString(numParticles);
replacements["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
replacements["KMAX_X"] = cu.intToString(kmaxx);
replacements["KMAX_Y"] = cu.intToString(kmaxy);
replacements["KMAX_Z"] = cu.intToString(kmaxz);
replacements["EXP_COEFFICIENT"] = cu.doubleToString(-1.0/(4.0*alpha*alpha));
replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
replacements["M_PI"] = cu.doubleToString(M_PI);
hipModule_t module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::ewald, replacements);
ewaldSumsKernel = cu.getKernel(module, "calculateEwaldCosSinSums");
ewaldForcesKernel = cu.getKernel(module, "calculateEwaldForces");
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
cosSinSums.initialize(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
}
}
else if (((nonbondedMethod == PME || nonbondedMethod == LJPME) && hasCoulomb) || doLJPME) {
// Compute the PME parameters.
NonbondedForceImpl::calcPMEParameters(system, force, alpha, gridSizeX, gridSizeY, gridSizeZ, false);
gridSizeX = HipFFT3D::findLegalDimension(gridSizeX);
gridSizeY = HipFFT3D::findLegalDimension(gridSizeY);
gridSizeZ = HipFFT3D::findLegalDimension(gridSizeZ);
if (doLJPME) {
NonbondedForceImpl::calcPMEParameters(system, force, dispersionAlpha, dispersionGridSizeX,
dispersionGridSizeY, dispersionGridSizeZ, true);
dispersionGridSizeX = HipFFT3D::findLegalDimension(dispersionGridSizeX);
dispersionGridSizeY = HipFFT3D::findLegalDimension(dispersionGridSizeY);
dispersionGridSizeZ = HipFFT3D::findLegalDimension(dispersionGridSizeZ);
}
defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1";
defines["DO_LJPME"] = doLJPME ? "1" : "0";
if (doLJPME)
defines["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
if (cu.getContextIndex() == 0) {
paramsDefines["INCLUDE_EWALD"] = "1";
paramsDefines["EWALD_SELF_ENERGY_SCALE"] = cu.doubleToString(ONE_4PI_EPS0*alpha/sqrt(M_PI));
for (int i = 0; i < numParticles; i++)
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
if (doLJPME) {
paramsDefines["INCLUDE_LJPME"] = "1";
paramsDefines["LJPME_SELF_ENERGY_SCALE"] = cu.doubleToString(pow(dispersionAlpha, 6)/3.0);
for (int i = 0; i < numParticles; i++)
ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
}
char deviceName[100];
hipDeviceGetName(deviceName, 100, cu.getDevice());
usePmeStream = (!cu.getPlatformData().disablePmeStream && !cu.getPlatformData().useCpuPme && string(deviceName) != "GeForce GTX 980"); // Using a separate stream is slower on GTX 980
map<string, string> pmeDefines;
pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(alpha*alpha));
pmeDefines["GRID_SIZE_X"] = cu.intToString(gridSizeX);
pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
pmeDefines["M_PI"] = cu.doubleToString(M_PI);
if (cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
if (usePmeStream)
pmeDefines["USE_PME_STREAM"] = "1";
map<string, string> replacements;
replacements["CHARGE"] = (usePosqCharges ? "pos.w" : "charges[atom]");
hipModule_t module = cu.createModule(HipKernelSources::vectorOps+cu.replaceStrings(CommonKernelSources::pme, replacements), pmeDefines);
if (cu.getPlatformData().useCpuPme && !doLJPME && usePosqCharges) {
// Create the CPU PME kernel.
try {
cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha, cu.getPlatformData().deterministicForces);
hipFunction_t addForcesKernel = cu.getKernel(module, "addForces");
pmeio = new PmeIO(cu, addForcesKernel);
cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
}
catch (OpenMMException& ex) {
// The CPU PME plugin isn't available.
}
}
if (pmeio == NULL) {
pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
hipFuncSetCacheConfig(pmeSpreadChargeKernel, hipFuncCachePreferShared);
hipFuncSetCacheConfig(pmeInterpolateForceKernel, hipFuncCachePreferL1);
if (doLJPME) {
pmeDefines["EWALD_ALPHA"] = cu.doubleToString(dispersionAlpha);
pmeDefines["GRID_SIZE_X"] = cu.intToString(dispersionGridSizeX);
pmeDefines["GRID_SIZE_Y"] = cu.intToString(dispersionGridSizeY);
pmeDefines["GRID_SIZE_Z"] = cu.intToString(dispersionGridSizeZ);
pmeDefines["RECIP_EXP_FACTOR"] = cu.doubleToString(M_PI*M_PI/(dispersionAlpha*dispersionAlpha));
pmeDefines["USE_LJPME"] = "1";
pmeDefines["CHARGE_FROM_SIGEPS"] = "1";
if (cu.getUseDoublePrecision() || cu.getPlatformData().deterministicForces)
pmeDefines["USE_FIXED_POINT_CHARGE_SPREADING"] = "1";
double invRCut6 = pow(force.getCutoffDistance(), -6);
double dalphaR = dispersionAlpha * force.getCutoffDistance();
double dar2 = dalphaR*dalphaR;
double dar4 = dar2*dar2;
double multShift6 = -invRCut6*(1.0 - exp(-dar2) * (1.0 + dar2 + 0.5*dar4));
defines["INVCUT6"] = cu.doubleToString(invRCut6);
defines["MULTSHIFT6"] = cu.doubleToString(multShift6);
module = cu.createModule(HipKernelSources::vectorOps+CommonKernelSources::pme, pmeDefines);
pmeDispersionFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
pmeDispersionGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeDispersionSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeDispersionConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeEvalDispersionEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeInterpolateDispersionForceKernel = cu.getKernel(module, "gridInterpolateForce");
hipFuncSetCacheConfig(pmeDispersionSpreadChargeKernel, hipFuncCachePreferL1);
}
// Create required data structures.
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
int roundedZSize = PmeOrder*(int) ceil(gridSizeZ/(double) PmeOrder);
int gridElements = gridSizeX*gridSizeY*roundedZSize;
if (doLJPME) {
roundedZSize = PmeOrder*(int) ceil(dispersionGridSizeZ/(double) PmeOrder);
gridElements = max(gridElements, dispersionGridSizeX*dispersionGridSizeY*roundedZSize);
}
pmeGrid1.initialize(cu, gridElements, 2*elementSize, "pmeGrid1");
pmeGrid2.initialize(cu, gridElements, 2*elementSize, "pmeGrid2");
cu.addAutoclearBuffer(pmeGrid2);
pmeBsplineModuliX.initialize(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY.initialize(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ.initialize(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
if (doLJPME) {
pmeDispersionBsplineModuliX.initialize(cu, dispersionGridSizeX, elementSize, "pmeDispersionBsplineModuliX");
pmeDispersionBsplineModuliY.initialize(cu, dispersionGridSizeY, elementSize, "pmeDispersionBsplineModuliY");
pmeDispersionBsplineModuliZ.initialize(cu, dispersionGridSizeZ, elementSize, "pmeDispersionBsplineModuliZ");
}
pmeAtomGridIndex.initialize<int2>(cu, numParticles, "pmeAtomGridIndex");
int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
pmeEnergyBuffer.initialize(cu, cu.getNumThreadBlocks()*HipContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
cu.clearBuffer(pmeEnergyBuffer);
sort = new HipSort(cu, new SortTrait(), cu.getNumAtoms());
int cufftVersion;
hipfftGetVersion(&cufftVersion);
useHipFFT = (cufftVersion >= 7050); // There was a critical bug in version 7.0
if (useHipFFT) {
hipfftResult result = hipfftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? HIPFFT_D2Z : HIPFFT_R2C);
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
result = hipfftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? HIPFFT_Z2D : HIPFFT_C2R);
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
if (doLJPME) {
result = hipfftPlan3d(&dispersionFftForward, dispersionGridSizeX, dispersionGridSizeY,
dispersionGridSizeZ, cu.getUseDoublePrecision() ? HIPFFT_D2Z : HIPFFT_R2C);
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error initializing disperison FFT: "+cu.intToString(result));
result = hipfftPlan3d(&dispersionFftBackward, dispersionGridSizeX, dispersionGridSizeY,
dispersionGridSizeZ, cu.getUseDoublePrecision() ? HIPFFT_Z2D : HIPFFT_C2R);
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error initializing disperison FFT: "+cu.intToString(result));
}
}
else {
fft = new HipFFT3D(cu, gridSizeX, gridSizeY, gridSizeZ, true);
if (doLJPME)
dispersionFft = new HipFFT3D(cu, dispersionGridSizeX, dispersionGridSizeY, dispersionGridSizeZ, true);
}
// Prepare for doing PME on its own stream.
if (usePmeStream) {
hipStreamCreateWithFlags(&pmeStream, hipStreamNonBlocking);
if (useHipFFT) {
hipfftSetStream(fftForward, pmeStream);
hipfftSetStream(fftBackward, pmeStream);
if (doLJPME) {
hipfftSetStream(dispersionFftForward, pmeStream);
hipfftSetStream(dispersionFftBackward, pmeStream);
}
}
CHECK_RESULT(hipEventCreateWithFlags(&pmeSyncEvent, hipEventDisableTiming), "Error creating event for NonbondedForce");
CHECK_RESULT(hipEventCreateWithFlags(&paramsSyncEvent, hipEventDisableTiming), "Error creating event for NonbondedForce");
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), pmeEnergyBuffer, recipForceGroup));
}
hasInitializedFFT = true;
// Initialize the b-spline moduli.
for (int grid = 0; grid < 2; grid++) {
int xsize, ysize, zsize;
HipArray *xmoduli, *ymoduli, *zmoduli;
if (grid == 0) {
xsize = gridSizeX;
ysize = gridSizeY;
zsize = gridSizeZ;
xmoduli = &pmeBsplineModuliX;
ymoduli = &pmeBsplineModuliY;
zmoduli = &pmeBsplineModuliZ;
}
else {
if (!doLJPME)
continue;
xsize = dispersionGridSizeX;
ysize = dispersionGridSizeY;
zsize = dispersionGridSizeZ;
xmoduli = &pmeDispersionBsplineModuliX;
ymoduli = &pmeDispersionBsplineModuliY;
zmoduli = &pmeDispersionBsplineModuliZ;
}
int maxSize = max(max(xsize, ysize), zsize);
vector<double> data(PmeOrder);
vector<double> ddata(PmeOrder);
vector<double> bsplines_data(maxSize);
data[PmeOrder-1] = 0.0;
data[1] = 0.0;
data[0] = 1.0;
for (int i = 3; i < PmeOrder; i++) {
double div = 1.0/(i-1.0);
data[i-1] = 0.0;
for (int j = 1; j < (i-1); j++)
data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
data[0] = div*data[0];
}
// Differentiate.
ddata[0] = -data[0];
for (int i = 1; i < PmeOrder; i++)
ddata[i] = data[i-1]-data[i];
double div = 1.0/(PmeOrder-1);
data[PmeOrder-1] = 0.0;
for (int i = 1; i < (PmeOrder-1); i++)
data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
data[0] = div*data[0];
for (int i = 0; i < maxSize; i++)
bsplines_data[i] = 0.0;
for (int i = 1; i <= PmeOrder; i++)
bsplines_data[i] = data[i-1];
// Evaluate the actual bspline moduli for X/Y/Z.
for (int dim = 0; dim < 3; dim++) {
int ndata = (dim == 0 ? xsize : dim == 1 ? ysize : zsize);
vector<double> moduli(ndata);
for (int i = 0; i < ndata; i++) {
double sc = 0.0;
double ss = 0.0;
for (int j = 0; j < ndata; j++) {
double arg = (2.0*M_PI*i*j)/ndata;
sc += bsplines_data[j]*cos(arg);
ss += bsplines_data[j]*sin(arg);
}
moduli[i] = sc*sc+ss*ss;
}
for (int i = 0; i < ndata; i++)
if (moduli[i] < 1.0e-7)
moduli[i] = (moduli[(i-1+ndata)%ndata]+moduli[(i+1)%ndata])*0.5;
if (dim == 0)
xmoduli->upload(moduli, true);
else if (dim == 1)
ymoduli->upload(moduli, true);
else
zmoduli->upload(moduli, true);
}
}
}
}
}
// Add code to subtract off the reciprocal part of excluded interactions.
if ((nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) && pmeio == NULL) {
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumExceptions()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumExceptions()/numContexts;
int numExclusions = endIndex-startIndex;
if (numExclusions > 0) {
paramsDefines["HAS_EXCLUSIONS"] = "1";
vector<vector<int> > atoms(numExclusions, vector<int>(2));
exclusionAtoms.initialize<int2>(cu, numExclusions, "exclusionAtoms");
exclusionParams.initialize<float4>(cu, numExclusions, "exclusionParams");
vector<int2> exclusionAtomsVec(numExclusions);
for (int i = 0; i < numExclusions; i++) {
int j = i+startIndex;
exclusionAtomsVec[i] = make_int2(exclusions[j].first, exclusions[j].second);
atoms[i][0] = exclusions[j].first;
atoms[i][1] = exclusions[j].second;
}
exclusionAtoms.upload(exclusionAtomsVec);
map<string, string> replacements;
replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exclusionParams.getDevicePointer(), "float4");
replacements["EWALD_ALPHA"] = cu.doubleToString(alpha);
replacements["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
replacements["DO_LJPME"] = doLJPME ? "1" : "0";
replacements["USE_PERIODIC"] = force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0";
if (doLJPME)
replacements["EWALD_DISPERSION_ALPHA"] = cu.doubleToString(dispersionAlpha);
cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::pmeExclusions, replacements), force.getForceGroup());
}
}
// Add the interaction to the default nonbonded kernel.
string source = cu.replaceStrings(CommonKernelSources::coulombLennardJones, defines);
charges.initialize(cu, cu.getPaddedNumAtoms(), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "charges");
baseParticleParams.initialize<float4>(cu, cu.getPaddedNumAtoms(), "baseParticleParams");
baseParticleParams.upload(baseParticleParamVec);
map<string, string> replacements;
replacements["ONE_4PI_EPS0"] = cu.doubleToString(ONE_4PI_EPS0);
if (usePosqCharges) {
replacements["CHARGE1"] = "posq1.w";
replacements["CHARGE2"] = "posq2.w";
}
else {
replacements["CHARGE1"] = prefix+"charge1";
replacements["CHARGE2"] = prefix+"charge2";
}
if (hasCoulomb)
cu.getNonbondedUtilities().addParameter(HipNonbondedUtilities::ParameterInfo(prefix+"charge", "real", 1, charges.getElementSize(), charges.getDevicePointer()));
sigmaEpsilon.initialize<float2>(cu, cu.getPaddedNumAtoms(), "sigmaEpsilon");
if (hasLJ) {
replacements["SIGMA_EPSILON1"] = prefix+"sigmaEpsilon1";
replacements["SIGMA_EPSILON2"] = prefix+"sigmaEpsilon2";
cu.getNonbondedUtilities().addParameter(HipNonbondedUtilities::ParameterInfo(prefix+"sigmaEpsilon", "float", 2, sizeof(float2), sigmaEpsilon.getDevicePointer()));
}
source = cu.replaceStrings(source, replacements);
cu.getNonbondedUtilities().addInteraction(useCutoff, usePeriodic, true, force.getCutoffDistance(), exclusionList, source, force.getForceGroup(), true);
// Initialize the exceptions.
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
int numExceptions = endIndex-startIndex;
if (numExceptions > 0) {
paramsDefines["HAS_EXCEPTIONS"] = "1";
exceptionAtoms.resize(numExceptions);
vector<vector<int> > atoms(numExceptions, vector<int>(2));
exceptionParams.initialize<float4>(cu, numExceptions, "exceptionParams");
baseExceptionParams.initialize<float4>(cu, numExceptions, "baseExceptionParams");
vector<float4> baseExceptionParamsVec(numExceptions);
for (int i = 0; i < numExceptions; i++) {
double chargeProd, sigma, epsilon;
force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
exceptionAtoms[i] = make_pair(atoms[i][0], atoms[i][1]);
}
baseExceptionParams.upload(baseExceptionParamsVec);
map<string, string> replacements;
replacements["APPLY_PERIODIC"] = (usePeriodic && force.getExceptionsUsePeriodicBoundaryConditions() ? "1" : "0");
replacements["PARAMS"] = cu.getBondedUtilities().addArgument(exceptionParams.getDevicePointer(), "float4");
cu.getBondedUtilities().addInteraction(atoms, cu.replaceStrings(CommonKernelSources::nonbondedExceptions, replacements), force.getForceGroup());
}
// Initialize parameter offsets.
vector<vector<float4> > particleOffsetVec(force.getNumParticles());
vector<vector<float4> > exceptionOffsetVec(force.getNumExceptions());
for (int i = 0; i < force.getNumParticleParameterOffsets(); i++) {
string param;
int particle;
double charge, sigma, epsilon;
force.getParticleParameterOffset(i, param, particle, charge, sigma, epsilon);
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
int paramIndex;
if (paramPos == paramNames.end()) {
paramIndex = paramNames.size();
paramNames.push_back(param);
}
else
paramIndex = paramPos-paramNames.begin();
particleOffsetVec[particle].push_back(make_float4(charge, sigma, epsilon, paramIndex));
}
for (int i = 0; i < force.getNumExceptionParameterOffsets(); i++) {
string param;
int exception;
double charge, sigma, epsilon;
force.getExceptionParameterOffset(i, param, exception, charge, sigma, epsilon);
auto paramPos = find(paramNames.begin(), paramNames.end(), param);
int paramIndex;
if (paramPos == paramNames.end()) {
paramIndex = paramNames.size();
paramNames.push_back(param);
}
else
paramIndex = paramPos-paramNames.begin();
exceptionOffsetVec[exceptionIndex[exception]].push_back(make_float4(charge, sigma, epsilon, paramIndex));
}
paramValues.resize(paramNames.size(), 0.0);
particleParamOffsets.initialize<float4>(cu, max(force.getNumParticleParameterOffsets(), 1), "particleParamOffsets");
exceptionParamOffsets.initialize<float4>(cu, max(force.getNumExceptionParameterOffsets(), 1), "exceptionParamOffsets");
particleOffsetIndices.initialize<int>(cu, cu.getPaddedNumAtoms()+1, "particleOffsetIndices");
exceptionOffsetIndices.initialize<int>(cu, force.getNumExceptions()+1, "exceptionOffsetIndices");
vector<int> particleOffsetIndicesVec, exceptionOffsetIndicesVec;
vector<float4> p, e;
for (int i = 0; i < particleOffsetVec.size(); i++) {
particleOffsetIndicesVec.push_back(p.size());
for (int j = 0; j < particleOffsetVec[i].size(); j++)
p.push_back(particleOffsetVec[i][j]);
}
while (particleOffsetIndicesVec.size() < particleOffsetIndices.getSize())
particleOffsetIndicesVec.push_back(p.size());
for (int i = 0; i < exceptionOffsetVec.size(); i++) {
exceptionOffsetIndicesVec.push_back(e.size());
for (int j = 0; j < exceptionOffsetVec[i].size(); j++)
e.push_back(exceptionOffsetVec[i][j]);
}
exceptionOffsetIndicesVec.push_back(e.size());
if (force.getNumParticleParameterOffsets() > 0) {
particleParamOffsets.upload(p);
particleOffsetIndices.upload(particleOffsetIndicesVec);
}
if (force.getNumExceptionParameterOffsets() > 0) {
exceptionParamOffsets.upload(e);
exceptionOffsetIndices.upload(exceptionOffsetIndicesVec);
}
globalParams.initialize(cu, max((int) paramValues.size(), 1), cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float), "globalParams");
if (paramValues.size() > 0)
globalParams.upload(paramValues, true);
recomputeParams = true;
// Initialize the kernel for updating parameters.
hipModule_t module = cu.createModule(CommonKernelSources::nonbondedParameters, paramsDefines);
computeParamsKernel = cu.getKernel(module, "computeParameters");
computeExclusionParamsKernel = cu.getKernel(module, "computeExclusionParameters");
info = new ForceInfo(force);
cu.addForce(info);
}
double HipCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
// Update particle and exception parameters.
bool paramChanged = false;
for (int i = 0; i < paramNames.size(); i++) {
double value = context.getParameter(paramNames[i]);
if (value != paramValues[i]) {
paramValues[i] = value;;
paramChanged = true;
}
}
if (paramChanged) {
recomputeParams = true;
globalParams.upload(paramValues, true);
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (recomputeParams || hasOffsets) {
bool computeSelfEnergy = (includeEnergy && includeReciprocal);
int numAtoms = cu.getPaddedNumAtoms();
vector<void*> paramsArgs = {&cu.getEnergyBuffer().getDevicePointer(), &computeSelfEnergy, &globalParams.getDevicePointer(), &numAtoms,
&baseParticleParams.getDevicePointer(), &cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
&particleParamOffsets.getDevicePointer(), &particleOffsetIndices.getDevicePointer()};
int numExceptions;
if (exceptionParams.isInitialized()) {
numExceptions = exceptionParams.getSize();
paramsArgs.push_back(&numExceptions);
paramsArgs.push_back(&baseExceptionParams.getDevicePointer());
paramsArgs.push_back(&exceptionParams.getDevicePointer());
paramsArgs.push_back(&exceptionParamOffsets.getDevicePointer());
paramsArgs.push_back(&exceptionOffsetIndices.getDevicePointer());
}
cu.executeKernel(computeParamsKernel, &paramsArgs[0], cu.getPaddedNumAtoms());
if (exclusionParams.isInitialized()) {
int numExclusions = exclusionParams.getSize();
vector<void*> exclusionParamsArgs = {&cu.getPosq().getDevicePointer(), &charges.getDevicePointer(), &sigmaEpsilon.getDevicePointer(),
&numExclusions, &exclusionAtoms.getDevicePointer(), &exclusionParams.getDevicePointer()};
cu.executeKernel(computeExclusionParamsKernel, &exclusionParamsArgs[0], numExclusions);
}
if (usePmeStream) {
hipEventRecord(paramsSyncEvent, cu.getCurrentStream());
hipStreamWaitEvent(pmeStream, paramsSyncEvent, 0);
}
if (hasOffsets)
energy = 0.0; // The Ewald self energy was computed in the kernel.
recomputeParams = false;
}
// Do reciprocal space calculations.
if (cosSinSums.isInitialized() && includeReciprocal) {
void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums.getSize());
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums.getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
}
if (pmeGrid1.isInitialized() && includeReciprocal) {
if (usePmeStream)
cu.setCurrentStream(pmeStream);
// Invert the periodic box vectors.
Vec3 boxVectors[3];
cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
double scale = 1.0/determinant;
double4 recipBoxVectors[3];
recipBoxVectors[0] = make_double4(boxVectors[1][1]*boxVectors[2][2]*scale, 0, 0, 0);
recipBoxVectors[1] = make_double4(-boxVectors[1][0]*boxVectors[2][2]*scale, boxVectors[0][0]*boxVectors[2][2]*scale, 0, 0);
recipBoxVectors[2] = make_double4((boxVectors[1][0]*boxVectors[2][1]-boxVectors[1][1]*boxVectors[2][0])*scale, -boxVectors[0][0]*boxVectors[2][1]*scale, boxVectors[0][0]*boxVectors[1][1]*scale, 0);
float4 recipBoxVectorsFloat[3];
void* recipBoxVectorPointer[3];
if (cu.getUseDoublePrecision()) {
recipBoxVectorPointer[0] = &recipBoxVectors[0];
recipBoxVectorPointer[1] = &recipBoxVectors[1];
recipBoxVectorPointer[2] = &recipBoxVectors[2];
}
else {
recipBoxVectorsFloat[0] = make_float4((float) recipBoxVectors[0].x, 0, 0, 0);
recipBoxVectorsFloat[1] = make_float4((float) recipBoxVectors[1].x, (float) recipBoxVectors[1].y, 0, 0);
recipBoxVectorsFloat[2] = make_float4((float) recipBoxVectors[2].x, (float) recipBoxVectors[2].y, (float) recipBoxVectors[2].z, 0);
recipBoxVectorPointer[0] = &recipBoxVectorsFloat[0];
recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
}
// Execute the reciprocal space kernels.
if (hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useHipFFT) {
if (cu.getUseDoublePrecision()) {
hipfftResult result = hipfftExecD2Z(fftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error executing FFT: "+cu.intToString(result));
} else {
hipfftResult result = hipfftExecR2C(fftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error executing FFT: "+cu.intToString(result));
}
}
else {
fft->execFFT(pmeGrid1, pmeGrid2, true);
}
if (includeEnergy) {
void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX.getDevicePointer(), &pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, gridSizeX*gridSizeY*gridSizeZ);
}
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeBsplineModuliX.getDevicePointer(),
&pmeBsplineModuliY.getDevicePointer(), &pmeBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, gridSizeX*gridSizeY*gridSizeZ, 256);
if (useHipFFT) {
if (cu.getUseDoublePrecision()) {
hipfftResult result = hipfftExecZ2D(fftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error executing FFT: "+cu.intToString(result));
} else {
hipfftResult result = hipfftExecC2R(fftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*) pmeGrid1.getDevicePointer());
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error executing FFT: "+cu.intToString(result));
}
}
else {
fft->execFFT(pmeGrid2, pmeGrid1, false);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&charges.getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
}
if (doLJPME && hasLJ) {
if (!hasCoulomb) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(pmeAtomGridIndex);
cu.clearBuffer(pmeEnergyBuffer);
}
cu.clearBuffer(pmeGrid2);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &pmeGrid2.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeDispersionSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
void* finishSpreadArgs[] = {&pmeGrid2.getDevicePointer(), &pmeGrid1.getDevicePointer()};
cu.executeKernel(pmeDispersionFinishSpreadChargeKernel, finishSpreadArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useHipFFT) {
if (cu.getUseDoublePrecision()) {
hipfftResult result = hipfftExecD2Z(dispersionFftForward, (double*) pmeGrid1.getDevicePointer(), (double2*) pmeGrid2.getDevicePointer());
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error executing FFT: "+cu.intToString(result));
} else {
hipfftResult result = hipfftExecR2C(dispersionFftForward, (float*) pmeGrid1.getDevicePointer(), (float2*) pmeGrid2.getDevicePointer());
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error executing FFT: "+cu.intToString(result));
}
}
else {
dispersionFft->execFFT(pmeGrid1, pmeGrid2, true);
}
if (includeEnergy) {
void* computeEnergyArgs[] = {&pmeGrid2.getDevicePointer(), usePmeStream ? &pmeEnergyBuffer.getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeDispersionBsplineModuliX.getDevicePointer(), &pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalDispersionEnergyKernel, computeEnergyArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ);
}
void* convolutionArgs[] = {&pmeGrid2.getDevicePointer(), &pmeDispersionBsplineModuliX.getDevicePointer(),
&pmeDispersionBsplineModuliY.getDevicePointer(), &pmeDispersionBsplineModuliZ.getDevicePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeDispersionConvolutionKernel, convolutionArgs, dispersionGridSizeX*dispersionGridSizeY*dispersionGridSizeZ, 256);
if (useHipFFT) {
if (cu.getUseDoublePrecision()) {
hipfftResult result = hipfftExecZ2D(dispersionFftBackward, (double2*) pmeGrid2.getDevicePointer(), (double*) pmeGrid1.getDevicePointer());
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error executing FFT: "+cu.intToString(result));
} else {
hipfftResult result = hipfftExecC2R(dispersionFftBackward, (float2*) pmeGrid2.getDevicePointer(), (float*) pmeGrid1.getDevicePointer());
if (result != HIPFFT_SUCCESS)
throw OpenMMException("Error executing FFT: "+cu.intToString(result));
}
}
else {
dispersionFft->execFFT(pmeGrid2, pmeGrid1, false);
}
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid1.getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex.getDevicePointer(),
&sigmaEpsilon.getDevicePointer()};
cu.executeKernel(pmeInterpolateDispersionForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
}
if (usePmeStream) {
hipEventRecord(pmeSyncEvent, pmeStream);
cu.restoreDefaultStream();
}
}
if (dispersionCoefficient != 0.0 && includeDirect) {
double4 boxSize = cu.getPeriodicBoxSize();
energy += dispersionCoefficient/(boxSize.x*boxSize.y*boxSize.z);
}
return energy;
}
void HipCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
// Make sure the new parameters are acceptable.
cu.setAsCurrent();
if (force.getNumParticles() != cu.getNumAtoms())
throw OpenMMException("updateParametersInContext: The number of particles has changed");
if (!hasCoulomb || !hasLJ) {
for (int i = 0; i < force.getNumParticles(); i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
if (!hasCoulomb && charge != 0.0)
throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Coulomb interactions, because all charges were originally 0");
if (!hasLJ && epsilon != 0.0)
throw OpenMMException("updateParametersInContext: The nonbonded force kernel does not include Lennard-Jones interactions, because all epsilons were originally 0");
}
}
vector<int> exceptions;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon;
force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon);
if (exceptionAtoms.size() > exceptions.size() && make_pair(particle1, particle2) == exceptionAtoms[exceptions.size()])
exceptions.push_back(i);
else if (chargeProd != 0.0 || epsilon != 0.0)
throw OpenMMException("updateParametersInContext: The set of non-excluded exceptions has changed");
}
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*exceptions.size()/numContexts;
int endIndex = (cu.getContextIndex()+1)*exceptions.size()/numContexts;
int numExceptions = endIndex-startIndex;
// Record the per-particle parameters.
vector<float4> baseParticleParamVec(cu.getPaddedNumAtoms(), make_float4(0, 0, 0, 0));
const vector<int>& order = cu.getAtomIndex();
for (int i = 0; i < force.getNumParticles(); i++) {
double charge, sigma, epsilon;
force.getParticleParameters(i, charge, sigma, epsilon);
baseParticleParamVec[i] = make_float4(charge, sigma, epsilon, 0);
}
baseParticleParams.upload(baseParticleParamVec);
// Record the exceptions.
if (numExceptions > 0) {
vector<vector<int> > atoms(numExceptions, vector<int>(2));
vector<float4> baseExceptionParamsVec(numExceptions);
for (int i = 0; i < numExceptions; i++) {
double chargeProd, sigma, epsilon;
force.getExceptionParameters(exceptions[startIndex+i], atoms[i][0], atoms[i][1], chargeProd, sigma, epsilon);
baseExceptionParamsVec[i] = make_float4(chargeProd, sigma, epsilon, 0);
}
baseExceptionParams.upload(baseExceptionParamsVec);
}
// Compute other values.
ewaldSelfEnergy = 0.0;
if (nonbondedMethod == Ewald || nonbondedMethod == PME || nonbondedMethod == LJPME) {
if (cu.getContextIndex() == 0) {
for (int i = 0; i < force.getNumParticles(); i++) {
ewaldSelfEnergy -= baseParticleParamVec[i].x*baseParticleParamVec[i].x*ONE_4PI_EPS0*alpha/sqrt(M_PI);
if (doLJPME)
ewaldSelfEnergy += baseParticleParamVec[i].z*pow(baseParticleParamVec[i].y*dispersionAlpha, 6)/3.0;
}
}
}
if (force.getUseDispersionCorrection() && cu.getContextIndex() == 0 && (nonbondedMethod == CutoffPeriodic || nonbondedMethod == Ewald || nonbondedMethod == PME))
dispersionCoefficient = NonbondedForceImpl::calcDispersionCorrection(context.getSystem(), force);
cu.invalidateMolecules();
recomputeParams = true;
}
void HipCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (nonbondedMethod != PME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (cu.getPlatformData().useCpuPme)
cpuPme.getAs<CalcPmeReciprocalForceKernel>().getPMEParameters(alpha, nx, ny, nz);
else {
alpha = this->alpha;
nx = gridSizeX;
ny = gridSizeY;
nz = gridSizeZ;
}
}
void HipCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
if (!doLJPME)
throw OpenMMException("getPMEParametersInContext: This Context is not using PME");
if (cu.getPlatformData().useCpuPme)
//cpuPme.getAs<CalcPmeReciprocalForceKernel>().getLJPMEParameters(alpha, nx, ny, nz);
throw OpenMMException("getPMEParametersInContext: CPUPME has not been implemented for LJPME yet.");
else {
alpha = this->dispersionAlpha;
nx = dispersionGridSizeX;
ny = dispersionGridSizeY;
nz = dispersionGridSizeZ;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2018 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/OpenMMException.h"
#include "HipNonbondedUtilities.h"
#include "HipArray.h"
#include "HipContext.h"
#include "HipKernelSources.h"
#include "HipExpressionUtilities.h"
#include "HipSort.h"
#include <algorithm>
#include <map>
#include <set>
#include <utility>
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<errorMessage<<": "<<context.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
class HipNonbondedUtilities::BlockSortTrait : public HipSort::SortTrait {
public:
BlockSortTrait(bool useDouble) : useDouble(useDouble) {
}
int getDataSize() const {return useDouble ? sizeof(double2) : sizeof(float2);}
int getKeySize() const {return useDouble ? sizeof(double) : sizeof(float);}
const char* getDataType() const {return "real2";}
const char* getKeyType() const {return "real";}
const char* getMinKey() const {return "-3.40282e+38f";}
const char* getMaxKey() const {return "3.40282e+38f";}
const char* getMaxValue() const {return "make_real2(3.40282e+38f, 3.40282e+38f)";}
const char* getSortKey() const {return "value.x";}
private:
bool useDouble;
};
HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
blockSorter(NULL), pinnedCountBuffer(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0), canUsePairList(true) {
// Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities";
int multiprocessors;
CHECK_RESULT(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, context.getDevice()));
CHECK_RESULT(hipEventCreateWithFlags(&downloadCountEvent, 0));
CHECK_RESULT(hipHostMalloc((void**) &pinnedCountBuffer, 2*sizeof(int), hipHostMallocPortable));
numForceThreadBlocks = 4*multiprocessors;
forceThreadBlockSize = (256);
setKernelSource(HipKernelSources::nonbonded);
}
HipNonbondedUtilities::~HipNonbondedUtilities() {
if (blockSorter != NULL)
delete blockSorter;
if (pinnedCountBuffer != NULL)
hipHostFree(pinnedCountBuffer);
hipEventDestroy(downloadCountEvent);
}
void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
addInteraction(usesCutoff, usesPeriodic, usesExclusions, cutoffDistance, exclusionList, kernel, forceGroup, false);
}
void HipNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup, bool supportsPairList) {
if (groupCutoff.size() > 0) {
if (usesCutoff != useCutoff)
throw OpenMMException("All Forces must agree on whether to use a cutoff");
if (usesPeriodic != usePeriodic)
throw OpenMMException("All Forces must agree on whether to use periodic boundary conditions");
if (usesCutoff && groupCutoff.find(forceGroup) != groupCutoff.end() && groupCutoff[forceGroup] != cutoffDistance)
throw OpenMMException("All Forces in a single force group must use the same cutoff distance");
}
if (usesExclusions)
requestExclusions(exclusionList);
useCutoff = usesCutoff;
usePeriodic = usesPeriodic;
groupCutoff[forceGroup] = cutoffDistance;
groupFlags |= 1<<forceGroup;
canUsePairList &= supportsPairList;
if (kernel.size() > 0) {
if (groupKernelSource.find(forceGroup) == groupKernelSource.end())
groupKernelSource[forceGroup] = "";
map<string, string> replacements;
replacements["CUTOFF"] = "CUTOFF_"+context.intToString(forceGroup);
replacements["CUTOFF_SQUARED"] = "CUTOFF_"+context.intToString(forceGroup)+"_SQUARED";
groupKernelSource[forceGroup] += context.replaceStrings(kernel, replacements)+"\n";
}
}
void HipNonbondedUtilities::addParameter(ComputeParameterInfo parameter) {
parameters.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer(), parameter.isConstant()));
}
void HipNonbondedUtilities::addParameter(const ParameterInfo& parameter) {
parameters.push_back(parameter);
}
void HipNonbondedUtilities::addArgument(ComputeParameterInfo parameter) {
arguments.push_back(ParameterInfo(parameter.getName(), parameter.getComponentType(), parameter.getNumComponents(),
parameter.getSize(), context.unwrap(parameter.getArray()).getDevicePointer(), parameter.isConstant()));
}
void HipNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
arguments.push_back(parameter);
}
string HipNonbondedUtilities::addEnergyParameterDerivative(const string& param) {
// See if the parameter has already been added.
int index;
for (index = 0; index < energyParameterDerivatives.size(); index++)
if (param == energyParameterDerivatives[index])
break;
if (index == energyParameterDerivatives.size())
energyParameterDerivatives.push_back(param);
context.addEnergyParameterDerivative(param);
return string("energyParamDeriv")+context.intToString(index);
}
void HipNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclusionList) {
if (anyExclusions) {
bool sameExclusions = (exclusionList.size() == atomExclusions.size());
for (int i = 0; i < (int) exclusionList.size() && sameExclusions; i++) {
if (exclusionList[i].size() != atomExclusions[i].size())
sameExclusions = false;
set<int> expectedExclusions;
expectedExclusions.insert(atomExclusions[i].begin(), atomExclusions[i].end());
for (int j = 0; j < (int) exclusionList[i].size(); j++)
if (expectedExclusions.find(exclusionList[i][j]) == expectedExclusions.end())
sameExclusions = false;
}
if (!sameExclusions)
throw OpenMMException("All Forces must have identical exceptions");
}
else {
atomExclusions = exclusionList;
anyExclusions = true;
}
}
static bool compareInt2(int2 a, int2 b) {
return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
}
void HipNonbondedUtilities::initialize(const System& system) {
string errorMessage = "Error initializing nonbonded utilities";
if (atomExclusions.size() == 0) {
// No exclusions were specifically requested, so just mark every atom as not interacting with itself.
atomExclusions.resize(context.getNumAtoms());
for (int i = 0; i < (int) atomExclusions.size(); i++)
atomExclusions[i].push_back(i);
}
// Create the list of tiles.
numAtoms = context.getNumAtoms();
int numAtomBlocks = context.getNumAtomBlocks();
int numContexts = context.getPlatformData().contexts.size();
setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);
// Build a list of tiles that contain exclusions.
set<pair<int, int> > tilesWithExclusions;
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/HipContext::TileSize;
for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
int atom2 = atomExclusions[atom1][j];
int y = atom2/HipContext::TileSize;
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
}
}
vector<int2> exclusionTilesVec;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
exclusionTilesVec.push_back(make_int2(iter->first, iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareInt2);
exclusionTiles.initialize<int2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles.upload(exclusionTilesVec);
map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
int2 tile = exclusionTilesVec[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i;
}
vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
exclusionBlocksForBlock[iter->first].push_back(iter->second);
if (iter->first != iter->second)
exclusionBlocksForBlock[iter->second].push_back(iter->first);
}
vector<unsigned int> exclusionRowIndicesVec(numAtomBlocks+1, 0);
vector<unsigned int> exclusionIndicesVec;
for (int i = 0; i < numAtomBlocks; i++) {
exclusionIndicesVec.insert(exclusionIndicesVec.end(), exclusionBlocksForBlock[i].begin(), exclusionBlocksForBlock[i].end());
exclusionRowIndicesVec[i+1] = exclusionIndicesVec.size();
}
maxExclusions = 0;
for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
exclusionIndices.initialize<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
exclusionRowIndices.initialize<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
exclusionIndices.upload(exclusionIndicesVec);
exclusionRowIndices.upload(exclusionRowIndicesVec);
// Record the exclusion data.
exclusions.initialize<tileflags>(context, tilesWithExclusions.size()*HipContext::TileSize, "exclusions");
tileflags allFlags = static_cast<tileflags>(-1);
vector<tileflags> exclusionVec(exclusions.getSize(), allFlags);
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/HipContext::TileSize;
int offset1 = atom1-x*HipContext::TileSize;
for (int j = 0; j < (int) atomExclusions[atom1].size(); ++j) {
int atom2 = atomExclusions[atom1][j];
int y = atom2/HipContext::TileSize;
int offset2 = atom2-y*HipContext::TileSize;
if (x > y) {
int index = exclusionTileMap[make_pair(x, y)]*HipContext::TileSize;
exclusionVec[index+offset1] &= allFlags-(static_cast<tileflags>(1)<<offset2);
}
else {
int index = exclusionTileMap[make_pair(y, x)]*HipContext::TileSize;
exclusionVec[index+offset2] &= allFlags-(static_cast<tileflags>(1)<<offset1);
}
}
}
atomExclusions.clear(); // We won't use this again, so free the memory it used
exclusions.upload(exclusionVec);
// Create data structures for the neighbor list.
if (useCutoff) {
// Select a size for the arrays that hold the neighbor list. We have to make a fairly
// arbitrary guess, but if this turns out to be too small we'll increase it later.
maxTiles = 20*numAtomBlocks;
if (maxTiles > numTiles)
maxTiles = numTiles;
if (maxTiles < 1)
maxTiles = 1;
maxSinglePairs = 5*numAtoms;
interactingTiles.initialize<int>(context, maxTiles, "interactingTiles");
interactingAtoms.initialize<int>(context, HipContext::TileSize*maxTiles, "interactingAtoms");
interactionCount.initialize<unsigned int>(context, 2, "interactionCount");
singlePairs.initialize<int2>(context, maxSinglePairs, "singlePairs");
int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter.initialize(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox.initialize(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedBlocks.initialize(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
sortedBlockCenter.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox.initialize(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions.initialize(context, numAtoms, 4*elementSize, "oldPositions");
rebuildNeighborList.initialize<int>(context, 1, "rebuildNeighborList");
blockSorter = new HipSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<unsigned int> count(2, 0);
interactionCount.upload(count);
rebuildNeighborList.upload(&count[0]);
}
// Record arguments for kernels.
forceArgs.push_back(&context.getForce().getDevicePointer());
forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
forceArgs.push_back(&context.getPosq().getDevicePointer());
forceArgs.push_back(&exclusions.getDevicePointer());
forceArgs.push_back(&exclusionTiles.getDevicePointer());
forceArgs.push_back(&startTileIndex);
forceArgs.push_back(&numTiles);
if (useCutoff) {
forceArgs.push_back(&interactingTiles.getDevicePointer());
forceArgs.push_back(&interactionCount.getDevicePointer());
forceArgs.push_back(context.getPeriodicBoxSizePointer());
forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
forceArgs.push_back(context.getPeriodicBoxVecXPointer());
forceArgs.push_back(context.getPeriodicBoxVecYPointer());
forceArgs.push_back(context.getPeriodicBoxVecZPointer());
forceArgs.push_back(&maxTiles);
forceArgs.push_back(&blockCenter.getDevicePointer());
forceArgs.push_back(&blockBoundingBox.getDevicePointer());
forceArgs.push_back(&interactingAtoms.getDevicePointer());
forceArgs.push_back(&maxSinglePairs);
forceArgs.push_back(&singlePairs.getDevicePointer());
}
for (int i = 0; i < (int) parameters.size(); i++)
forceArgs.push_back(&parameters[i].getMemory());
for (ParameterInfo& arg : arguments)
forceArgs.push_back(&arg.getMemory());
if (energyParameterDerivatives.size() > 0)
forceArgs.push_back(&context.getEnergyParamDerivBuffer().getDevicePointer());
if (useCutoff) {
findBlockBoundsArgs.push_back(&numAtoms);
findBlockBoundsArgs.push_back(context.getPeriodicBoxSizePointer());
findBlockBoundsArgs.push_back(context.getInvPeriodicBoxSizePointer());
findBlockBoundsArgs.push_back(context.getPeriodicBoxVecXPointer());
findBlockBoundsArgs.push_back(context.getPeriodicBoxVecYPointer());
findBlockBoundsArgs.push_back(context.getPeriodicBoxVecZPointer());
findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
findBlockBoundsArgs.push_back(&blockCenter.getDevicePointer());
findBlockBoundsArgs.push_back(&blockBoundingBox.getDevicePointer());
findBlockBoundsArgs.push_back(&rebuildNeighborList.getDevicePointer());
findBlockBoundsArgs.push_back(&sortedBlocks.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlocks.getDevicePointer());
sortBoxDataArgs.push_back(&blockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&blockBoundingBox.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockCenter.getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
sortBoxDataArgs.push_back(&oldPositions.getDevicePointer());
sortBoxDataArgs.push_back(&interactionCount.getDevicePointer());
sortBoxDataArgs.push_back(&rebuildNeighborList.getDevicePointer());
sortBoxDataArgs.push_back(&forceRebuildNeighborList);
findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecYPointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecZPointer());
findInteractingBlocksArgs.push_back(&interactionCount.getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingTiles.getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingAtoms.getDevicePointer());
findInteractingBlocksArgs.push_back(&singlePairs.getDevicePointer());
findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
findInteractingBlocksArgs.push_back(&maxTiles);
findInteractingBlocksArgs.push_back(&maxSinglePairs);
findInteractingBlocksArgs.push_back(&startBlockIndex);
findInteractingBlocksArgs.push_back(&numBlocks);
findInteractingBlocksArgs.push_back(&sortedBlocks.getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockCenter.getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox.getDevicePointer());
findInteractingBlocksArgs.push_back(&exclusionIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&exclusionRowIndices.getDevicePointer());
findInteractingBlocksArgs.push_back(&oldPositions.getDevicePointer());
findInteractingBlocksArgs.push_back(&rebuildNeighborList.getDevicePointer());
}
}
double HipNonbondedUtilities::getMaxCutoffDistance() {
double cutoff = 0.0;
for (map<int, double>::const_iterator iter = groupCutoff.begin(); iter != groupCutoff.end(); ++iter)
cutoff = max(cutoff, iter->second);
return cutoff;
}
double HipNonbondedUtilities::padCutoff(double cutoff) {
double padding = (usePadding ? 0.08*cutoff : 0.0);
return cutoff+padding;
}
void HipNonbondedUtilities::prepareInteractions(int forceGroups) {
if ((forceGroups&groupFlags) == 0)
return;
if (groupKernels.find(forceGroups) == groupKernels.end())
createKernelsForGroups(forceGroups);
if (!useCutoff)
return;
if (numTiles == 0)
return;
KernelSet& kernels = groupKernels[forceGroups];
if (usePeriodic) {
double4 box = context.getPeriodicBoxSize();
double minAllowedSize = 1.999999*kernels.cutoffDistance;
if (box.x < minAllowedSize || box.y < minAllowedSize || box.z < minAllowedSize)
throw OpenMMException("The periodic box size has decreased to less than twice the nonbonded cutoff.");
}
// Compute the neighbor list.
if (lastCutoff != kernels.cutoffDistance)
forceRebuildNeighborList = true;
context.executeKernel(kernels.findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
blockSorter->sort(sortedBlocks);
context.executeKernel(kernels.sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
context.executeKernel(kernels.findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
forceRebuildNeighborList = false;
lastCutoff = kernels.cutoffDistance;
interactionCount.download(pinnedCountBuffer, false);
hipEventRecord(downloadCountEvent, context.getCurrentStream());
}
void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
if ((forceGroups&groupFlags) == 0)
return;
KernelSet& kernels = groupKernels[forceGroups];
if (kernels.hasForces) {
hipFunction_t& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel);
if (kernel == NULL)
kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
context.executeKernel(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
if (useCutoff && numTiles > 0) {
hipEventSynchronize(downloadCountEvent);
updateNeighborListSize();
}
}
bool HipNonbondedUtilities::updateNeighborListSize() {
if (!useCutoff)
return false;
if (pinnedCountBuffer[0] <= maxTiles && pinnedCountBuffer[1] <= maxSinglePairs)
return false;
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// this from happening in the future.
if (pinnedCountBuffer[0] > maxTiles) {
maxTiles = (int) (1.2*pinnedCountBuffer[0]);
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
if (maxTiles > totalTiles)
maxTiles = totalTiles;
interactingTiles.resize(maxTiles);
interactingAtoms.resize(HipContext::TileSize*maxTiles);
if (forceArgs.size() > 0)
forceArgs[7] = &interactingTiles.getDevicePointer();
findInteractingBlocksArgs[6] = &interactingTiles.getDevicePointer();
if (forceArgs.size() > 0)
forceArgs[17] = &interactingAtoms.getDevicePointer();
findInteractingBlocksArgs[7] = &interactingAtoms.getDevicePointer();
}
if (pinnedCountBuffer[1] > maxSinglePairs) {
maxSinglePairs = (int) (1.2*pinnedCountBuffer[1]);
singlePairs.resize(maxSinglePairs);
if (forceArgs.size() > 0)
forceArgs[19] = &singlePairs.getDevicePointer();
findInteractingBlocksArgs[8] = &singlePairs.getDevicePointer();
}
forceRebuildNeighborList = true;
context.setForcesValid(false);
return true;
}
void HipNonbondedUtilities::setUsePadding(bool padding) {
usePadding = padding;
}
void HipNonbondedUtilities::setAtomBlockRange(double startFraction, double endFraction) {
int numAtomBlocks = context.getNumAtomBlocks();
startBlockIndex = (int) (startFraction*numAtomBlocks);
numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
long long totalTiles = context.getNumAtomBlocks()*((long long)context.getNumAtomBlocks()+1)/2;
startTileIndex = (int) (startFraction*totalTiles);
numTiles = (long long) (endFraction*totalTiles)-startTileIndex;
forceRebuildNeighborList = true;
}
void HipNonbondedUtilities::createKernelsForGroups(int groups) {
KernelSet kernels;
double cutoff = 0.0;
string source;
for (int i = 0; i < 32; i++) {
if ((groups&(1<<i)) != 0) {
cutoff = max(cutoff, groupCutoff[i]);
source += groupKernelSource[i];
}
}
kernels.hasForces = (source.size() > 0);
kernels.cutoffDistance = cutoff;
kernels.source = source;
kernels.forceKernel = kernels.energyKernel = kernels.forceEnergyKernel = NULL;
if (useCutoff) {
double paddedCutoff = padCutoff(cutoff);
map<string, string> defines;
defines["TILE_SIZE"] = context.intToString(HipContext::TileSize);
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDING"] = context.doubleToString(paddedCutoff-cutoff);
defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles.getSize());
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
if (context.getBoxIsTriclinic())
defines["TRICLINIC"] = "1";
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["MAX_BITS_FOR_PAIRS"] = (canUsePairList ? "2" : "0");
hipModule_t interactingBlocksProgram = context.createModule(HipKernelSources::vectorOps+HipKernelSources::findInteractingBlocks, defines);
kernels.findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
kernels.sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
kernels.findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
}
groupKernels[groups] = kernels;
}
hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric, int groups, bool includeForces, bool includeEnergy) {
map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source;
const string suffixes[] = {"x", "y", "z", "w"};
stringstream localData;
int localDataSize = 0;
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
localData<<param.getType()<<" "<<param.getName()<<";\n";
else {
for (int j = 0; j < param.getNumComponents(); ++j)
localData<<param.getComponentType()<<" "<<param.getName()<<"_"<<suffixes[j]<<";\n";
}
localDataSize += param.getSize();
}
replacements["ATOM_PARAMETER_DATA"] = localData.str();
stringstream args;
for (const ParameterInfo& param : params) {
args << ", ";
if (param.isConstant())
args << "const ";
args << param.getType();
args << "* __restrict__ global_";
args << param.getName();
}
for (const ParameterInfo& arg : arguments) {
args << ", ";
if (arg.isConstant())
args << "const ";
args << arg.getType();
args << "* __restrict__ ";
args << arg.getName();
}
if (energyParameterDerivatives.size() > 0)
args << ", mixed* __restrict__ energyParamDerivs";
replacements["PARAMETER_ARGUMENTS"] = args.str();
stringstream load1;
for (const ParameterInfo& param : params) {
load1 << param.getType();
load1 << " ";
load1 << param.getName();
load1 << "1 = global_";
load1 << param.getName();
load1 << "[atom1];\n";
}
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
bool useShuffle = context.getSupportsWarpShuffle();
// Part 1. Defines for on diagonal exclusion tiles
stringstream loadLocal1;
if(useShuffle) {
// not needed if using shuffles as we can directly fetch from register
} else {
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
loadLocal1<<"localData[LOCAL_ID]."<<param.getName()<<" = "<<param.getName()<<"1;\n";
else {
for (int j = 0; j < param.getNumComponents(); ++j)
loadLocal1<<"localData[LOCAL_ID]."<<param.getName()<<"_"<<suffixes[j]<<" = "<<param.getName()<<"1."<<suffixes[j]<<";\n";
}
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
stringstream broadcastWarpData;
if (useShuffle) {
broadcastWarpData << "posq2.x = real_shfl(shflPosq.x, j);\n";
broadcastWarpData << "posq2.y = real_shfl(shflPosq.y, j);\n";
broadcastWarpData << "posq2.z = real_shfl(shflPosq.z, j);\n";
broadcastWarpData << "posq2.w = real_shfl(shflPosq.w, j);\n";
for (const ParameterInfo& param : params) {
broadcastWarpData << param.getType() << " shfl" << param.getName() << ";\n";
for (int j = 0; j < param.getNumComponents(); j++) {
if (param.getNumComponents() == 1)
broadcastWarpData << "shfl" << param.getName() << "=real_shfl(" << param.getName() <<"1,j);\n";
else
broadcastWarpData << "shfl" << param.getName()+"."+suffixes[j] << "=real_shfl(" << param.getName()+"1."+suffixes[j] <<",j);\n";
}
}
} else {
// not used if not shuffling
}
replacements["BROADCAST_WARP_DATA"] = broadcastWarpData.str();
// Part 2. Defines for off-diagonal exclusions, and neighborlist tiles.
stringstream declareLocal2;
if (useShuffle) {
for (const ParameterInfo& param : params)
declareLocal2<<param.getType()<<" shfl"<<param.getName()<<";\n";
}
else {
// not used if using shared memory
}
replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal2.str();
stringstream loadLocal2;
if (useShuffle) {
for (const ParameterInfo& param : params)
loadLocal2<<"shfl"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
}
else {
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
loadLocal2<<"localData[LOCAL_ID]."<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
else {
loadLocal2<<param.getType()<<" temp_"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
for (int j = 0; j < param.getNumComponents(); ++j)
loadLocal2<<"localData[LOCAL_ID]."<<param.getName()<<"_"<<suffixes[j]<<" = temp_"<<param.getName()<<"."<<suffixes[j]<<";\n";
}
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
stringstream load2j;
if (useShuffle) {
for (const ParameterInfo& param : params)
load2j<<param.getType()<<" "<<param.getName()<<"2 = shfl"<<param.getName()<<";\n";
}
else {
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
load2j<<param.getType()<<" "<<param.getName()<<"2 = localData[atom2]."<<param.getName()<<";\n";
else {
load2j<<param.getType()<<" "<<param.getName()<<"2 = make_"<<param.getType()<<"(";
for (int j = 0; j < param.getNumComponents(); ++j) {
if (j > 0)
load2j<<", ";
load2j<<"localData[atom2]."<<param.getName()<<"_"<<suffixes[j];
}
load2j<<");\n";
}
}
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
stringstream clearLocal;
for (const ParameterInfo& param : params) {
if (useShuffle)
clearLocal<<"shfl";
else
clearLocal<<"localData[atom2].";
clearLocal<<param.getName()<<" = ";
if (param.getNumComponents() == 1)
clearLocal<<"0;\n";
else
clearLocal<<"make_"<<param.getType()<<"(0);\n";
}
replacements["CLEAR_LOCAL_PARAMETERS"] = clearLocal.str();
stringstream initDerivs;
for (int i = 0; i < energyParameterDerivatives.size(); i++)
initDerivs<<"mixed energyParamDeriv"<<i<<" = 0;\n";
replacements["INIT_DERIVATIVES"] = initDerivs.str();
stringstream saveDerivs;
const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
int numDerivs = allParamDerivNames.size();
for (int i = 0; i < energyParameterDerivatives.size(); i++)
for (int index = 0; index < numDerivs; index++)
if (allParamDerivNames[index] == energyParameterDerivatives[i])
saveDerivs<<"energyParamDerivs[GLOBAL_ID*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
stringstream shuffleWarpData;
if(useShuffle) {
shuffleWarpData << "shflPosq.x = real_shfl(shflPosq.x, tgx+1);\n";
shuffleWarpData << "shflPosq.y = real_shfl(shflPosq.y, tgx+1);\n";
shuffleWarpData << "shflPosq.z = real_shfl(shflPosq.z, tgx+1);\n";
shuffleWarpData << "shflPosq.w = real_shfl(shflPosq.w, tgx+1);\n";
shuffleWarpData << "shflForce.x = real_shfl(shflForce.x, tgx+1);\n";
shuffleWarpData << "shflForce.y = real_shfl(shflForce.y, tgx+1);\n";
shuffleWarpData << "shflForce.z = real_shfl(shflForce.z, tgx+1);\n";
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
shuffleWarpData<<"shfl"<<param.getName()<<"=real_shfl(shfl"<<param.getName()<<", tgx+1);\n";
else {
for (int j = 0; j < param.getNumComponents(); j++) {
// looks something like shflsigmaEpsilon.x = real_shfl(shflsigmaEpsilon.x,tgx+1);
shuffleWarpData<<"shfl"<<param.getName()
<<"."<<suffixes[j]<<"=real_shfl(shfl"
<<param.getName()<<"."<<suffixes[j]
<<", tgx+1);\n";
}
}
}
} else {
// not used otherwise
}
replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
map<string, string> defines;
if (useCutoff)
defines["USE_CUTOFF"] = "1";
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
if (useExclusions)
defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric)
defines["USE_SYMMETRIC"] = "1";
if (useShuffle)
defines["ENABLE_SHUFFLE"] = "1";
if (includeForces)
defines["INCLUDE_FORCES"] = "1";
if (includeEnergy)
defines["INCLUDE_ENERGY"] = "1";
defines["THREAD_BLOCK_SIZE"] = context.intToString(forceThreadBlockSize);
double maxCutoff = 0.0;
for (int i = 0; i < 32; i++) {
if ((groups&(1<<i)) != 0) {
double cutoff = groupCutoff[i];
maxCutoff = max(maxCutoff, cutoff);
defines["CUTOFF_"+context.intToString(i)+"_SQUARED"] = context.doubleToString(cutoff*cutoff);
defines["CUTOFF_"+context.intToString(i)] = context.doubleToString(cutoff);
}
}
defines["MAX_CUTOFF"] = context.doubleToString(maxCutoff);
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["TILE_SIZE"] = context.intToString(HipContext::TileSize);
int numExclusionTiles = exclusionTiles.getSize();
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
int numContexts = context.getPlatformData().contexts.size();
int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts;
defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex);
defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
hipModule_t program = context.createModule(HipKernelSources::vectorOps+context.replaceStrings(kernelSource, replacements), defines);
hipFunction_t kernel = context.getKernel(program, "computeNonbonded");
return kernel;
}
void HipNonbondedUtilities::setKernelSource(const string& source) {
kernelSource = source;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipParallelKernels.h"
#include "HipKernelSources.h"
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result, prefix) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<prefix<<": "<<cu.getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
/**
* Get the current clock time, measured in microseconds.
*/
#ifdef _MSC_VER
#error "Windows unsupported for HIP platform"
#include <Windows.h>
static long long getTime() {
FILETIME ft;
GetSystemTimeAsFileTime(&ft); // 100-nanoseconds since 1-1-1601
ULARGE_INTEGER result;
result.LowPart = ft.dwLowDateTime;
result.HighPart = ft.dwHighDateTime;
return result.QuadPart/10;
}
#else
#include <sys/time.h>
static long long getTime() {
struct timeval tod;
gettimeofday(&tod, 0);
return 1000000*tod.tv_sec+tod.tv_usec;
}
#endif
class HipParallelCalcForcesAndEnergyKernel::BeginComputationTask : public HipContext::WorkTask {
public:
BeginComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, hipEvent_t event, int2& interactionCount) : context(context), cu(cu), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), interactionCount(interactionCount) {
}
void execute() {
// Copy coordinates over to this device and execute the kernel.
cu.setAsCurrent();
if (cu.getContextIndex() > 0) {
hipStreamWaitEvent(cu.getCurrentStream(), event, 0);
if (!cu.getPlatformData().peerAccessSupported)
cu.getPosq().upload(pinnedMemory, false);
}
kernel.beginComputation(context, includeForce, includeEnergy, groups);
if (cu.getNonbondedUtilities().getUsePeriodic())
cu.getNonbondedUtilities().getInteractionCount().download(&interactionCount, false);
}
private:
ContextImpl& context;
HipContext& cu;
HipCalcForcesAndEnergyKernel& kernel;
bool includeForce, includeEnergy;
int groups;
void* pinnedMemory;
hipEvent_t event;
int2& interactionCount;
};
class HipParallelCalcForcesAndEnergyKernel::FinishComputationTask : public HipContext::WorkTask {
public:
FinishComputationTask(ContextImpl& context, HipContext& cu, HipCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, HipArray& contextForces, bool& valid, int2& interactionCount) :
context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), interactionCount(interactionCount) {
}
void execute() {
// Execute the kernel, then download forces.
energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
if (cu.getComputeForceCount() < 200) {
// Record timing information for load balancing. Since this takes time, only do it at the start of the simulation.
CHECK_RESULT(hipStreamSynchronize(cu.getCurrentStream()), "Error synchronizing HIP context");
completionTime = getTime();
}
if (includeForce) {
if (cu.getContextIndex() > 0) {
int numAtoms = cu.getPaddedNumAtoms();
if (cu.getPlatformData().peerAccessSupported) {
int numBytes = numAtoms*3*sizeof(long long);
int offset = (cu.getContextIndex()-1)*numBytes;
HipContext& context0 = *cu.getPlatformData().contexts[0];
CHECK_RESULT(hipMemcpy(static_cast<char*>(contextForces.getDevicePointer())+offset,
cu.getForce().getDevicePointer(), numBytes, hipMemcpyDeviceToDevice), "Error copying forces");
}
else
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
}
}
if (cu.getNonbondedUtilities().getUsePeriodic() && (interactionCount.x > cu.getNonbondedUtilities().getInteractingTiles().getSize() ||
interactionCount.y > cu.getNonbondedUtilities().getSinglePairs().getSize())) {
valid = false;
cu.getNonbondedUtilities().updateNeighborListSize();
}
}
private:
ContextImpl& context;
HipContext& cu;
HipCalcForcesAndEnergyKernel& kernel;
bool includeForce, includeEnergy;
int groups;
double& energy;
long long& completionTime;
long long* pinnedMemory;
HipArray& contextForces;
bool& valid;
int2& interactionCount;
};
HipParallelCalcForcesAndEnergyKernel::HipParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, HipPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
interactionCounts(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new HipCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
}
HipParallelCalcForcesAndEnergyKernel::~HipParallelCalcForcesAndEnergyKernel() {
data.contexts[0]->setAsCurrent();
if (pinnedPositionBuffer != NULL)
hipHostFree(pinnedPositionBuffer);
if (pinnedForceBuffer != NULL)
hipHostFree(pinnedForceBuffer);
hipEventDestroy(event);
hipStreamDestroy(peerCopyStream);
if (interactionCounts != NULL)
hipHostFree(interactionCounts);
}
void HipParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
HipContext& cu = *data.contexts[0];
cu.setAsCurrent();
hipModule_t module = cu.createModule(HipKernelSources::parallel);
sumKernel = cu.getKernel(module, "sumForces");
int numContexts = data.contexts.size();
for (int i = 0; i < numContexts; i++)
getKernel(i).initialize(system);
for (int i = 0; i < numContexts; i++)
contextNonbondedFractions[i] = 1/(double) numContexts;
CHECK_RESULT(hipEventCreateWithFlags(&event, 0), "Error creating event");
CHECK_RESULT(hipStreamCreateWithFlags(&peerCopyStream, hipStreamNonBlocking), "Error creating stream");
CHECK_RESULT(hipHostMalloc((void**) &interactionCounts, numContexts*sizeof(int2), 0), "Error creating interaction counts buffer");
}
void HipParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
HipContext& cu = *data.contexts[0];
cu.setAsCurrent();
if (!contextForces.isInitialized()) {
contextForces.initialize<long long>(cu, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms(), "contextForces");
CHECK_RESULT(hipHostMalloc((void**) &pinnedForceBuffer, 3*(data.contexts.size()-1)*cu.getPaddedNumAtoms()*sizeof(long long), hipHostMallocPortable), "Error allocating pinned memory");
CHECK_RESULT(hipHostMalloc(&pinnedPositionBuffer, cu.getPaddedNumAtoms()*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4)), hipHostMallocPortable), "Error allocating pinned memory");
}
// Copy coordinates over to each device and execute the kernel.
if (!cu.getPlatformData().peerAccessSupported) {
cu.getPosq().download(pinnedPositionBuffer, false);
hipEventRecord(event, cu.getCurrentStream());
}
else {
int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
hipEventRecord(event, cu.getCurrentStream());
hipStreamWaitEvent(peerCopyStream, event, 0);
for (int i = 1; i < (int) data.contexts.size(); i++)
CHECK_RESULT(hipMemcpyAsync(
data.contexts[i]->getPosq().getDevicePointer(),
cu.getPosq().getDevicePointer(), numBytes,
hipMemcpyDeviceToDevice, peerCopyStream), "Error copying positions");
hipEventRecord(event, peerCopyStream);
}
for (int i = 0; i < (int) data.contexts.size(); i++) {
data.contextEnergy[i] = 0.0;
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, interactionCounts[i]));
}
}
double HipParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, contextForces, valid, interactionCounts[i]));
}
data.syncContexts();
double energy = 0.0;
for (int i = 0; i < (int) data.contextEnergy.size(); i++)
energy += data.contextEnergy[i];
if (includeForce && valid) {
// Sum the forces from all devices.
HipContext& cu = *data.contexts[0];
if (!cu.getPlatformData().peerAccessSupported)
contextForces.upload(pinnedForceBuffer, false);
int bufferSize = 3*cu.getPaddedNumAtoms();
int numBuffers = data.contexts.size()-1;
void* args[] = {&cu.getForce().getDevicePointer(), &contextForces.getDevicePointer(), &bufferSize, &numBuffers};
cu.executeKernel(sumKernel, args, bufferSize);
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
if (cu.getComputeForceCount() < 200) {
int firstIndex = 0, lastIndex = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex])
firstIndex = i;
if (completionTimes[i] > completionTimes[lastIndex])
lastIndex = i;
}
double fractionToTransfer = min(0.01, contextNonbondedFractions[lastIndex]);
contextNonbondedFractions[firstIndex] += fractionToTransfer;
contextNonbondedFractions[lastIndex] -= fractionToTransfer;
double startFraction = 0.0;
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
double endFraction = startFraction+contextNonbondedFractions[i];
if (i == contextNonbondedFractions.size()-1)
endFraction = 1.0; // Avoid roundoff error
data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
startFraction = endFraction;
}
}
}
return energy;
}
class HipParallelCalcHarmonicBondForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcHarmonicBondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcHarmonicBondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcHarmonicBondForceKernel::HipParallelCalcHarmonicBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcHarmonicBondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcHarmonicBondForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcHarmonicBondForceKernel::initialize(const System& system, const HarmonicBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcHarmonicBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcHarmonicBondForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomBondForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomBondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomBondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomBondForceKernel::HipParallelCalcCustomBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomBondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomBondForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomBondForceKernel::initialize(const System& system, const CustomBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcHarmonicAngleForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcHarmonicAngleForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcHarmonicAngleForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcHarmonicAngleForceKernel::HipParallelCalcHarmonicAngleForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcHarmonicAngleForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcHarmonicAngleForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcHarmonicAngleForceKernel::initialize(const System& system, const HarmonicAngleForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcHarmonicAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcHarmonicAngleForceKernel::copyParametersToContext(ContextImpl& context, const HarmonicAngleForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomAngleForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomAngleForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomAngleForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomAngleForceKernel::HipParallelCalcCustomAngleForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomAngleForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomAngleForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomAngleForceKernel::initialize(const System& system, const CustomAngleForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomAngleForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomAngleForceKernel::copyParametersToContext(ContextImpl& context, const CustomAngleForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcPeriodicTorsionForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcPeriodicTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcPeriodicTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcPeriodicTorsionForceKernel::HipParallelCalcPeriodicTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcPeriodicTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcPeriodicTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcPeriodicTorsionForceKernel::initialize(const System& system, const PeriodicTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcPeriodicTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcPeriodicTorsionForceKernel::copyParametersToContext(ContextImpl& context, const PeriodicTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcRBTorsionForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcRBTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcRBTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcRBTorsionForceKernel::HipParallelCalcRBTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcRBTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcRBTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcRBTorsionForceKernel::initialize(const System& system, const RBTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcRBTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcRBTorsionForceKernel::copyParametersToContext(ContextImpl& context, const RBTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCMAPTorsionForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCMAPTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCMAPTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCMAPTorsionForceKernel::HipParallelCalcCMAPTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCMAPTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCMAPTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAPTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomTorsionForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomTorsionForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomTorsionForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomTorsionForceKernel::HipParallelCalcCustomTorsionForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomTorsionForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomTorsionForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomTorsionForceKernel::initialize(const System& system, const CustomTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomTorsionForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CustomTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcNonbondedForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, HipCalcNonbondedForceKernel& kernel, bool includeForce,
bool includeEnergy, bool includeDirect, bool includeReciprocal, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), includeDirect(includeDirect), includeReciprocal(includeReciprocal), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy, includeDirect, includeReciprocal);
}
private:
ContextImpl& context;
HipCalcNonbondedForceKernel& kernel;
bool includeForce, includeEnergy, includeDirect, includeReciprocal;
double& energy;
};
HipParallelCalcNonbondedForceKernel::HipParallelCalcNonbondedForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcNonbondedForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new HipCalcNonbondedForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcNonbondedForceKernel::initialize(const System& system, const NonbondedForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, includeDirect, includeReciprocal, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const NonbondedForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
void HipParallelCalcNonbondedForceKernel::getPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
dynamic_cast<const HipCalcNonbondedForceKernel&>(kernels[0].getImpl()).getPMEParameters(alpha, nx, ny, nz);
}
void HipParallelCalcNonbondedForceKernel::getLJPMEParameters(double& alpha, int& nx, int& ny, int& nz) const {
dynamic_cast<const HipCalcNonbondedForceKernel&>(kernels[0].getImpl()).getLJPMEParameters(alpha, nx, ny, nz);
}
class HipParallelCalcCustomNonbondedForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomNonbondedForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomNonbondedForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomNonbondedForceKernel::HipParallelCalcCustomNonbondedForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomNonbondedForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomNonbondedForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomNonbondedForceKernel::initialize(const System& system, const CustomNonbondedForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& context, const CustomNonbondedForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomExternalForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomExternalForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomExternalForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomExternalForceKernel::HipParallelCalcCustomExternalForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomExternalForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomExternalForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomExternalForceKernel::initialize(const System& system, const CustomExternalForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomExternalForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomExternalForceKernel::copyParametersToContext(ContextImpl& context, const CustomExternalForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomHbondForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomHbondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomHbondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomHbondForceKernel::HipParallelCalcCustomHbondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomHbondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomHbondForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomHbondForceKernel::initialize(const System& system, const CustomHbondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomHbondForceKernel::copyParametersToContext(ContextImpl& context, const CustomHbondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class HipParallelCalcCustomCompoundBondForceKernel::Task : public HipContext::WorkTask {
public:
Task(ContextImpl& context, CommonCalcCustomCompoundBondForceKernel& kernel, bool includeForce,
bool includeEnergy, double& energy) : context(context), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), energy(energy) {
}
void execute() {
energy += kernel.execute(context, includeForce, includeEnergy);
}
private:
ContextImpl& context;
CommonCalcCustomCompoundBondForceKernel& kernel;
bool includeForce, includeEnergy;
double& energy;
};
HipParallelCalcCustomCompoundBondForceKernel::HipParallelCalcCustomCompoundBondForceKernel(std::string name, const Platform& platform, HipPlatform::PlatformData& data, const System& system) :
CalcCustomCompoundBondForceKernel(name, platform), data(data) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CommonCalcCustomCompoundBondForceKernel(name, platform, *data.contexts[i], system)));
}
void HipParallelCalcCustomCompoundBondForceKernel::initialize(const System& system, const CustomCompoundBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system, force);
}
double HipParallelCalcCustomCompoundBondForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
HipContext& cu = *data.contexts[i];
ComputeContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new Task(context, getKernel(i), includeForces, includeEnergy, data.contextEnergy[i]));
}
return 0.0;
}
void HipParallelCalcCustomCompoundBondForceKernel::copyParametersToContext(ContextImpl& context, const CustomCompoundBondForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipParameterSet.h"
using namespace OpenMM;
using namespace std;
HipParameterSet::HipParameterSet(HipContext& context, int numParameters, int numObjects, const string& name, bool bufferPerParameter, bool useDoublePrecision) :
ComputeParameterSet(context, numParameters, numObjects, name, bufferPerParameter, useDoublePrecision) {
for (auto& info : getParameterInfos())
buffers.push_back(HipNonbondedUtilities::ParameterInfo(info.getName(), info.getComponentType(), info.getNumComponents(), info.getSize(), context.unwrap(info.getArray()).getDevicePointer()));
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipContext.h"
#include "HipExpressionUtilities.h"
#include "HipPlatform.h"
#include "HipKernelFactory.h"
#include "HipKernels.h"
#include "openmm/Context.h"
#include "openmm/System.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/hardware.h"
#include <algorithm>
#include <cctype>
#include <sstream>
#include <cstdio>
#ifdef _MSC_VER
#error "Windows unsupported for HIP platform"
#endif
using namespace OpenMM;
using namespace std;
#define CHECK_RESULT(result, prefix) \
if (result != hipSuccess) { \
std::stringstream m; \
m<<prefix<<": "<<HipContext::getErrorString(result)<<" ("<<result<<")"<<" at "<<__FILE__<<":"<<__LINE__; \
throw OpenMMException(m.str());\
}
#ifdef OPENMM_COMMON_BUILDING_STATIC_LIBRARY
extern "C" void registerHipPlatform() {
Platform::registerPlatform(new HipPlatform());
}
#else
extern "C" OPENMM_EXPORT_COMMON void registerPlatforms() {
Platform::registerPlatform(new HipPlatform());
}
#endif
HipPlatform::HipPlatform() {
deprecatedPropertyReplacements["HipDeviceIndex"] = HipDeviceIndex();
deprecatedPropertyReplacements["HipDeviceName"] = HipDeviceName();
deprecatedPropertyReplacements["HipUseBlockingSync"] = HipUseBlockingSync();
deprecatedPropertyReplacements["HipPrecision"] = HipPrecision();
deprecatedPropertyReplacements["HipUseCpuPme"] = HipUseCpuPme();
deprecatedPropertyReplacements["HipTempDirectory"] = HipTempDirectory();
deprecatedPropertyReplacements["HipDisablePmeStream"] = HipDisablePmeStream();
deprecatedPropertyReplacements["HipDeterministicForces"] = HipDeterministicForces();
HipKernelFactory* factory = new HipKernelFactory();
registerKernelFactory(CalcForcesAndEnergyKernel::Name(), factory);
registerKernelFactory(UpdateStateDataKernel::Name(), factory);
registerKernelFactory(ApplyConstraintsKernel::Name(), factory);
registerKernelFactory(VirtualSitesKernel::Name(), factory);
registerKernelFactory(CalcHarmonicBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomBondForceKernel::Name(), factory);
registerKernelFactory(CalcHarmonicAngleForceKernel::Name(), factory);
registerKernelFactory(CalcCustomAngleForceKernel::Name(), factory);
registerKernelFactory(CalcPeriodicTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcRBTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcCMAPTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcCustomTorsionForceKernel::Name(), factory);
registerKernelFactory(CalcNonbondedForceKernel::Name(), factory);
registerKernelFactory(CalcCustomNonbondedForceKernel::Name(), factory);
registerKernelFactory(CalcGBSAOBCForceKernel::Name(), factory);
registerKernelFactory(CalcCustomGBForceKernel::Name(), factory);
registerKernelFactory(CalcCustomExternalForceKernel::Name(), factory);
registerKernelFactory(CalcCustomHbondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCentroidBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCompoundBondForceKernel::Name(), factory);
registerKernelFactory(CalcCustomCVForceKernel::Name(), factory);
registerKernelFactory(CalcRMSDForceKernel::Name(), factory);
registerKernelFactory(CalcCustomManyParticleForceKernel::Name(), factory);
registerKernelFactory(CalcGayBerneForceKernel::Name(), factory);
registerKernelFactory(IntegrateVerletStepKernel::Name(), factory);
registerKernelFactory(IntegrateNoseHooverStepKernel::Name(), factory);
registerKernelFactory(IntegrateLangevinStepKernel::Name(), factory);
registerKernelFactory(IntegrateLangevinMiddleStepKernel::Name(), factory);
registerKernelFactory(IntegrateBrownianStepKernel::Name(), factory);
registerKernelFactory(IntegrateVariableVerletStepKernel::Name(), factory);
registerKernelFactory(IntegrateVariableLangevinStepKernel::Name(), factory);
registerKernelFactory(IntegrateCustomStepKernel::Name(), factory);
registerKernelFactory(ApplyAndersenThermostatKernel::Name(), factory);
registerKernelFactory(ApplyMonteCarloBarostatKernel::Name(), factory);
registerKernelFactory(RemoveCMMotionKernel::Name(), factory);
platformProperties.push_back(HipDeviceIndex());
platformProperties.push_back(HipDeviceName());
platformProperties.push_back(HipUseBlockingSync());
platformProperties.push_back(HipPrecision());
platformProperties.push_back(HipUseCpuPme());
platformProperties.push_back(HipCompiler());
platformProperties.push_back(HipTempDirectory());
platformProperties.push_back(HipHostCompiler());
platformProperties.push_back(HipDisablePmeStream());
platformProperties.push_back(HipDeterministicForces());
setPropertyDefaultValue(HipDeviceIndex(), "");
setPropertyDefaultValue(HipDeviceName(), "");
setPropertyDefaultValue(HipUseBlockingSync(), "true");
setPropertyDefaultValue(HipPrecision(), "single");
setPropertyDefaultValue(HipUseCpuPme(), "false");
setPropertyDefaultValue(HipDisablePmeStream(), "false");
setPropertyDefaultValue(HipDeterministicForces(), "false");
char* compiler = getenv("OPENMM_HIP_COMPILER");
char* rocm_path = getenv("ROCM_PATH");
string hipcc;
if (rocm_path != NULL) {
hipcc = string(rocm_path) + "/bin/hipcc";
} else if (compiler != NULL) {
hipcc = compiler;
} else {
hipcc = "/opt/rocm/bin/hipcc";
}
setPropertyDefaultValue(HipCompiler(), hipcc);
char* tmpdir = getenv("TMPDIR");
string tmp = (tmpdir == NULL ? string(P_tmpdir) : string(tmpdir));
setPropertyDefaultValue(HipTempDirectory(), tmp);
char* hostCompiler = getenv("HIP_HOST_COMPILER");
setPropertyDefaultValue(HipHostCompiler(), (hostCompiler == NULL ? "" : string(hostCompiler)));
}
double HipPlatform::getSpeed() const {
return 100;
}
bool HipPlatform::supportsDoublePrecision() const {
return true;
}
const string& HipPlatform::getPropertyValue(const Context& context, const string& property) const {
const ContextImpl& impl = getContextImpl(context);
const PlatformData* data = reinterpret_cast<const PlatformData*>(impl.getPlatformData());
string propertyName = property;
if (deprecatedPropertyReplacements.find(property) != deprecatedPropertyReplacements.end())
propertyName = deprecatedPropertyReplacements.find(property)->second;
map<string, string>::const_iterator value = data->propertyValues.find(propertyName);
if (value != data->propertyValues.end())
return value->second;
return Platform::getPropertyValue(context, property);
}
void HipPlatform::setPropertyValue(Context& context, const string& property, const string& value) const {
}
void HipPlatform::contextCreated(ContextImpl& context, const map<string, string>& properties) const {
const string& devicePropValue = (properties.find(HipDeviceIndex()) == properties.end() ?
getPropertyDefaultValue(HipDeviceIndex()) : properties.find(HipDeviceIndex())->second);
string blockingPropValue = (properties.find(HipUseBlockingSync()) == properties.end() ?
getPropertyDefaultValue(HipUseBlockingSync()) : properties.find(HipUseBlockingSync())->second);
string precisionPropValue = (properties.find(HipPrecision()) == properties.end() ?
getPropertyDefaultValue(HipPrecision()) : properties.find(HipPrecision())->second);
string cpuPmePropValue = (properties.find(HipUseCpuPme()) == properties.end() ?
getPropertyDefaultValue(HipUseCpuPme()) : properties.find(HipUseCpuPme())->second);
const string& compilerPropValue = (properties.find(HipCompiler()) == properties.end() ?
getPropertyDefaultValue(HipCompiler()) : properties.find(HipCompiler())->second);
const string& tempPropValue = (properties.find(HipTempDirectory()) == properties.end() ?
getPropertyDefaultValue(HipTempDirectory()) : properties.find(HipTempDirectory())->second);
const string& hostCompilerPropValue = (properties.find(HipHostCompiler()) == properties.end() ?
getPropertyDefaultValue(HipHostCompiler()) : properties.find(HipHostCompiler())->second);
string pmeStreamPropValue = (properties.find(HipDisablePmeStream()) == properties.end() ?
getPropertyDefaultValue(HipDisablePmeStream()) : properties.find(HipDisablePmeStream())->second);
string deterministicForcesValue = (properties.find(HipDeterministicForces()) == properties.end() ?
getPropertyDefaultValue(HipDeterministicForces()) : properties.find(HipDeterministicForces())->second);
transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
transform(pmeStreamPropValue.begin(), pmeStreamPropValue.end(), pmeStreamPropValue.begin(), ::tolower);
transform(deterministicForcesValue.begin(), deterministicForcesValue.end(), deterministicForcesValue.begin(), ::tolower);
vector<string> pmeKernelName;
pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
if (!supportsKernels(pmeKernelName))
cpuPmePropValue = "false";
int threads = getNumProcessors();
char* threadsEnv = getenv("OPENMM_CPU_THREADS");
if (threadsEnv != NULL)
stringstream(threadsEnv) >> threads;
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, NULL));
}
void HipPlatform::linkedContextCreated(ContextImpl& context, ContextImpl& originalContext) const {
Platform& platform = originalContext.getPlatform();
string devicePropValue = platform.getPropertyValue(originalContext.getOwner(), HipDeviceIndex());
string blockingPropValue = platform.getPropertyValue(originalContext.getOwner(), HipUseBlockingSync());
string precisionPropValue = platform.getPropertyValue(originalContext.getOwner(), HipPrecision());
string cpuPmePropValue = platform.getPropertyValue(originalContext.getOwner(), HipUseCpuPme());
string compilerPropValue = platform.getPropertyValue(originalContext.getOwner(), HipCompiler());
string tempPropValue = platform.getPropertyValue(originalContext.getOwner(), HipTempDirectory());
string hostCompilerPropValue = platform.getPropertyValue(originalContext.getOwner(), HipHostCompiler());
string pmeStreamPropValue = platform.getPropertyValue(originalContext.getOwner(), HipDisablePmeStream());
string deterministicForcesValue = platform.getPropertyValue(originalContext.getOwner(), HipDeterministicForces());
int threads = reinterpret_cast<PlatformData*>(originalContext.getPlatformData())->threads.getNumThreads();
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue,
hostCompilerPropValue, pmeStreamPropValue, deterministicForcesValue, threads, &originalContext));
}
void HipPlatform::contextDestroyed(ContextImpl& context) const {
PlatformData* data = reinterpret_cast<PlatformData*>(context.getPlatformData());
delete data;
}
HipPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty, const string& pmeStreamProperty,
const string& deterministicForcesProperty, int numThreads, ContextImpl* originalContext) :
context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false), threads(numThreads) {
bool blocking = (blockingProperty == "true");
vector<string> devices;
size_t searchPos = 0, nextPos;
while ((nextPos = deviceIndexProperty.find_first_of(", ", searchPos)) != string::npos) {
devices.push_back(deviceIndexProperty.substr(searchPos, nextPos-searchPos));
searchPos = nextPos+1;
}
devices.push_back(deviceIndexProperty.substr(searchPos));
PlatformData* originalData = NULL;
if (originalContext != NULL)
originalData = reinterpret_cast<PlatformData*>(originalContext->getPlatformData());
try {
for (int i = 0; i < (int) devices.size(); i++) {
if (devices[i].length() > 0) {
int deviceIndex;
stringstream(devices[i]) >> deviceIndex;
contexts.push_back(new HipContext(system, deviceIndex, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[i])));
}
}
if (contexts.size() == 0)
contexts.push_back(new HipContext(system, -1, blocking, precisionProperty, compilerProperty, tempProperty, hostCompilerProperty, *this, (originalData == NULL ? NULL : originalData->contexts[0])));
}
catch (...) {
// If an exception was thrown, do our best to clean up memory.
for (int i = 0; i < (int) contexts.size(); i++)
delete contexts[i];
throw;
}
stringstream deviceIndex, deviceName;
for (int i = 0; i < (int) contexts.size(); i++) {
if (i > 0) {
deviceIndex << ',';
deviceName << ',';
}
deviceIndex << contexts[i]->getDeviceIndex();
char name[1000];
CHECK_RESULT(hipDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
deviceName << name;
}
useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
disablePmeStream = (pmeStreamProperty == "true");
deterministicForces = (deterministicForcesProperty == "true");
propertyValues[HipPlatform::HipDeviceIndex()] = deviceIndex.str();
propertyValues[HipPlatform::HipDeviceName()] = deviceName.str();
propertyValues[HipPlatform::HipUseBlockingSync()] = blocking ? "true" : "false";
propertyValues[HipPlatform::HipPrecision()] = precisionProperty;
propertyValues[HipPlatform::HipUseCpuPme()] = useCpuPme ? "true" : "false";
propertyValues[HipPlatform::HipCompiler()] = compilerProperty;
propertyValues[HipPlatform::HipTempDirectory()] = tempProperty;
propertyValues[HipPlatform::HipHostCompiler()] = hostCompilerProperty;
propertyValues[HipPlatform::HipDisablePmeStream()] = disablePmeStream ? "true" : "false";
propertyValues[HipPlatform::HipDeterministicForces()] = deterministicForces ? "true" : "false";
contextEnergy.resize(contexts.size());
// Determine whether peer-to-peer copying is supported, and enable it if so.
peerAccessSupported = true;
for (int i = 1; i < contexts.size(); i++) {
int canAccess;
hipDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
if (!canAccess) {
peerAccessSupported = false;
break;
}
}
}
HipPlatform::PlatformData::~PlatformData() {
for (int i = 0; i < (int) contexts.size(); i++)
delete contexts[i];
}
void HipPlatform::PlatformData::initializeContexts(const System& system) {
if (hasInitializedContexts)
return;
for (int i = 0; i < (int) contexts.size(); i++)
contexts[i]->initialize();
hasInitializedContexts = true;
}
void HipPlatform::PlatformData::syncContexts() {
for (int i = 0; i < (int) contexts.size(); i++)
contexts[i]->getWorkThread().flush();
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipProgram.h"
#include "HipKernel.h"
using namespace OpenMM;
using namespace std;
HipProgram::HipProgram(HipContext& context, hipModule_t module) : context(context), module(module) {
}
ComputeKernel HipProgram::createKernel(const string& name) {
hipFunction_t kernel = context.getKernel(module, name.c_str());
return shared_ptr<ComputeKernelImpl>(new HipKernel(context, kernel, name));
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010-2018 Stanford University and the Authors. *
* Portions copyright (c) 2020 Advanced Micro Devices, Inc. *
* Authors: Peter Eastman, Nicholas Curtis *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "HipSort.h"
#include "HipKernelSources.h"
#include <algorithm>
#include <map>
using namespace OpenMM;
using namespace std;
HipSort::HipSort(HipContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), dataLength(length) {
// Create kernels.
map<string, string> replacements;
replacements["DATA_TYPE"] = trait->getDataType();
replacements["KEY_TYPE"] = trait->getKeyType();
replacements["SORT_KEY"] = trait->getSortKey();
replacements["MIN_KEY"] = trait->getMinKey();
replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue();
hipModule_t module = context.createModule(context.replaceStrings(HipKernelSources::sort, replacements));
shortListKernel = context.getKernel(module, "sortShortList");
shortList2Kernel = context.getKernel(module, "sortShortList2");
computeRangeKernel = context.getKernel(module, "computeRange");
assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
copyToBucketsKernel = context.getKernel(module, "copyDataToBuckets");
sortBucketsKernel = context.getKernel(module, "sortBuckets");
// Work out the work group sizes for various kernels.
int maxBlockSize;
hipDeviceGetAttribute(&maxBlockSize, hipDeviceAttributeMaxBlockDimX, context.getDevice());
int maxSharedMem;
hipDeviceGetAttribute(&maxSharedMem, hipDeviceAttributeMaxSharedMemoryPerBlock, context.getDevice());
int maxLocalBuffer = (maxSharedMem/trait->getDataSize())/2;
int maxShortList = min(3000, max(maxLocalBuffer, HipContext::ThreadBlockSize*context.getNumThreadBlocks()));
isShortList = (length <= maxShortList);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
;
positionsKernelSize = rangeKernelSize;
sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
if (rangeKernelSize > length)
rangeKernelSize = length;
if (sortKernelSize > maxLocalBuffer)
sortKernelSize = maxLocalBuffer;
unsigned int targetBucketSize = sortKernelSize/2;
unsigned int numBuckets = length/targetBucketSize;
if (numBuckets < 1)
numBuckets = 1;
if (positionsKernelSize > numBuckets)
positionsKernelSize = numBuckets;
// Create workspace arrays.
if (!isShortList) {
dataRange.initialize(context, 2, trait->getKeySize(), "sortDataRange");
bucketOffset.initialize<uint1>(context, numBuckets, "bucketOffset");
bucketOfElement.initialize<uint1>(context, length, "bucketOfElement");
offsetInBucket.initialize<uint1>(context, length, "offsetInBucket");
}
buckets.initialize(context, length, trait->getDataSize(), "buckets");
}
HipSort::~HipSort() {
delete trait;
}
void HipSort::sort(HipArray& data) {
if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
throw OpenMMException("HipSort called with different data size");
if (data.getSize() == 0)
return;
if (isShortList) {
// We can use a simpler sort kernel that does the entire operation in one kernel.
if (dataLength <= HipContext::ThreadBlockSize*context.getNumThreadBlocks()) {
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength};
context.executeKernel(shortList2Kernel, sortArgs, dataLength);
buckets.copyTo(data);
}
else {
void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
}
}
else {
// Compute the range of data values.
unsigned int numBuckets = bucketOffset.getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, 2*rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets.
void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange.getDevicePointer(),
&bucketOffset.getDevicePointer(), &bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize(), 128);
// Compute the position of each bucket.
void* computeArgs[] = {&numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
// Copy the data into the buckets.
void* copyArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &dataLength, &bucketOffset.getDevicePointer(),
&bucketOfElement.getDevicePointer(), &offsetInBucket.getDevicePointer()};
context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
// Sort each bucket.
void* sortArgs[] = {&data.getDevicePointer(), &buckets.getDevicePointer(), &numBuckets, &bucketOffset.getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
}
/**
* This file contains HIP definitions for the macros and functions needed for the
* common compute framework.
*/
#define KERNEL extern "C" __global__
#define DEVICE __device__
#define LOCAL __shared__
#define LOCAL_ARG
#define GLOBAL
#define RESTRICT __restrict__
#define LOCAL_ID threadIdx.x
#define LOCAL_SIZE blockDim.x
#define GLOBAL_ID (blockIdx.x*blockDim.x+threadIdx.x)
#define GLOBAL_SIZE (blockDim.x*gridDim.x)
#define GROUP_ID blockIdx.x
#define NUM_GROUPS gridDim.x
#define SYNC_THREADS __syncthreads();
#define MEM_FENCE __threadfence_block();
#define ATOMIC_ADD(dest, value) atomicAdd(dest, value)
typedef long long mm_long;
typedef unsigned long long mm_ulong;
#define SUPPORTS_64_BIT_ATOMICS __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__
#define SUPPORTS_DOUBLE_PRECISION __HIP_ARCH_HAS_DOUBLES__
static __inline__ __device__ real2 multiplyComplex(real2 c1, real2 c2) {
return make_real2(c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
}
/**
* Load a value from the half-complex grid produces by a real-to-complex transform.
*/
static __inline__ __device__ real2 loadComplexValue(const real2* __restrict__ in, int x, int y, int z) {
const int inputZSize = ZSIZE/2+1;
if (z < inputZSize)
return in[x*YSIZE*inputZSize+y*inputZSize+z];
int xp = (x == 0 ? 0 : XSIZE-x);
int yp = (y == 0 ? 0 : YSIZE-y);
real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
return make_real2(value.x, -value.y);
}
/**
* Perform a 1D FFT on each row along one axis.
*/
extern "C" __global__ void execFFT(const INPUT_TYPE* __restrict__ in, OUTPUT_TYPE* __restrict__ out) {
__shared__ real2 w[ZSIZE];
__shared__ real2 data0[BLOCKS_PER_GROUP*ZSIZE];
__shared__ real2 data1[BLOCKS_PER_GROUP*ZSIZE];
for (int i = threadIdx.x; i < ZSIZE; i += blockDim.x)
w[i] = make_real2(cos(-(SIGN)*i*2*M_PI/ZSIZE), sin(-(SIGN)*i*2*M_PI/ZSIZE));
__syncthreads();
const int block = threadIdx.x/THREADS_PER_BLOCK;
for (int baseIndex = blockIdx.x*BLOCKS_PER_GROUP; baseIndex < XSIZE*YSIZE; baseIndex += gridDim.x*BLOCKS_PER_GROUP) {
int index = baseIndex+block;
int x = index/YSIZE;
int y = index-x*YSIZE;
#if OUTPUT_IS_PACKED
if (x < XSIZE/2+1) {
#endif
if (index < XSIZE*YSIZE)
for (int i = threadIdx.x-block*THREADS_PER_BLOCK; i < ZSIZE; i += THREADS_PER_BLOCK)
#if INPUT_IS_REAL
data0[i+block*ZSIZE] = make_real2(in[x*(YSIZE*ZSIZE)+y*ZSIZE+i], 0);
#elif INPUT_IS_PACKED
data0[i+block*ZSIZE] = loadComplexValue(in, x, y, i);
#else
data0[i+block*ZSIZE] = in[x*(YSIZE*ZSIZE)+y*ZSIZE+i];
#endif
#if OUTPUT_IS_PACKED
}
#endif
__syncthreads();
COMPUTE_FFT
}
}
/**
* Combine the two halves of a real grid into a complex grid that is half as large.
*/
extern "C" __global__ void packForwardData(const real* __restrict__ in, real2* __restrict__ out) {
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
#if PACKED_AXIS == 0
real2 value = make_real2(in[2*x*YSIZE*ZSIZE+y*ZSIZE+z], in[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z]);
#elif PACKED_AXIS == 1
real2 value = make_real2(in[x*YSIZE*ZSIZE+2*y*ZSIZE+z], in[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z]);
#else
real2 value = make_real2(in[x*YSIZE*ZSIZE+y*ZSIZE+2*z], in[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)]);
#endif
out[index] = value;
}
}
/**
* Split the transformed data back into a full sized, symmetric grid.
*/
extern "C" __global__ void unpackForwardData(const real2* __restrict__ in, real2* __restrict__ out) {
// Compute the phase factors.
#if PACKED_AXIS == 0
__shared__ real2 w[PACKED_XSIZE];
for (int i = threadIdx.x; i < PACKED_XSIZE; i += blockDim.x)
w[i] = make_real2(sin(i*2*M_PI/XSIZE), cos(i*2*M_PI/XSIZE));
#elif PACKED_AXIS == 1
__shared__ real2 w[PACKED_YSIZE];
for (int i = threadIdx.x; i < PACKED_YSIZE; i += blockDim.x)
w[i] = make_real2(sin(i*2*M_PI/YSIZE), cos(i*2*M_PI/YSIZE));
#else
__shared__ real2 w[PACKED_ZSIZE];
for (int i = threadIdx.x; i < PACKED_ZSIZE; i += blockDim.x)
w[i] = make_real2(sin(i*2*M_PI/ZSIZE), cos(i*2*M_PI/ZSIZE));
#endif
__syncthreads();
// Transform the data.
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
const int outputZSize = ZSIZE/2+1;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
real2 z1 = in[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z];
real2 z2 = in[xp*PACKED_YSIZE*PACKED_ZSIZE+yp*PACKED_ZSIZE+zp];
#if PACKED_AXIS == 0
real2 wfac = w[x];
#elif PACKED_AXIS == 1
real2 wfac = w[y];
#else
real2 wfac = w[z];
#endif
real2 output = make_real2((z1.x+z2.x - wfac.x*(z1.x-z2.x) + wfac.y*(z1.y+z2.y))/2, (z1.y-z2.y - wfac.y*(z1.x-z2.x) - wfac.x*(z1.y+z2.y))/2);
if (z < outputZSize)
out[x*YSIZE*outputZSize+y*outputZSize+z] = output;
xp = (x == 0 ? 0 : XSIZE-x);
yp = (y == 0 ? 0 : YSIZE-y);
zp = (z == 0 ? 0 : ZSIZE-z);
if (zp < outputZSize) {
#if PACKED_AXIS == 0
if (x == 0)
out[PACKED_XSIZE*YSIZE*outputZSize+yp*outputZSize+zp] = make_real2((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#elif PACKED_AXIS == 1
if (y == 0)
out[xp*YSIZE*outputZSize+PACKED_YSIZE*outputZSize+zp] = make_real2((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#else
if (z == 0)
out[xp*YSIZE*outputZSize+yp*outputZSize+PACKED_ZSIZE] = make_real2((z1.x-z1.y+z2.x-z2.y)/2, (-z1.x-z1.y+z2.x+z2.y)/2);
#endif
else
out[xp*YSIZE*outputZSize+yp*outputZSize+zp] = make_real2(output.x, -output.y);
}
}
}
/**
* Load a value from the half-complex grid produced by a real-to-complex transform.
*/
static __inline__ __device__ real2 loadComplexValue(const real2* __restrict__ in, int x, int y, int z) {
const int inputZSize = ZSIZE/2+1;
if (z < inputZSize)
return in[x*YSIZE*inputZSize+y*inputZSize+z];
int xp = (x == 0 ? 0 : XSIZE-x);
int yp = (y == 0 ? 0 : YSIZE-y);
real2 value = in[xp*YSIZE*inputZSize+yp*inputZSize+(ZSIZE-z)];
return make_real2(value.x, -value.y);
}
/**
* Repack the symmetric complex grid into one half as large in preparation for doing an inverse complex-to-real transform.
*/
extern "C" __global__ void packBackwardData(const real2* __restrict__ in, real2* __restrict__ out) {
// Compute the phase factors.
#if PACKED_AXIS == 0
__shared__ real2 w[PACKED_XSIZE];
for (int i = threadIdx.x; i < PACKED_XSIZE; i += blockDim.x)
w[i] = make_real2(cos(i*2*M_PI/XSIZE), sin(i*2*M_PI/XSIZE));
#elif PACKED_AXIS == 1
__shared__ real2 w[PACKED_YSIZE];
for (int i = threadIdx.x; i < PACKED_YSIZE; i += blockDim.x)
w[i] = make_real2(cos(i*2*M_PI/YSIZE), sin(i*2*M_PI/YSIZE));
#else
__shared__ real2 w[PACKED_ZSIZE];
for (int i = threadIdx.x; i < PACKED_ZSIZE; i += blockDim.x)
w[i] = make_real2(cos(i*2*M_PI/ZSIZE), sin(i*2*M_PI/ZSIZE));
#endif
__syncthreads();
// Transform the data.
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
int xp = (x == 0 ? 0 : PACKED_XSIZE-x);
int yp = (y == 0 ? 0 : PACKED_YSIZE-y);
int zp = (z == 0 ? 0 : PACKED_ZSIZE-z);
real2 z1 = loadComplexValue(in, x, y, z);
#if PACKED_AXIS == 0
real2 wfac = w[x];
real2 z2 = loadComplexValue(in, PACKED_XSIZE-x, yp, zp);
#elif PACKED_AXIS == 1
real2 wfac = w[y];
real2 z2 = loadComplexValue(in, xp, PACKED_YSIZE-y, zp);
#else
real2 wfac = w[z];
real2 z2 = loadComplexValue(in, xp, yp, PACKED_ZSIZE-z);
#endif
real2 even = make_real2((z1.x+z2.x)/2, (z1.y-z2.y)/2);
real2 odd = make_real2((z1.x-z2.x)/2, (z1.y+z2.y)/2);
odd = make_real2(odd.x*wfac.x-odd.y*wfac.y, odd.y*wfac.x+odd.x*wfac.y);
out[x*PACKED_YSIZE*PACKED_ZSIZE+y*PACKED_ZSIZE+z] = make_real2(even.x-odd.y, even.y+odd.x);
}
}
/**
* Split the data back into a full sized, real grid after an inverse transform.
*/
extern "C" __global__ void unpackBackwardData(const real2* __restrict__ in, real* __restrict__ out) {
const int gridSize = PACKED_XSIZE*PACKED_YSIZE*PACKED_ZSIZE;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
int x = index/(PACKED_YSIZE*PACKED_ZSIZE);
int remainder = index-x*(PACKED_YSIZE*PACKED_ZSIZE);
int y = remainder/PACKED_ZSIZE;
int z = remainder-y*PACKED_ZSIZE;
real2 value = 2*in[index];
#if PACKED_AXIS == 0
out[2*x*YSIZE*ZSIZE+y*ZSIZE+z] = value.x;
out[(2*x+1)*YSIZE*ZSIZE+y*ZSIZE+z] = value.y;
#elif PACKED_AXIS == 1
out[x*YSIZE*ZSIZE+2*y*ZSIZE+z] = value.x;
out[x*YSIZE*ZSIZE+(2*y+1)*ZSIZE+z] = value.y;
#else
out[x*YSIZE*ZSIZE+y*ZSIZE+2*z] = value.x;
out[x*YSIZE*ZSIZE+y*ZSIZE+(2*z+1)] = value.y;
#endif
}
}
#define GROUP_SIZE 256
#define BUFFER_SIZE 256
__device__ inline int __TILEFLAGS_FFS(unsigned long long int x) {
return __ffsll(x);
}
__device__ inline int __TILEFLAGS_CLZ(unsigned long long int x) {
return __clzll(x);
}
__device__ inline int __TILEFLAGS_POPC(unsigned long long int x) {
return __popcll(x);
}
/**
* Find a bounding box for the atoms in each block.
*/
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
real2* __restrict__ sortedBlocks) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
int base = index*TILE_SIZE;
while (base < numAtoms) {
real4 pos = posq[base];
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_POS(pos)
#endif
real4 minPos = pos;
real4 maxPos = pos;
int last = min(base+TILE_SIZE, numAtoms);
for (int i = base+1; i < last; i++) {
pos = posq[i];
#ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
#endif
minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
}
real4 blockSize = 0.5f*(maxPos-minPos);
real4 center = 0.5f*(maxPos+minPos);
center.w = 0;
for (int i = base; i < last; i++) {
pos = posq[i];
real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
}
center.w = sqrt(center.w);
blockBoundingBox[index] = blockSize;
blockCenter[index] = center;
sortedBlocks[index] = make_real2(blockSize.x+blockSize.y+blockSize.z, index);
index += blockDim.x*gridDim.x;
base = index*TILE_SIZE;
}
if (blockIdx.x == 0 && threadIdx.x == 0)
rebuildNeighborList[0] = 0;
}
/**
* Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
*/
extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index];
sortedBlockBoundingBox[i] = blockBoundingBox[index];
}
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
bool rebuild = forceRebuild;
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 delta = oldPositions[i]-posq[i];
if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
rebuild = true;
}
if (rebuild) {
rebuildNeighborList[0] = 1;
interactionCount[0] = 0;
interactionCount[1] = 0;
}
}
__device__ int saveSinglePairs(int x, int* atoms, tileflags* flags, int length, unsigned int maxSinglePairs, unsigned int* singlePairCount, int2* singlePairs, int* sumBuffer, volatile int& pairStartIndex) {
// Record interactions that should be computed as single pairs rather than in blocks.
const int indexInWarp = threadIdx.x%warpSize;
int sum = 0;
for (int i = indexInWarp; i < length; i += warpSize) {
int count = __TILEFLAGS_POPC(flags[i]);
sum += (count <= MAX_BITS_FOR_PAIRS ? count : 0);
}
for (int i = 1; i < warpSize; i *= 2) {
int n = SHFL(sum, indexInWarp - i);
if (indexInWarp >= i)
sum += n;
}
if (indexInWarp == warpSize - 1)
pairStartIndex = atomicAdd(singlePairCount,(unsigned int) sum);
SYNC_WARPS;
int prevSum = SHFL(sum, indexInWarp - 1);
int pairIndex = pairStartIndex + (indexInWarp > 0 ? prevSum : 0);
for (int i = indexInWarp; i < length; i += warpSize) {
int count = __TILEFLAGS_POPC(flags[i]);
if (count <= MAX_BITS_FOR_PAIRS && pairIndex+count <= maxSinglePairs) {
tileflags f = flags[i];
while (f != 0) {
singlePairs[pairIndex] = make_int2(atoms[i], x*TILE_SIZE+__TILEFLAGS_FFS(f)-1);
f &= f-1;
pairIndex++;
}
}
}
// Compact the remaining interactions.
const tileflags warpMask = (static_cast<tileflags>(1)<<indexInWarp)-1;
int numCompacted = 0;
for (int start = 0; start < length; start += warpSize) {
int i = start+indexInWarp;
int atom = atoms[i];
tileflags flag = flags[i];
bool include = (i < length && __TILEFLAGS_POPC(flags[i]) > MAX_BITS_FOR_PAIRS);
tileflags includeFlags = BALLOT(include);
if (include) {
int index = numCompacted+__TILEFLAGS_POPC(includeFlags&warpMask);
atoms[index] = atom;
flags[index] = flag;
}
numCompacted += __TILEFLAGS_POPC(includeFlags);
}
return numCompacted;
}
/**
* Compare the bounding boxes for each pair of atom blocks (comprised of warpSize atoms each), forming a tile. If the two
* atom blocks are sufficiently far apart, mark them as non-interacting. There are two stages in the algorithm.
*
* STAGE 1:
*
* A coarse grained atom block against interacting atom block neighbour list is constructed.
*
* Each warp first loads in some block X of interest. Each thread within the warp then loads
* in a different atom block Y. If Y has exclusions with X, then Y is not processed. If the bounding boxes
* of the two atom blocks are within the cutoff distance, then the two atom blocks are considered to be
* interacting and Y is added to the buffer for X.
*
* STAGE 2:
*
* A fine grained atom block against interacting atoms neighbour list is constructed.
*
* The warp loops over atom blocks Y that were found to (possibly) interact with atom block X. Each thread
* in the warp loops over the warpSize atoms in X and compares their positions to one particular atom from block Y.
* If it finds one closer than the cutoff distance, the atom is added to the list of atoms interacting with block X.
* This continues until the buffer fills up, at which point the results are written to global memory.
*
* [in] periodicBoxSize - size of the rectangular periodic box
* [in] invPeriodicBoxSize - inverse of the periodic box
* [in] blockCenter - the center of each bounding box
* [in] blockBoundingBox - bounding box of each atom block
* [out] interactionCount - total number of tiles that have interactions
* [out] interactingTiles - set of blocks that have interactions
* [out] interactingAtoms - a list of atoms that interact with each atom block
* [in] posq - x,y,z coordinates of each atom and charge q
* [in] maxTiles - maximum number of tiles to process, used for multi-GPUs
* [in] startBlockIndex - first block to process, used for multi-GPUs,
* [in] numBlocks - total number of atom blocks
* [in] sortedBlocks - a sorted list of atom blocks based on volume
* [in] sortedBlockCenter - sorted centers, duplicated for fast access to avoid indexing
* [in] sortedBlockBoundingBox - sorted bounding boxes, duplicated for fast access
* [in] exclusionIndices - maps into exclusionRowIndices with the starting position for a given atom
* [in] exclusionRowIndices - stores the a continuous list of exclusions
* eg: block 0 is excluded from atom 3,5,6
* block 1 is excluded from atom 3,4
* block 2 is excluded from atom 1,3,5,6
* exclusionIndices[0][3][5][8]
* exclusionRowIndices[3][5][6][3][4][1][3][5][6]
* index 0 1 2 3 4 5 6 7 8
* [out] oldPos - stores the positions of the atoms in which this neighbourlist was built on
* - this is used to decide when to rebuild a neighbourlist
* [in] rebuildNeighbourList - whether or not to execute this kernel
*
*/
extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms,
int2* __restrict__ singlePairs, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int maxSinglePairs,
unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
const int indexInWarp = threadIdx.x%warpSize;
const int warpStart = threadIdx.x-indexInWarp;
const int totalWarps = blockDim.x*gridDim.x/warpSize;
const int warpIndex = (blockIdx.x*blockDim.x+threadIdx.x)/warpSize;
const tileflags warpMask = (static_cast<tileflags>(1)<<indexInWarp)-1;
__shared__ int workgroupBuffer[BUFFER_SIZE*(GROUP_SIZE/warpSize)];
__shared__ tileflags workgroupFlagsBuffer[BUFFER_SIZE*(GROUP_SIZE/warpSize)];
__shared__ int warpExclusions[MAX_EXCLUSIONS*(GROUP_SIZE/warpSize)];
__shared__ real4 posBuffer[GROUP_SIZE];
__shared__ volatile int workgroupTileIndex[GROUP_SIZE/warpSize];
__shared__ int worksgroupPairStartIndex[GROUP_SIZE/warpSize];
int* sumBuffer = (int*) posBuffer; // Reuse the same buffer to save memory
int* buffer = workgroupBuffer+BUFFER_SIZE*(warpStart/warpSize);
tileflags* flagsBuffer = workgroupFlagsBuffer+BUFFER_SIZE*(warpStart/warpSize);
int* exclusionsForX = warpExclusions+MAX_EXCLUSIONS*(warpStart/warpSize);
volatile int& tileStartIndex = workgroupTileIndex[warpStart/warpSize];
volatile int& pairStartIndex = worksgroupPairStartIndex[warpStart/warpSize];
// Loop over blocks.
for (int block1 = startBlockIndex+warpIndex; block1 < startBlockIndex+numBlocks; block1 += totalWarps) {
// Load data for this block. Note that all threads in a warp are processing the same block.
real2 sortedKey = sortedBlocks[block1];
int x = (int) sortedKey.y;
real4 blockCenterX = sortedBlockCenter[block1];
real4 blockSizeX = sortedBlockBoundingBox[block1];
int neighborsInBuffer = 0;
real4 pos1 = posq[x*TILE_SIZE+indexInWarp];
#ifdef USE_PERIODIC
const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
}
#endif
pos1.w = 0.5f * (pos1.x * pos1.x + pos1.y * pos1.y + pos1.z * pos1.z);
posBuffer[threadIdx.x] = pos1;
// Load exclusion data for block x.
const int exclusionStart = exclusionRowIndices[x];
const int exclusionEnd = exclusionRowIndices[x+1];
const int numExclusions = exclusionEnd-exclusionStart;
for (int j = indexInWarp; j < numExclusions; j += warpSize)
exclusionsForX[j] = exclusionIndices[exclusionStart+j];
if (MAX_EXCLUSIONS > warpSize)
__syncthreads();
// Loop over atom blocks to search for neighbors. The threads in a warp compare block1 against warpSize
// other blocks in parallel.
for (int block2Base = block1+1; block2Base < NUM_BLOCKS; block2Base += warpSize) {
int block2 = block2Base+indexInWarp;
bool includeBlock2 = (block2 < NUM_BLOCKS);
bool forceInclude = false;
if (includeBlock2) {
real4 blockCenterY = sortedBlockCenter[block2];
real4 blockSizeY = sortedBlockBoundingBox[block2];
real4 blockDelta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < (PADDED_CUTOFF+blockCenterX.w+blockCenterY.w)*(PADDED_CUTOFF+blockCenterX.w+blockCenterY.w));
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSizeX.z-blockSizeY.z);
includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < PADDED_CUTOFF_SQUARED);
#ifdef TRICLINIC
// The calculation to find the nearest periodic copy is only guaranteed to work if the nearest copy is less than half a box width away.
// If there's any possibility we might have missed it, do a detailed check.
if (periodicBoxSize.z/2-blockSizeX.z-blockSizeY.z < PADDED_CUTOFF || periodicBoxSize.y/2-blockSizeX.y-blockSizeY.y < PADDED_CUTOFF)
includeBlock2 = forceInclude = true;
#endif
if (includeBlock2) {
int y = (int) sortedBlocks[block2].y;
for (int k = 0; k < numExclusions; k++)
includeBlock2 &= (exclusionsForX[k] != y);
}
}
// Loop over any blocks we identified as potentially containing neighbors.
tileflags includeBlockFlags = BALLOT(includeBlock2);
tileflags forceIncludeFlags = BALLOT(forceInclude);
while (includeBlockFlags != 0) {
int i = __TILEFLAGS_FFS(includeBlockFlags)-1;
includeBlockFlags &= includeBlockFlags-1;
forceInclude = (forceIncludeFlags>>i) & 1;
int y = (int) sortedBlocks[block2Base+i].y;
// Check each atom in block Y for interactions.
int atom2 = y*TILE_SIZE+indexInWarp;
real4 pos2 = posq[atom2];
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
}
#endif
pos2.w = 0.5f * (pos2.x * pos2.x + pos2.y * pos2.y + pos2.z * pos2.z);
real4 blockCenterY = sortedBlockCenter[block2Base+i];
real3 atomDelta = trimTo3(posBuffer[warpStart+indexInWarp])-trimTo3(blockCenterY);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(atomDelta)
#endif
tileflags atomFlags = BALLOT(forceInclude || atomDelta.x*atomDelta.x+atomDelta.y*atomDelta.y+atomDelta.z*atomDelta.z < (PADDED_CUTOFF+blockCenterY.w)*(PADDED_CUTOFF+blockCenterY.w));
tileflags interacts = 0;
if (atom2 < NUM_ATOMS && atomFlags != 0) {
int first = __TILEFLAGS_FFS(atomFlags)-1;
int last = warpSize-__TILEFLAGS_CLZ(atomFlags);
#ifdef USE_PERIODIC
if (!singlePeriodicCopy) {
for (int j = first; j < last; j++) {
real3 delta = trimTo3(pos2)-trimTo3(posBuffer[warpStart+j]);
APPLY_PERIODIC_TO_DELTA(delta)
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED ? static_cast<tileflags>(1)<<j : 0);
}
}
else {
#endif
#pragma unroll
for (int j = 0; j < warpSize; j++) {
real4 posj = posBuffer[warpStart+j];
real halfDist2 = posj.w + pos2.w - posj.x*pos2.x - posj.y*pos2.y - posj.z*pos2.z;
interacts |= (halfDist2 < 0.5f * PADDED_CUTOFF_SQUARED ? static_cast<tileflags>(1)<<j : 0);
}
#ifdef USE_PERIODIC
}
#endif
}
// Add any interacting atoms to the buffer.
tileflags includeAtomFlags = BALLOT(interacts);
if (interacts) {
int index = neighborsInBuffer+__TILEFLAGS_POPC(includeAtomFlags&warpMask);
buffer[index] = atom2;
flagsBuffer[index] = interacts;
}
neighborsInBuffer += __TILEFLAGS_POPC(includeAtomFlags);
if (neighborsInBuffer > BUFFER_SIZE-TILE_SIZE) {
// Store the new tiles to memory.
#if MAX_BITS_FOR_PAIRS > 0
neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
int tilesToStore = neighborsInBuffer/TILE_SIZE;
if (tilesToStore > 0) {
if (indexInWarp == 0)
tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
int newTileStartIndex = tileStartIndex;
if (newTileStartIndex+tilesToStore <= maxTiles) {
if (indexInWarp < tilesToStore)
interactingTiles[newTileStartIndex+indexInWarp] = x;
for (int j = 0; j < tilesToStore; j++)
interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = buffer[indexInWarp+j*TILE_SIZE];
}
buffer[indexInWarp] = buffer[indexInWarp+TILE_SIZE*tilesToStore];
neighborsInBuffer -= TILE_SIZE*tilesToStore;
}
}
}
}
// If we have a partially filled buffer, store it to memory.
#if MAX_BITS_FOR_PAIRS > 0
if (neighborsInBuffer > warpSize)
neighborsInBuffer = saveSinglePairs(x, buffer, flagsBuffer, neighborsInBuffer, maxSinglePairs, &interactionCount[1], singlePairs, sumBuffer+warpStart, pairStartIndex);
#endif
if (neighborsInBuffer > 0) {
int tilesToStore = (neighborsInBuffer+TILE_SIZE-1)/TILE_SIZE;
if (indexInWarp == 0)
tileStartIndex = atomicAdd(&interactionCount[0], tilesToStore);
int newTileStartIndex = tileStartIndex;
if (newTileStartIndex+tilesToStore <= maxTiles) {
if (indexInWarp < tilesToStore)
interactingTiles[newTileStartIndex+indexInWarp] = x;
for (int j = 0; j < tilesToStore; j++)
interactingAtoms[(newTileStartIndex+j)*TILE_SIZE+indexInWarp] = (indexInWarp+j*TILE_SIZE < neighborsInBuffer ? buffer[indexInWarp+j*TILE_SIZE] : NUM_ATOMS);
}
}
}
// Record the positions the neighbor list is based on.
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x)
oldPositions[i] = posq[i];
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment