Unverified Commit 67f5644d authored by Anton Gorenko's avatar Anton Gorenko
Browse files

Optimize computeNonbonded

* All AMD GPUs support shuffle, double precision and 64-bit int atomics;
* Remove unused code: !ENABLE_SHUFFLE code paths in nonbonded.hip;
* Use intrinsics in single-precision;
* Use realToFixedPoint (faster float32-to-int64);
* Remove shared atomIndices, use shuffles;
* Check early if atoms are in the cutoff range, sometimes all lanes in
  a warp can skip computations, single pairs can also skip useless
  atomics with zero values;
* Remove volatile skipTiles access, use shuffles;
* Distribute work for warps in a strided order;
* Skip warps that may be still busy in the first loop;
* Unify conditions for excluded atoms with `includeInteraction`;
* Move multiprocessors to HipContext;
* Increase number of warps for computeNonbonded;
* Disable packed math for >=MI200 (it affects performance of some
  kernels like computeGKForces of amoebagk);
* Remove defaultOptimizationOptions and createModule's optimizationFlags
  as they are never used;
* Support -save-temps.
parent 7279c539
...@@ -60,7 +60,7 @@ ...@@ -60,7 +60,7 @@
#include "openmm/common/ComputeContext.h" #include "openmm/common/ComputeContext.h"
#include "openmm/Kernel.h" #include "openmm/Kernel.h"
typedef unsigned long tileflags; typedef unsigned int tileflags;
namespace OpenMM { namespace OpenMM {
...@@ -259,19 +259,15 @@ public: ...@@ -259,19 +259,15 @@ public:
* Create a HIP module from source code. * Create a HIP module from source code.
* *
* @param source the source code of the module * @param source the source code of the module
* @param optimizationFlags the optimization flags to pass to the HIP compiler. If this is
* omitted, a default set of options will be used
*/ */
hipModule_t createModule(const std::string source, const char* optimizationFlags = NULL); hipModule_t createModule(const std::string source);
/** /**
* Create a HIP module from source code. * Create a HIP module from source code.
* *
* @param source the source code of the module * @param source the source code of the module
* @param defines a set of preprocessor definitions (name, value) to define when compiling the program * @param defines a set of preprocessor definitions (name, value) to define when compiling the program
* @param optimizationFlags the optimization flags to pass to the HIP compiler. If this is
* omitted, a default set of options will be used
*/ */
hipModule_t createModule(const std::string source, const std::map<std::string, std::string>& defines, const char* optimizationFlags = NULL); hipModule_t createModule(const std::string source, const std::map<std::string, std::string>& defines);
/** /**
* Get a kernel from a HIP module. * Get a kernel from a HIP module.
* *
...@@ -358,22 +354,22 @@ public: ...@@ -358,22 +354,22 @@ public:
return simdWidth; return simdWidth;
} }
/** /**
* Get whether the device being used warp shuffles. * Get the number of multiprocessors (compute units) of the device being used.
*/ */
bool getSupportsWarpShuffle() const { int getMultiprocessors() const {
return hasWarpShuffle; return multiprocessors;
} }
/** /**
* Get whether the device being used supports 64 bit atomic operations on global memory. * Get whether the device being used supports 64 bit atomic operations on global memory.
*/ */
bool getSupports64BitGlobalAtomics() const { bool getSupports64BitGlobalAtomics() const {
return hasGlobalInt64Atomics; return true;
} }
/** /**
* Get whether the device being used supports double precision math. * Get whether the device being used supports double precision math.
*/ */
bool getSupportsDoublePrecision() const { bool getSupportsDoublePrecision() const {
return hasDoubles; return true;
} }
/** /**
* Get whether double precision is being used. * Get whether double precision is being used.
...@@ -555,16 +551,13 @@ private: ...@@ -555,16 +551,13 @@ private:
int numAtomBlocks; int numAtomBlocks;
int numThreadBlocks; int numThreadBlocks;
int simdWidth; int simdWidth;
int multiprocessors;
int sharedMemPerBlock; int sharedMemPerBlock;
bool hasDoubles;
bool hasGlobalInt64Atomics;
bool hasWarpShuffle;
bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, boxIsTriclinic, hasCompilerKernel, isHipccAvailable, hasAssignedPosqCharges; bool useBlockingSync, useDoublePrecision, useMixedPrecision, contextIsValid, boxIsTriclinic, hasCompilerKernel, isHipccAvailable, hasAssignedPosqCharges;
bool isLinkedContext; bool isLinkedContext;
std::string compiler, tempDir, cacheDir, gpuArchitecture; std::string compiler, tempDir, cacheDir, gpuArchitecture;
float4 periodicBoxVecXFloat, periodicBoxVecYFloat, periodicBoxVecZFloat, periodicBoxSizeFloat, invPeriodicBoxSizeFloat; float4 periodicBoxVecXFloat, periodicBoxVecYFloat, periodicBoxVecZFloat, periodicBoxSizeFloat, invPeriodicBoxSizeFloat;
double4 periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, periodicBoxSize, invPeriodicBoxSize; double4 periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ, periodicBoxSize, invPeriodicBoxSize;
std::string defaultOptimizationOptions;
std::map<std::string, std::string> compilationDefines; std::map<std::string, std::string> compilationDefines;
hipDevice_t device; hipDevice_t device;
hipStream_t currentStream; hipStream_t currentStream;
......
...@@ -69,8 +69,7 @@ using namespace OpenMM; ...@@ -69,8 +69,7 @@ using namespace OpenMM;
using namespace std; using namespace std;
const int HipContext::ThreadBlockSize = 64; const int HipContext::ThreadBlockSize = 64;
const int HipContext::TileSize = sizeof(tileflags)*8; const int HipContext::TileSize = 32;
static_assert(HipContext::ThreadBlockSize == HipContext::TileSize);
bool HipContext::hasInitializedHip = false; bool HipContext::hasInitializedHip = false;
...@@ -136,7 +135,6 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy ...@@ -136,7 +135,6 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) { for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
int trialDeviceIndex = devicePrecedence[i]; int trialDeviceIndex = devicePrecedence[i];
CHECK_RESULT(hipDeviceGet(&device, trialDeviceIndex)); CHECK_RESULT(hipDeviceGet(&device, trialDeviceIndex));
defaultOptimizationOptions = "-ffast-math -Wall";
// try setting device // try setting device
if (hipSetDevice(device) == hipSuccess) { if (hipSetDevice(device) == hipSuccess) {
// and set flags // and set flags
...@@ -172,10 +170,6 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy ...@@ -172,10 +170,6 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
// set device properties // set device properties
this->simdWidth = props.warpSize; this->simdWidth = props.warpSize;
this->sharedMemPerBlock = props.sharedMemPerBlock; this->sharedMemPerBlock = props.sharedMemPerBlock;
this->hasGlobalInt64Atomics = props.arch.hasGlobalInt64Atomics;
this->hasDoubles = props.arch.hasDoubles;
// HIP-TODO: remove hasWarpShuffle==false paths completely?
this->hasWarpShuffle = true;
gpuArchitecture = props.gcnArchName; gpuArchitecture = props.gcnArchName;
// HIP-TODO: find a good value here // HIP-TODO: find a good value here
...@@ -195,19 +189,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy ...@@ -195,19 +189,15 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
numAtoms = system.getNumParticles(); numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize); paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize; numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
int multiprocessors;
CHECK_RESULT(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device)); CHECK_RESULT(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, device));
// For RDNA GPUs hipDeviceAttributeMultiprocessorCount means WGP (work-group processors, two compute units), not CUs.
if (simdWidth == 32)
multiprocessors *= 2;
numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors; numThreadBlocks = numThreadBlocksPerComputeUnit*multiprocessors;
// GCN hardware is more like CUDA-8, w/ no independent forward progress compilationDefines["USE_HIP"] = "1";
// or *_sync primatives if (simdWidth == 32)
compilationDefines["SYNC_WARPS"] = ""; compilationDefines["AMD_RDNA"] = "1";
compilationDefines["SHFL(var, srcLane)"] = "__shfl(var, srcLane);";
// Important: the predicate for ballot is defined as an integer, hence
// it is important that we convert the variable field to a true/false value
// before running ballot, such that we do not discard the top half when
// running on a long int
compilationDefines["BALLOT(var)"] = "__ballot((var) != 0);";
if (useDoublePrecision) { if (useDoublePrecision) {
posq.initialize<double4>(*this, paddedNumAtoms, "posq"); posq.initialize<double4>(*this, paddedNumAtoms, "posq");
velm.initialize<double4>(*this, paddedNumAtoms, "velm"); velm.initialize<double4>(*this, paddedNumAtoms, "velm");
...@@ -263,11 +253,11 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy ...@@ -263,11 +253,11 @@ HipContext::HipContext(const System& system, int deviceIndex, bool useBlockingSy
// Set defines based on the requested precision. // Set defines based on the requested precision.
compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "sqrtf"; compilationDefines["SQRT"] = useDoublePrecision ? "sqrt" : "__fsqrt_rn";
compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "rsqrtf"; compilationDefines["RSQRT"] = useDoublePrecision ? "rsqrt" : "__frsqrt_rn";
compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/"; compilationDefines["RECIP(x)"] = useDoublePrecision ? "(1.0/(x))" : "(1.0f/(x))";
compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf"; compilationDefines["EXP"] = useDoublePrecision ? "exp" : "__expf";
compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf"; compilationDefines["LOG"] = useDoublePrecision ? "log" : "__logf";
compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf"; compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf";
compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf"; compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf"; compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
...@@ -423,14 +413,22 @@ void HipContext::setAsCurrent() { ...@@ -423,14 +413,22 @@ void HipContext::setAsCurrent() {
hipSetDevice(device); hipSetDevice(device);
} }
hipModule_t HipContext::createModule(const string source, const char* optimizationFlags) { hipModule_t HipContext::createModule(const string source) {
return createModule(source, map<string, string>(), optimizationFlags); return createModule(source, map<string, string>());
} }
hipModule_t HipContext::createModule(const string source, const map<string, string>& defines, const char* optimizationFlags) { hipModule_t HipContext::createModule(const string source, const map<string, string>& defines) {
static_assert(8*sizeof(void*) == HipContext::TileSize); const char* saveTempsEnv = getenv("OPENMM_SAVE_TEMPS");
bool saveTemps = saveTempsEnv != nullptr;
string bits = intToString(8*sizeof(void*)); string bits = intToString(8*sizeof(void*));
string options = (optimizationFlags == NULL ? defaultOptimizationOptions : string(optimizationFlags)); string options = "-ffast-math -Wall";
if (gpuArchitecture.find("gfx90a") == 0 ||
gpuArchitecture.find("gfx94") == 0) {
// HIP-TODO: Remove it when the compiler does a better job
// Disable SLP vectorization as it may generate unoptimal packed math instructions on
// >=MI200 (gfx90a, gfx942): more v_mov, higher register usage etc.
options += " -fno-slp-vectorize";
}
if (getMaxThreadBlockSize() < 1024) { if (getMaxThreadBlockSize() < 1024) {
options += " --gpu-max-threads-per-block=" + std::to_string(getMaxThreadBlockSize()); options += " --gpu-max-threads-per-block=" + std::to_string(getMaxThreadBlockSize());
} }
...@@ -478,9 +476,7 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri ...@@ -478,9 +476,7 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
src << "typedef float3 mixed3;\n"; src << "typedef float3 mixed3;\n";
src << "typedef float4 mixed4;\n"; src << "typedef float4 mixed4;\n";
} }
src << "typedef unsigned int tileflags;\n";
src << "typedef unsigned long tileflags;\n";
src << "static_assert(sizeof(tileflags)*8==" << HipContext::TileSize << ",\"tileflags size does not match TILE_SIZE\");\n";
src << HipKernelSources::common << endl; src << HipKernelSources::common << endl;
for (auto& pair : defines) { for (auto& pair : defines) {
src << "#define " << pair.first; src << "#define " << pair.first;
...@@ -490,6 +486,7 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri ...@@ -490,6 +486,7 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
} }
if (!defines.empty()) if (!defines.empty())
src << endl; src << endl;
src << HipKernelSources::intrinsics << endl;
src << source << endl; src << source << endl;
// See whether we already have PTX for this kernel cached. // See whether we already have PTX for this kernel cached.
...@@ -499,12 +496,12 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri ...@@ -499,12 +496,12 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
sha1.Final(); sha1.Final();
UINT_8 hash[20]; UINT_8 hash[20];
sha1.GetHash(hash); sha1.GetHash(hash);
stringstream cacheFile; stringstream cacheHash;
cacheFile << cacheDir; cacheHash.flags(ios::hex);
cacheFile.flags(ios::hex);
for (int i = 0; i < 20; i++) for (int i = 0; i < 20; i++)
cacheFile << setw(2) << setfill('0') << (int) hash[i]; cacheHash << setw(2) << setfill('0') << (int) hash[i];
cacheFile << '_' << gpuArchitecture << '_' << bits; stringstream cacheFile;
cacheFile << cacheDir << cacheHash.str() << '_' << gpuArchitecture << '_' << bits;
hipModule_t module; hipModule_t module;
if (hipModuleLoad(&module, cacheFile.str().c_str()) == hipSuccess) if (hipModuleLoad(&module, cacheFile.str().c_str()) == hipSuccess)
return module; return module;
...@@ -512,11 +509,22 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri ...@@ -512,11 +509,22 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
// Select names for the various temporary files. // Select names for the various temporary files.
stringstream tempFileName; stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions. if (saveTemps) {
tempFileName << "_" << getpid(); tempFileName << saveTempsEnv;
string inputFile = (tempDir+tempFileName.str()+".hip.cpp"); const char* saveTempsPrefixEnv = getenv("OPENMM_SAVE_TEMPS_PREFIX");
string outputFile = (tempDir+tempFileName.str()+".hsaco"); if (saveTempsPrefixEnv) {
string logFile = (tempDir+tempFileName.str()+".log"); tempFileName << saveTempsPrefixEnv;
}
tempFileName << cacheHash.str();
}
else {
tempFileName << tempDir;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
tempFileName << "_" << getpid();
}
string inputFile = (tempFileName.str()+".hip.cpp");
string outputFile = (tempFileName.str()+".hsaco");
string logFile = (tempFileName.str()+".log");
int res = 0; int res = 0;
// If the runtime compiler plugin is available, use it. // If the runtime compiler plugin is available, use it.
...@@ -550,7 +558,7 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri ...@@ -550,7 +558,7 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
ofstream out(inputFile.c_str()); ofstream out(inputFile.c_str());
out << src.str(); out << src.str();
out.close(); out.close();
string command = compiler + " --genco --amdgpu-target=" + gpuArchitecture + " " + options + " -o \""+outputFile+"\" " + " \""+inputFile+"\" 2> \""+logFile+"\""; string command = compiler + " --genco --amdgpu-target=" + gpuArchitecture + " " + options + (saveTemps ? " -save-temps=obj" : "") +" -o \""+outputFile+"\" " + " \""+inputFile+"\" 2> \""+logFile+"\"";
res = std::system(command.c_str()); res = std::system(command.c_str());
} }
try { try {
...@@ -576,20 +584,16 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri ...@@ -576,20 +584,16 @@ hipModule_t HipContext::createModule(const string source, const map<string, stri
m<<"Error loading HIP module: "<<getErrorString(result)<<" ("<<result<<")"; m<<"Error loading HIP module: "<<getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(m.str()); throw OpenMMException(m.str());
} }
const char* save = getenv("OPENMM_SAVE_TEMPS"); if (!saveTemps) {
bool savetemps = save != nullptr;
if (!savetemps) {
remove(inputFile.c_str()); remove(inputFile.c_str());
remove(logFile.c_str()); remove(logFile.c_str());
} }
if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0 && !savetemps) if (rename(outputFile.c_str(), cacheFile.str().c_str()) != 0 && !saveTemps)
remove(outputFile.c_str()); remove(outputFile.c_str());
return module; return module;
} }
catch (...) { catch (...) {
const char* save = getenv("OPENMM_SAVE_TEMPS"); if (!saveTemps) {
bool savetemps = save != nullptr;
if (!savetemps) {
remove(inputFile.c_str()); remove(inputFile.c_str());
remove(outputFile.c_str()); remove(outputFile.c_str());
remove(logFile.c_str()); remove(logFile.c_str());
......
...@@ -69,12 +69,10 @@ HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(cont ...@@ -69,12 +69,10 @@ HipNonbondedUtilities::HipNonbondedUtilities(HipContext& context) : context(cont
// Decide how many thread blocks to use. // Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities"; string errorMessage = "Error initializing nonbonded utilities";
int multiprocessors; CHECK_RESULT(hipEventCreateWithFlags(&downloadCountEvent, context.getEventFlags()));
CHECK_RESULT(hipDeviceGetAttribute(&multiprocessors, hipDeviceAttributeMultiprocessorCount, context.getDevice())); CHECK_RESULT(hipHostMalloc((void**) &pinnedCountBuffer, 2*sizeof(unsigned int), hipHostMallocPortable));
CHECK_RESULT(hipEventCreateWithFlags(&downloadCountEvent, 0)); numForceThreadBlocks = 5*4*context.getMultiprocessors();
CHECK_RESULT(hipHostMalloc((void**) &pinnedCountBuffer, 2*sizeof(int), hipHostMallocPortable)); forceThreadBlockSize = 64;
numForceThreadBlocks = 4*multiprocessors;
forceThreadBlockSize = (256);
setKernelSource(HipKernelSources::nonbonded); setKernelSource(HipKernelSources::nonbonded);
} }
...@@ -234,7 +232,7 @@ void HipNonbondedUtilities::initialize(const System& system) { ...@@ -234,7 +232,7 @@ void HipNonbondedUtilities::initialize(const System& system) {
// Record the exclusion data. // Record the exclusion data.
exclusions.initialize<tileflags>(context, tilesWithExclusions.size()*HipContext::TileSize, "exclusions"); exclusions.initialize<tileflags>(context, tilesWithExclusions.size()*HipContext::TileSize, "exclusions");
tileflags allFlags = static_cast<tileflags>(-1); tileflags allFlags = (tileflags) -1;
vector<tileflags> exclusionVec(exclusions.getSize(), allFlags); vector<tileflags> exclusionVec(exclusions.getSize(), allFlags);
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) { for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/HipContext::TileSize; int x = atom1/HipContext::TileSize;
...@@ -245,11 +243,11 @@ void HipNonbondedUtilities::initialize(const System& system) { ...@@ -245,11 +243,11 @@ void HipNonbondedUtilities::initialize(const System& system) {
int offset2 = atom2-y*HipContext::TileSize; int offset2 = atom2-y*HipContext::TileSize;
if (x > y) { if (x > y) {
int index = exclusionTileMap[make_pair(x, y)]*HipContext::TileSize; int index = exclusionTileMap[make_pair(x, y)]*HipContext::TileSize;
exclusionVec[index+offset1] &= allFlags-(static_cast<tileflags>(1)<<offset2); exclusionVec[index+offset1] &= allFlags-(1<<offset2);
} }
else { else {
int index = exclusionTileMap[make_pair(y, x)]*HipContext::TileSize; int index = exclusionTileMap[make_pair(y, x)]*HipContext::TileSize;
exclusionVec[index+offset2] &= allFlags-(static_cast<tileflags>(1)<<offset1); exclusionVec[index+offset2] &= allFlags-(1<<offset1);
} }
} }
} }
...@@ -413,7 +411,7 @@ void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeFor ...@@ -413,7 +411,7 @@ void HipNonbondedUtilities::computeInteractions(int forceGroups, bool includeFor
hipFunction_t& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel); hipFunction_t& kernel = (includeForces ? (includeEnergy ? kernels.forceEnergyKernel : kernels.forceKernel) : kernels.energyKernel);
if (kernel == NULL) if (kernel == NULL)
kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy); kernel = createInteractionKernel(kernels.source, parameters, arguments, true, true, forceGroups, includeForces, includeEnergy);
context.executeKernel(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); context.executeKernelFlat(kernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
} }
if (useCutoff && numTiles > 0) { if (useCutoff && numTiles > 0) {
hipEventSynchronize(downloadCountEvent); hipEventSynchronize(downloadCountEvent);
...@@ -445,7 +443,7 @@ bool HipNonbondedUtilities::updateNeighborListSize() { ...@@ -445,7 +443,7 @@ bool HipNonbondedUtilities::updateNeighborListSize() {
findInteractingBlocksArgs[7] = &interactingAtoms.getDevicePointer(); findInteractingBlocksArgs[7] = &interactingAtoms.getDevicePointer();
} }
if (pinnedCountBuffer[1] > maxSinglePairs) { if (pinnedCountBuffer[1] > maxSinglePairs) {
maxSinglePairs = (int) (1.2*pinnedCountBuffer[1]); maxSinglePairs = (unsigned int) (1.2*pinnedCountBuffer[1]);
singlePairs.resize(maxSinglePairs); singlePairs.resize(maxSinglePairs);
if (forceArgs.size() > 0) if (forceArgs.size() > 0)
forceArgs[19] = &singlePairs.getDevicePointer(); forceArgs[19] = &singlePairs.getDevicePointer();
...@@ -512,18 +510,6 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -512,18 +510,6 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_INTERACTION"] = source; replacements["COMPUTE_INTERACTION"] = source;
const string suffixes[] = {"x", "y", "z", "w"}; const string suffixes[] = {"x", "y", "z", "w"};
stringstream localData;
int localDataSize = 0;
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
localData<<param.getType()<<" "<<param.getName()<<";\n";
else {
for (int j = 0; j < param.getNumComponents(); ++j)
localData<<param.getComponentType()<<" "<<param.getName()<<"_"<<suffixes[j]<<";\n";
}
localDataSize += param.getSize();
}
replacements["ATOM_PARAMETER_DATA"] = localData.str();
stringstream args; stringstream args;
for (const ParameterInfo& param : params) { for (const ParameterInfo& param : params) {
args << ", "; args << ", ";
...@@ -556,101 +542,43 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -556,101 +542,43 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
} }
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str(); replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
bool useShuffle = context.getSupportsWarpShuffle();
// Part 1. Defines for on diagonal exclusion tiles // Part 1. Defines for on diagonal exclusion tiles
stringstream loadLocal1;
if(useShuffle) {
// not needed if using shuffles as we can directly fetch from register
} else {
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
loadLocal1<<"localData[LOCAL_ID]."<<param.getName()<<" = "<<param.getName()<<"1;\n";
else {
for (int j = 0; j < param.getNumComponents(); ++j)
loadLocal1<<"localData[LOCAL_ID]."<<param.getName()<<"_"<<suffixes[j]<<" = "<<param.getName()<<"1."<<suffixes[j]<<";\n";
}
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_1"] = loadLocal1.str();
stringstream broadcastWarpData; stringstream broadcastWarpData;
if (useShuffle) { broadcastWarpData << "posq2.x = SHFL(shflPosq.x, j);\n";
broadcastWarpData << "posq2.x = real_shfl(shflPosq.x, j);\n"; broadcastWarpData << "posq2.y = SHFL(shflPosq.y, j);\n";
broadcastWarpData << "posq2.y = real_shfl(shflPosq.y, j);\n"; broadcastWarpData << "posq2.z = SHFL(shflPosq.z, j);\n";
broadcastWarpData << "posq2.z = real_shfl(shflPosq.z, j);\n"; broadcastWarpData << "posq2.w = SHFL(shflPosq.w, j);\n";
broadcastWarpData << "posq2.w = real_shfl(shflPosq.w, j);\n"; for (const ParameterInfo& param : params) {
for (const ParameterInfo& param : params) { broadcastWarpData << param.getType() << " shfl" << param.getName() << ";\n";
broadcastWarpData << param.getType() << " shfl" << param.getName() << ";\n"; for (int j = 0; j < param.getNumComponents(); j++) {
for (int j = 0; j < param.getNumComponents(); j++) { if (param.getNumComponents() == 1)
if (param.getNumComponents() == 1) broadcastWarpData << "shfl" << param.getName() << "=SHFL(" << param.getName() <<"1,j);\n";
broadcastWarpData << "shfl" << param.getName() << "=real_shfl(" << param.getName() <<"1,j);\n"; else
else broadcastWarpData << "shfl" << param.getName()+"."+suffixes[j] << "=SHFL(" << param.getName()+"1."+suffixes[j] <<",j);\n";
broadcastWarpData << "shfl" << param.getName()+"."+suffixes[j] << "=real_shfl(" << param.getName()+"1."+suffixes[j] <<",j);\n";
}
} }
} else {
// not used if not shuffling
} }
replacements["BROADCAST_WARP_DATA"] = broadcastWarpData.str(); replacements["BROADCAST_WARP_DATA"] = broadcastWarpData.str();
// Part 2. Defines for off-diagonal exclusions, and neighborlist tiles. // Part 2. Defines for off-diagonal exclusions, and neighborlist tiles.
stringstream declareLocal2; stringstream declareLocal2;
if (useShuffle) { for (const ParameterInfo& param : params)
for (const ParameterInfo& param : params) declareLocal2<<param.getType()<<" shfl"<<param.getName()<<";\n";
declareLocal2<<param.getType()<<" shfl"<<param.getName()<<";\n";
}
else {
// not used if using shared memory
}
replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal2.str(); replacements["DECLARE_LOCAL_PARAMETERS"] = declareLocal2.str();
stringstream loadLocal2; stringstream loadLocal2;
if (useShuffle) { for (const ParameterInfo& param : params)
for (const ParameterInfo& param : params) loadLocal2<<"shfl"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
loadLocal2<<"shfl"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
}
else {
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
loadLocal2<<"localData[LOCAL_ID]."<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
else {
loadLocal2<<param.getType()<<" temp_"<<param.getName()<<" = global_"<<param.getName()<<"[j];\n";
for (int j = 0; j < param.getNumComponents(); ++j)
loadLocal2<<"localData[LOCAL_ID]."<<param.getName()<<"_"<<suffixes[j]<<" = temp_"<<param.getName()<<"."<<suffixes[j]<<";\n";
}
}
}
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str(); replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
stringstream load2j; stringstream load2j;
if (useShuffle) { for (const ParameterInfo& param : params)
for (const ParameterInfo& param : params) load2j<<param.getType()<<" "<<param.getName()<<"2 = shfl"<<param.getName()<<";\n";
load2j<<param.getType()<<" "<<param.getName()<<"2 = shfl"<<param.getName()<<";\n";
}
else {
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
load2j<<param.getType()<<" "<<param.getName()<<"2 = localData[atom2]."<<param.getName()<<";\n";
else {
load2j<<param.getType()<<" "<<param.getName()<<"2 = make_"<<param.getType()<<"(";
for (int j = 0; j < param.getNumComponents(); ++j) {
if (j > 0)
load2j<<", ";
load2j<<"localData[atom2]."<<param.getName()<<"_"<<suffixes[j];
}
load2j<<");\n";
}
}
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str(); replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
stringstream clearLocal; stringstream clearLocal;
for (const ParameterInfo& param : params) { for (const ParameterInfo& param : params) {
if (useShuffle) clearLocal<<"shfl";
clearLocal<<"shfl";
else
clearLocal<<"localData[atom2].";
clearLocal<<param.getName()<<" = "; clearLocal<<param.getName()<<" = ";
if (param.getNumComponents() == 1) if (param.getNumComponents() == 1)
clearLocal<<"0;\n"; clearLocal<<"0;\n";
...@@ -673,29 +601,10 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -673,29 +601,10 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
replacements["SAVE_DERIVATIVES"] = saveDerivs.str(); replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
stringstream shuffleWarpData; stringstream shuffleWarpData;
if(useShuffle) { shuffleWarpData << "shflPosq = warpRotateLeft<TILE_SIZE>(shflPosq);\n";
shuffleWarpData << "shflPosq.x = real_shfl(shflPosq.x, tgx+1);\n"; shuffleWarpData << "shflForce = warpRotateLeft<TILE_SIZE>(shflForce);\n";
shuffleWarpData << "shflPosq.y = real_shfl(shflPosq.y, tgx+1);\n"; for (const ParameterInfo& param : params) {
shuffleWarpData << "shflPosq.z = real_shfl(shflPosq.z, tgx+1);\n"; shuffleWarpData<<"shfl"<<param.getName()<<"=warpRotateLeft<TILE_SIZE>(shfl"<<param.getName()<<");\n";
shuffleWarpData << "shflPosq.w = real_shfl(shflPosq.w, tgx+1);\n";
shuffleWarpData << "shflForce.x = real_shfl(shflForce.x, tgx+1);\n";
shuffleWarpData << "shflForce.y = real_shfl(shflForce.y, tgx+1);\n";
shuffleWarpData << "shflForce.z = real_shfl(shflForce.z, tgx+1);\n";
for (const ParameterInfo& param : params) {
if (param.getNumComponents() == 1)
shuffleWarpData<<"shfl"<<param.getName()<<"=real_shfl(shfl"<<param.getName()<<", tgx+1);\n";
else {
for (int j = 0; j < param.getNumComponents(); j++) {
// looks something like shflsigmaEpsilon.x = real_shfl(shflsigmaEpsilon.x,tgx+1);
shuffleWarpData<<"shfl"<<param.getName()
<<"."<<suffixes[j]<<"=real_shfl(shfl"
<<param.getName()<<"."<<suffixes[j]
<<", tgx+1);\n";
}
}
}
} else {
// not used otherwise
} }
replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str(); replacements["SHUFFLE_WARP_DATA"] = shuffleWarpData.str();
...@@ -708,8 +617,7 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -708,8 +617,7 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
defines["USE_EXCLUSIONS"] = "1"; defines["USE_EXCLUSIONS"] = "1";
if (isSymmetric) if (isSymmetric)
defines["USE_SYMMETRIC"] = "1"; defines["USE_SYMMETRIC"] = "1";
if (useShuffle) defines["ENABLE_SHUFFLE"] = "1"; // Used only in hippoNonbonded.cc
defines["ENABLE_SHUFFLE"] = "1";
if (includeForces) if (includeForces)
defines["INCLUDE_FORCES"] = "1"; defines["INCLUDE_FORCES"] = "1";
if (includeEnergy) if (includeEnergy)
...@@ -725,6 +633,7 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -725,6 +633,7 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
} }
} }
defines["MAX_CUTOFF"] = context.doubleToString(maxCutoff); defines["MAX_CUTOFF"] = context.doubleToString(maxCutoff);
defines["MAX_CUTOFF_SQUARED"] = context.doubleToString(maxCutoff*maxCutoff);
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms()); defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
...@@ -736,8 +645,6 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -736,8 +645,6 @@ hipFunction_t HipNonbondedUtilities::createInteractionKernel(const string& sourc
int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts; int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts;
defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex); defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex);
defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex); defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
hipModule_t program = context.createModule(HipKernelSources::vectorOps+context.replaceStrings(kernelSource, replacements), defines); hipModule_t program = context.createModule(HipKernelSources::vectorOps+context.replaceStrings(kernelSource, replacements), defines);
hipFunction_t kernel = context.getKernel(program, "computeNonbonded"); hipFunction_t kernel = context.getKernel(program, "computeNonbonded");
return kernel; return kernel;
......
...@@ -16,11 +16,36 @@ ...@@ -16,11 +16,36 @@
#define GROUP_ID blockIdx.x #define GROUP_ID blockIdx.x
#define NUM_GROUPS gridDim.x #define NUM_GROUPS gridDim.x
#define SYNC_THREADS __syncthreads(); #define SYNC_THREADS __syncthreads();
#define SYNC_WARPS {__builtin_amdgcn_wave_barrier(); __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "wavefront");}
#define MEM_FENCE __threadfence_block(); #define MEM_FENCE __threadfence_block();
#define ATOMIC_ADD(dest, value) atomicAdd(dest, value) #define ATOMIC_ADD(dest, value) atomicAdd(dest, value)
typedef long long mm_long; typedef long long mm_long;
typedef unsigned long long mm_ulong; typedef unsigned long long mm_ulong;
#define SUPPORTS_64_BIT_ATOMICS __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ #define SUPPORTS_DOUBLE_PRECISION 1
#define SUPPORTS_DOUBLE_PRECISION __HIP_ARCH_HAS_DOUBLES__
#ifdef USE_DOUBLE_PRECISION
__device__ inline long long realToFixedPoint(double x) {
return static_cast<long long>(x * 0x100000000);
}
#else
__device__ inline long long realToFixedPoint(float x) {
// Faster way to calculate static_cast<long long>(x * 0x100000000) with exactly the same
// results but less instructions.
float integral = truncf(x);
float fractional = (x - integral) * 0x100000000;
unsigned int integral_u32 = static_cast<int>(integral);
unsigned int fractional_u32 = static_cast<unsigned int>(fabsf(fractional));
// A negative real number (with non-zero fractional) needs rounding-down x for integral and
// changing fractional's sign. However, -1 is used as a threshold instead of 0 because, when
// fractional is in (-1; 0], fractional_u32 is 0 and the number is considered an integer.
bool isNegReal = fractional <= -1.0f;
return (static_cast<unsigned long long>(isNegReal ? integral_u32 - 1 : integral_u32) << 32) |
static_cast<unsigned long long>(isNegReal ? 0 - fractional_u32 : fractional_u32);
}
#endif
/**
* This file contains the device function for using cross-lane operations (ballot and shuffle)
*/
#if defined(TILE_SIZE)
#if !defined(AMD_RDNA)
// Two subwarps per warp
#define SHFL(var, srcLane) __shfl(var, (srcLane) & (TILE_SIZE - 1), TILE_SIZE)
#define BALLOT(var) (unsigned int)(__ballot(var) >> (threadIdx.x & ((64 - 1) ^ (TILE_SIZE - 1))))
#else
#define SHFL(var, srcLane) __shfl(var, srcLane)
#define BALLOT(var) __ballot(var)
#endif
#endif
template<class T>
static __inline__ __device__
T warpShuffle(const T& input, const int src_lane) {
static_assert(sizeof(T) % sizeof(int) == 0, "incorrect type size");
constexpr int words_no = sizeof(T) / sizeof(int);
T output;
#pragma unroll
for(int i = 0; i < words_no; i++) {
int word;
__builtin_memcpy(&word, reinterpret_cast<const char*>(&input) + i * sizeof(int), sizeof(int));
word = __builtin_amdgcn_ds_bpermute(src_lane << 2, word);
__builtin_memcpy(reinterpret_cast<char*>(&output) + i * sizeof(int), &word, sizeof(int));
}
return output;
}
template<int Subwarp, class T>
static __inline__ __device__
typename std::enable_if<(Subwarp == warpSize), T>::type
warpRotateLeft(const T& input) {
return warpShuffle(input, threadIdx.x + 1);
}
template<int Subwarp, class T>
static __inline__ __device__
typename std::enable_if<!(Subwarp == warpSize), T>::type
warpRotateLeft(const T& input) {
return warpShuffle(input, ((threadIdx.x + 1) & (Subwarp - 1)) | (threadIdx.x & ~(Subwarp - 1)));
}
#ifndef ENABLE_SHUFFLE
typedef struct {
real x, y, z;
real q;
real fx, fy, fz;
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
real padding;
#endif
} AtomData;
#endif
#ifdef ENABLE_SHUFFLE
#define real_shfl SHFL
#endif
/** /**
* Compute nonbonded interactions. The kernel is separated into two parts, * Compute nonbonded interactions. The kernel is separated into two parts,
* tiles with exclusions and tiles without exclusions. It relies heavily on * tiles with exclusions and tiles without exclusions. It relies heavily on
...@@ -75,7 +59,7 @@ typedef struct { ...@@ -75,7 +59,7 @@ typedef struct {
* [in]interactingAtoms - a list of interactions within a given tile * [in]interactingAtoms - a list of interactions within a given tile
* *
*/ */
extern "C" __global__ void computeNonbonded( extern "C" __launch_bounds__(THREAD_BLOCK_SIZE) __global__ void computeNonbonded(
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions, unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices const int2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned long long numTileIndices
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
...@@ -86,21 +70,15 @@ extern "C" __global__ void computeNonbonded( ...@@ -86,21 +70,15 @@ extern "C" __global__ void computeNonbonded(
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex const unsigned int warp = blockIdx.x*(THREAD_BLOCK_SIZE/TILE_SIZE) + threadIdx.x/TILE_SIZE; // global warpIndex
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
mixed energy = 0; mixed energy = 0;
INIT_DERIVATIVES INIT_DERIVATIVES
// used shared memory if the device cannot shuffle
#ifndef ENABLE_SHUFFLE
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
#endif
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; for (int pos = FIRST_EXCLUSION_TILE+warp; pos < LAST_EXCLUSION_TILE; pos+=totalWarps) {
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const int2 tileIndices = exclusionTiles[pos]; const int2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
...@@ -111,62 +89,57 @@ extern "C" __global__ void computeNonbonded( ...@@ -111,62 +89,57 @@ extern "C" __global__ void computeNonbonded(
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
tileflags excl = exclusions[pos*TILE_SIZE+tgx]; tileflags excl = exclusions[pos*TILE_SIZE+tgx];
#endif #endif
const bool hasExclusions = true;
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
#ifdef ENABLE_SHUFFLE
real4 shflPosq = posq1; real4 shflPosq = posq1;
#else
localData[threadIdx.x].x = posq1.x;
localData[threadIdx.x].y = posq1.y;
localData[threadIdx.x].z = posq1.z;
localData[threadIdx.x].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
#endif
// we do not need to fetch parameters from global since this is a symmetric tile // we do not need to fetch parameters from global since this is a symmetric tile
// instead we can broadcast the values using shuffle // instead we can broadcast the values using shuffle
#pragma unroll 2
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j;
real4 posq2; real4 posq2;
#ifdef ENABLE_SHUFFLE
BROADCAST_WARP_DATA BROADCAST_WARP_DATA
#else
posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2); #ifdef USE_CUTOFF
real r = r2*invR; if (r2 < MAX_CUTOFF_SQUARED) {
LOAD_ATOM2_PARAMETERS #endif
atom2 = y*TILE_SIZE+j; real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
int atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
real dEdR = 0.0f; real dEdR = 0.0f;
#else #else
real3 dEdR1 = make_real3(0); real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0); real3 dEdR2 = make_real3(0);
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 1)); bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && (excl & 1));
#endif
real tempEnergy = 0.0f;
const real interactionScale = 0.5f;
COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
energy += 0.5f*tempEnergy;
#endif #endif
real tempEnergy = 0.0f;
const real interactionScale = 0.5f;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
force.x -= delta.x*dEdR; force.x -= delta.x*dEdR;
force.y -= delta.y*dEdR; force.y -= delta.y*dEdR;
force.z -= delta.z*dEdR; force.z -= delta.z*dEdR;
#else #else
force.x -= dEdR1.x; force.x -= dEdR1.x;
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#endif #endif
#endif #endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif #endif
...@@ -175,117 +148,88 @@ extern "C" __global__ void computeNonbonded( ...@@ -175,117 +148,88 @@ extern "C" __global__ void computeNonbonded(
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
int atomIndex = j;
real4 shflPosq = posq[j]; real4 shflPosq = posq[j];
#ifdef ENABLE_SHUFFLE
real3 shflForce; real3 shflForce;
shflForce.x = 0.0f; shflForce.x = 0.0f;
shflForce.y = 0.0f; shflForce.y = 0.0f;
shflForce.z = 0.0f; shflForce.z = 0.0f;
#else
localData[threadIdx.x].x = shflPosq.x;
localData[threadIdx.x].y = shflPosq.y;
localData[threadIdx.x].z = shflPosq.z;
localData[threadIdx.x].q = shflPosq.w;
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
#endif
DECLARE_LOCAL_PARAMETERS DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx; #pragma unroll 2
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
#ifdef ENABLE_SHUFFLE
real4 posq2 = shflPosq; real4 posq2 = shflPosq;
#else
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2); #ifdef USE_CUTOFF
real r = r2*invR; if (r2 < MAX_CUTOFF_SQUARED) {
LOAD_ATOM2_PARAMETERS #endif
atom2 = y*TILE_SIZE+tj; real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
int atom2 = atomIndex;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
real dEdR = 0.0f; real dEdR = 0.0f;
#else #else
real3 dEdR1 = make_real3(0); real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0); real3 dEdR2 = make_real3(0);
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 1)); bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && (excl & 1));
#endif
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
energy += tempEnergy;
#endif #endif
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta *= dEdR; delta *= dEdR;
force.x -= delta.x; force.x -= delta.x;
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE shflForce.x += delta.x;
shflForce.x += delta.x; shflForce.y += delta.y;
shflForce.y += delta.y; shflForce.z += delta.z;
shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#endif
#else // !USE_SYMMETRIC #else // !USE_SYMMETRIC
force.x -= dEdR1.x; force.x -= dEdR1.x;
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE shflForce.x += dEdR2.x;
shflForce.x += dEdR2.x; shflForce.y += dEdR2.y;
shflForce.y += dEdR2.y; shflForce.z += dEdR2.z;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC #endif // end USE_SYMMETRIC
#endif #endif
#ifdef ENABLE_SHUFFLE #ifdef USE_CUTOFF
SHUFFLE_WARP_DATA }
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif #endif
// cycles the indices SHUFFLE_WARP_DATA
// 0 1 2 3 4 5 6 7 -> 1 2 3 4 5 6 7 0 atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
tj = (tj + 1) & (TILE_SIZE - 1);
} }
const unsigned int offset = y*TILE_SIZE + tgx; const unsigned int offset = atomIndex;
// write results for off diagonal tiles // write results for off diagonal tiles
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
#ifdef ENABLE_SHUFFLE atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
#else
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
#endif
#endif #endif
} }
// Write results for on and off diagonal tiles // Write results for on and off diagonal tiles
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
const unsigned int offset = x*TILE_SIZE + tgx; const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>(realToFixedPoint(force.x)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
#endif #endif
} }
...@@ -296,22 +240,21 @@ extern "C" __global__ void computeNonbonded( ...@@ -296,22 +240,21 @@ extern "C" __global__ void computeNonbonded(
const unsigned int numTiles = interactionCount[0]; const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(long long)numTiles/totalWarps); for (unsigned int pos0 = warp; pos0 < LAST_EXCLUSION_TILE+numTiles; pos0+=totalWarps) {
int end = (int) ((warp+1)*(long long)numTiles/totalWarps); // Skip warps that may be still busy in the first loop
if (pos0 < LAST_EXCLUSION_TILE) {
continue;
}
const unsigned int pos = pos0-LAST_EXCLUSION_TILE;
#else #else
int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps); int pos = (int) (startTileIndex+warp*numTileIndices/totalWarps);
int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps); int end = (int) (startTileIndex+(warp+1)*numTileIndices/totalWarps);
#endif
int skipBase = 0; int skipBase = 0;
int currentSkipIndex = tbx; int currentSkipIndex = tbx;
// atomIndices can probably be shuffled as well int skipTiles;
// but it probably wouldn't make things any faster skipTiles = -1;
__shared__ int atomIndices[THREAD_BLOCK_SIZE]; for (; pos < end; pos++) {
__shared__ volatile int skipTiles[THREAD_BLOCK_SIZE]; #endif
skipTiles[threadIdx.x] = -1;
while (pos < end) {
const bool hasExclusions = false;
real3 force = make_real3(0); real3 force = make_real3(0);
bool includeTile = true; bool includeTile = true;
...@@ -334,63 +277,44 @@ extern "C" __global__ void computeNonbonded( ...@@ -334,63 +277,44 @@ extern "C" __global__ void computeNonbonded(
// Skip over tiles that have exclusions, since they were already processed. // Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) { while (SHFL(skipTiles, TILE_SIZE-1) < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
int2 tile = exclusionTiles[skipBase+tgx]; int2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
skipTiles[threadIdx.x] = end; skipTiles = end;
skipBase += TILE_SIZE; skipBase += TILE_SIZE;
currentSkipIndex = tbx; currentSkipIndex = 0;
} }
while (skipTiles[currentSkipIndex] < pos) while (SHFL(skipTiles, currentSkipIndex) < pos)
currentSkipIndex++; currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos); includeTile = (SHFL(skipTiles, currentSkipIndex) != pos);
#endif #endif
if (includeTile) { if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile. // Load atom data for this tile.
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
//const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx]; unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[threadIdx.x] = j; int atomIndex = j;
#ifdef ENABLE_SHUFFLE
DECLARE_LOCAL_PARAMETERS DECLARE_LOCAL_PARAMETERS
real4 shflPosq; real4 shflPosq;
real3 shflForce; real3 shflForce;
shflForce.x = 0.0f; shflForce.x = 0.0f;
shflForce.y = 0.0f; shflForce.y = 0.0f;
shflForce.z = 0.0f; shflForce.z = 0.0f;
#endif
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
// Load position of atom j from from global memory // Load position of atom j from from global memory
#ifdef ENABLE_SHUFFLE
shflPosq = posq[j]; shflPosq = posq[j];
#else
localData[threadIdx.x].x = posq[j].x;
localData[threadIdx.x].y = posq[j].y;
localData[threadIdx.x].z = posq[j].z;
localData[threadIdx.x].q = posq[j].w;
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
#endif
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
else { else {
#ifdef ENABLE_SHUFFLE
shflPosq = make_real4(0, 0, 0, 0); shflPosq = make_real4(0, 0, 0, 0);
#else
localData[threadIdx.x].x = 0;
localData[threadIdx.x].y = 0;
localData[threadIdx.x].z = 0;
#endif
CLEAR_LOCAL_PARAMETERS CLEAR_LOCAL_PARAMETERS
} }
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -399,171 +323,132 @@ extern "C" __global__ void computeNonbonded( ...@@ -399,171 +323,132 @@ extern "C" __global__ void computeNonbonded(
// box, then skip having to apply periodic boundary conditions later. // box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
#ifdef ENABLE_SHUFFLE
APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
#else #pragma unroll 2
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
#endif
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
#ifdef ENABLE_SHUFFLE
real4 posq2 = shflPosq; real4 posq2 = shflPosq;
#else
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2); #ifdef USE_CUTOFF
real r = r2*invR; if (r2 < MAX_CUTOFF_SQUARED) {
LOAD_ATOM2_PARAMETERS #endif
atom2 = atomIndices[tbx+tj]; real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
int atom2 = atomIndex;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
real dEdR = 0.0f; real dEdR = 0.0f;
#else #else
real3 dEdR1 = make_real3(0); real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0); real3 dEdR2 = make_real3(0);
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS); bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS);
#endif
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
energy += tempEnergy;
#endif #endif
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta *= dEdR; delta *= dEdR;
force.x -= delta.x; force.x -= delta.x;
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE shflForce.x += delta.x;
shflForce.x += delta.x; shflForce.y += delta.y;
shflForce.y += delta.y; shflForce.z += delta.z;
shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#endif
#else // !USE_SYMMETRIC #else // !USE_SYMMETRIC
force.x -= dEdR1.x; force.x -= dEdR1.x;
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE shflForce.x += dEdR2.x;
shflForce.x += dEdR2.x; shflForce.y += dEdR2.y;
shflForce.y += dEdR2.y; shflForce.z += dEdR2.z;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC #endif // end USE_SYMMETRIC
#endif #endif
#ifdef ENABLE_SHUFFLE #ifdef USE_CUTOFF
SHUFFLE_WARP_DATA }
#endif #endif
tj = (tj + 1) & (TILE_SIZE - 1); SHUFFLE_WARP_DATA
atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
} }
} }
else else
#endif #endif
{ {
// We need to apply periodic boundary conditions separately for each interaction. // We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx; #pragma unroll 2
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
#ifdef ENABLE_SHUFFLE
real4 posq2 = shflPosq; real4 posq2 = shflPosq;
#else
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2); #ifdef USE_CUTOFF
real r = r2*invR; if (r2 < MAX_CUTOFF_SQUARED) {
LOAD_ATOM2_PARAMETERS #endif
atom2 = atomIndices[tbx+tj]; real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
int atom2 = atomIndex;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
real dEdR = 0.0f; real dEdR = 0.0f;
#else #else
real3 dEdR1 = make_real3(0); real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0); real3 dEdR2 = make_real3(0);
#endif #endif
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS); bool isExcluded = !(atom1 < NUM_ATOMS && atom2 < NUM_ATOMS);
#endif
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
#ifdef INCLUDE_ENERGY
energy += tempEnergy;
#endif #endif
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta *= dEdR; delta *= dEdR;
force.x -= delta.x; force.x -= delta.x;
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
#ifdef ENABLE_SHUFFLE shflForce.x += delta.x;
shflForce.x += delta.x; shflForce.y += delta.y;
shflForce.y += delta.y; shflForce.z += delta.z;
shflForce.z += delta.z;
#else
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#endif
#else // !USE_SYMMETRIC #else // !USE_SYMMETRIC
force.x -= dEdR1.x; force.x -= dEdR1.x;
force.y -= dEdR1.y; force.y -= dEdR1.y;
force.z -= dEdR1.z; force.z -= dEdR1.z;
#ifdef ENABLE_SHUFFLE shflForce.x += dEdR2.x;
shflForce.x += dEdR2.x; shflForce.y += dEdR2.y;
shflForce.y += dEdR2.y; shflForce.z += dEdR2.z;
shflForce.z += dEdR2.z;
#else
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC #endif // end USE_SYMMETRIC
#endif #endif
#ifdef ENABLE_SHUFFLE #ifdef USE_CUTOFF
SHUFFLE_WARP_DATA }
#endif #endif
tj = (tj + 1) & (TILE_SIZE - 1); SHUFFLE_WARP_DATA
atomIndex = warpRotateLeft<TILE_SIZE>(atomIndex);
} }
} }
// Write results. // Write results.
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000))); atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>(realToFixedPoint(force.x)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000))); atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.y)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000))); atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(force.z)));
#ifdef USE_CUTOFF unsigned int atom2 = atomIndex;
unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef ENABLE_SHUFFLE atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(shflForce.x)));
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (shflForce.x*0x100000000))); atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.y)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.y*0x100000000))); atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(shflForce.z)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (shflForce.z*0x100000000)));
#else
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
#endif
} }
#endif #endif
} }
pos++;
} }
// Third loop: single pairs that aren't part of a tile. // Third loop: single pairs that aren't part of a tile.
...@@ -578,44 +463,45 @@ extern "C" __global__ void computeNonbonded( ...@@ -578,44 +463,45 @@ extern "C" __global__ void computeNonbonded(
int atom2 = pair.y; int atom2 = pair.y;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real4 posq2 = posq[atom2]; real4 posq2 = posq[atom2];
LOAD_ATOM1_PARAMETERS
int j = atom2;
atom2 = threadIdx.x;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_ATOM2_PARAMETERS
atom2 = pair.y;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2); if (r2 < MAX_CUTOFF_SQUARED) {
real r = r2*invR; LOAD_ATOM1_PARAMETERS
int j = atom2;
DECLARE_LOCAL_PARAMETERS
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_ATOM2_PARAMETERS
real invR = RSQRT(r2);
real r = r2*invR;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
real dEdR = 0.0f; real dEdR = 0.0f;
#else #else
real3 dEdR1 = make_real3(0); real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0); real3 dEdR2 = make_real3(0);
#endif #endif
bool hasExclusions = false; bool isExcluded = false;
bool isExcluded = false; real tempEnergy = 0.0f;
real tempEnergy = 0.0f; const real interactionScale = 1.0f;
const real interactionScale = 1.0f; COMPUTE_INTERACTION
COMPUTE_INTERACTION #ifdef INCLUDE_ENERGY
energy += tempEnergy; energy += tempEnergy;
#endif
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
real3 dEdR1 = delta*dEdR; real3 dEdR1 = delta*dEdR;
real3 dEdR2 = -dEdR1; real3 dEdR2 = -dEdR1;
#endif #endif
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (-dEdR1.x*0x100000000))); atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.x)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.y*0x100000000))); atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.y)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR1.z*0x100000000))); atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR1.z)));
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (-dEdR2.x*0x100000000))); atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.x)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.y*0x100000000))); atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.y)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (-dEdR2.z*0x100000000))); atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(realToFixedPoint(-dEdR2.z)));
#endif #endif
}
} }
#endif #endif
#ifdef INCLUDE_ENERGY #ifdef INCLUDE_ENERGY
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment