Commit 61d5cc0f authored by Peter's avatar Peter
Browse files

Merge branch 'master' into applecl

parents e2999354 afae4bc8
......@@ -58,7 +58,7 @@ void CudaArray::upload(const void* data, bool blocking) {
if (blocking)
result = cuMemcpyHtoD(pointer, data, size*elementSize);
else
result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, 0);
result = cuMemcpyHtoDAsync(pointer, data, size*elementSize, context.getCurrentStream());
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error uploading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
......@@ -71,7 +71,7 @@ void CudaArray::download(void* data, bool blocking) const {
if (blocking)
result = cuMemcpyDtoH(data, pointer, size*elementSize);
else
result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, 0);
result = cuMemcpyDtoHAsync(data, pointer, size*elementSize, context.getCurrentStream());
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error downloading array "<<name<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
......@@ -82,7 +82,7 @@ void CudaArray::download(void* data, bool blocking) const {
void CudaArray::copyTo(CudaArray& dest) const {
if (dest.getSize() != size || dest.getElementSize() != elementSize)
throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, 0);
CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, context.getCurrentStream());
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -33,6 +33,7 @@
#include "CudaBondedUtilities.h"
#include "CudaForceInfo.h"
#include "CudaIntegrationUtilities.h"
#include "CudaKernels.h"
#include "CudaKernelSources.h"
#include "CudaNonbondedUtilities.h"
#include "SHA1.h"
......@@ -72,10 +73,19 @@ const int CudaContext::TileSize = sizeof(tileflags)*8;
bool CudaContext::hasInitializedCuda = false;
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), pinnedBuffer(NULL), posq(NULL),
posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
const string& tempDir, const std::string& hostCompiler, CudaPlatform::PlatformData& platformData) : system(system), currentStream(0),
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), contextIsValid(false), atomsWereReordered(false), hasCompilerKernel(false),
pinnedBuffer(NULL), posq(NULL), posqCorrection(NULL), velm(NULL), force(NULL), energyBuffer(NULL), integration(NULL), expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
this->compiler = "\""+compiler+"\"";
if (platformData.context != NULL) {
try {
compilerKernel = platformData.context->getPlatform().createKernel(CudaCompilerKernel::Name(), *platformData.context);
hasCompilerKernel = true;
}
catch (...) {
// The runtime compiler plugin isn't available.
}
}
if (hostCompiler.size() > 0)
this->compiler = compiler+" --compiler-bindir "+hostCompiler;
if (!hasInitializedCuda) {
......@@ -136,6 +146,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
this->deviceIndex = deviceIndex;
int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
// This is a workaround to support GTX 980 with CUDA 6.5. It reports its compute capability
// as 5.2, but the compiler doesn't support anything beyond 5.0. We can remove this once
// CUDA 7.0 is released.
if (major == 5)
minor = 0;
gpuArchitecture = intToString(major)+intToString(minor);
computeCapability = major+0.1*minor;
if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
......@@ -149,6 +164,16 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT(cuCtxCreate(&context, flags, device));
contextIsValid = true;
CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
if (contextIndex > 0) {
int canAccess;
cuDeviceCanAccessPeer(&canAccess, getDevice(), platformData.contexts[0]->getDevice());
if (canAccess) {
platformData.contexts[0]->setAsCurrent();
CHECK_RESULT(cuCtxEnablePeerAccess(getContext(), 0));
setAsCurrent();
CHECK_RESULT(cuCtxEnablePeerAccess(platformData.contexts[0]->getContext(), 0));
}
}
numAtoms = system.getNumParticles();
paddedNumAtoms = TileSize*((numAtoms+TileSize-1)/TileSize);
numAtomBlocks = (paddedNumAtoms+(TileSize-1))/TileSize;
......@@ -218,6 +243,66 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
// Set defines for applying periodic boundary conditions.
Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
boxVectors[1][0] != 0.0 || boxVectors[1][2] != 0.0 ||
boxVectors[2][0] != 0.0 || boxVectors[2][1] != 0.0);
if (boxIsTriclinic) {
compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
"{"
"real scale3 = floor(delta.z*invPeriodicBoxSize.z+0.5f); \\\n"
"delta.x -= scale3*periodicBoxVecZ.x; \\\n"
"delta.y -= scale3*periodicBoxVecZ.y; \\\n"
"delta.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor(delta.y*invPeriodicBoxSize.y+0.5f); \\\n"
"delta.x -= scale2*periodicBoxVecY.x; \\\n"
"delta.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor(delta.x*invPeriodicBoxSize.x+0.5f); \\\n"
"delta.x -= scale1*periodicBoxVecX.x;}";
compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
"{"
"real scale3 = floor(pos.z*invPeriodicBoxSize.z); \\\n"
"pos.x -= scale3*periodicBoxVecZ.x; \\\n"
"pos.y -= scale3*periodicBoxVecZ.y; \\\n"
"pos.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor(pos.y*invPeriodicBoxSize.y); \\\n"
"pos.x -= scale2*periodicBoxVecY.x; \\\n"
"pos.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor(pos.x*invPeriodicBoxSize.x); \\\n"
"pos.x -= scale1*periodicBoxVecX.x;}";
compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
"{"
"real scale3 = floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f); \\\n"
"pos.x -= scale3*periodicBoxVecZ.x; \\\n"
"pos.y -= scale3*periodicBoxVecZ.y; \\\n"
"pos.z -= scale3*periodicBoxVecZ.z; \\\n"
"real scale2 = floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f); \\\n"
"pos.x -= scale2*periodicBoxVecY.x; \\\n"
"pos.y -= scale2*periodicBoxVecY.y; \\\n"
"real scale1 = floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f); \\\n"
"pos.x -= scale1*periodicBoxVecX.x;}";
}
else {
compilationDefines["APPLY_PERIODIC_TO_DELTA(delta)"] =
"{"
"delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
"delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
"delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
compilationDefines["APPLY_PERIODIC_TO_POS(pos)"] =
"{"
"pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; \\\n"
"pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; \\\n"
"pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;}";
compilationDefines["APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)"] =
"{"
"pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; \\\n"
"pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; \\\n"
"pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;}";
}
// Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread();
......@@ -433,7 +518,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
return module;
// Write out the source to a temporary file.
// Select names for the various temporary files.
stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
......@@ -445,6 +530,36 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
string inputFile = (tempDir+tempFileName.str()+".cu");
string outputFile = (tempDir+tempFileName.str()+".ptx");
string logFile = (tempDir+tempFileName.str()+".log");
int res = 0;
// If the runtime compiler plugin is available, use it.
if (hasCompilerKernel) {
string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
// If possible, write the PTX out to a temporary file so we can cache it for later use.
bool wroteCache = false;
try {
ofstream out(outputFile.c_str());
out << ptx;
out.close();
if (!out.fail())
wroteCache = true;
}
catch (...) {
// Ignore.
}
if (!wroteCache) {
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
return module;
}
}
else {
// Write out the source to a temporary file.
ofstream out(inputFile.c_str());
out << src.str();
out.close();
......@@ -457,8 +572,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
int res = compileInWindows(command);
#else
string command = compiler+" --ptx --machine "+bits+" -arch=sm_"+gpuArchitecture+" -o \""+outputFile+"\" "+options+" \""+inputFile+"\" 2> \""+logFile+"\"";
int res = std::system(command.c_str());
res = std::system(command.c_str());
#endif
}
try {
if (res != 0) {
// Load the error log.
......@@ -507,7 +623,19 @@ CUfunction CudaContext::getKernel(CUmodule& module, const string& name) {
return function;
}
string CudaContext::doubleToString(double value) {
CUstream CudaContext::getCurrentStream() {
return currentStream;
}
void CudaContext::setCurrentStream(CUstream stream) {
currentStream = stream;
}
void CudaContext::restoreDefaultStream() {
setCurrentStream(0);
}
string CudaContext::doubleToString(double value) const {
stringstream s;
s.precision(useDoublePrecision ? 16 : 8);
s << scientific << value;
......@@ -516,7 +644,7 @@ string CudaContext::doubleToString(double value) {
return s.str();
}
string CudaContext::intToString(int value) {
string CudaContext::intToString(int value) const {
stringstream s;
s << value;
return s.str();
......@@ -550,6 +678,7 @@ std::string CudaContext::getErrorString(CUresult result) {
case CUDA_ERROR_ECC_UNCORRECTABLE: return "CUDA_ERROR_ECC_UNCORRECTABLE";
case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUDA_ERROR_UNSUPPORTED_LIMIT";
case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
case CUDA_ERROR_INVALID_SOURCE: return "CUDA_ERROR_INVALID_SOURCE";
case CUDA_ERROR_FILE_NOT_FOUND: return "CUDA_ERROR_FILE_NOT_FOUND";
case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
......@@ -566,16 +695,22 @@ std::string CudaContext::getErrorString(CUresult result) {
case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
case CUDA_ERROR_CONTEXT_IS_DESTROYED: return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
case CUDA_ERROR_ASSERT: return "CUDA_ERROR_ASSERT";
case CUDA_ERROR_TOO_MANY_PEERS: return "CUDA_ERROR_TOO_MANY_PEERS";
case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
case CUDA_ERROR_NOT_PERMITTED: return "CUDA_ERROR_NOT_PERMITTED";
case CUDA_ERROR_NOT_SUPPORTED: return "CUDA_ERROR_NOT_SUPPORTED";
case CUDA_ERROR_UNKNOWN: return "CUDA_ERROR_UNKNOWN";
}
return "Invalid error code";
return "CUDA error";
}
void CudaContext::executeKernel(CUfunction kernel, void** arguments, int threads, int blockSize, unsigned int sharedSize) {
if (blockSize == -1)
blockSize = ThreadBlockSize;
int gridSize = std::min((threads+blockSize-1)/blockSize, numThreadBlocks);
CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, 0, arguments, NULL);
CUresult result = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize, 1, 1, sharedSize, currentStream, arguments, NULL);
if (result != CUDA_SUCCESS) {
stringstream str;
str<<"Error invoking kernel: "<<getErrorString(result)<<" ("<<result<<")";
......@@ -1044,16 +1179,21 @@ void CudaContext::reorderAtomsImpl() {
// Move each molecule position into the same box.
for (int i = 0; i < numMolecules; i++) {
int xcell = (int) floor(molPos[i].x*invPeriodicBoxSize.x);
int ycell = (int) floor(molPos[i].y*invPeriodicBoxSize.y);
int zcell = (int) floor(molPos[i].z*invPeriodicBoxSize.z);
Real dx = xcell*periodicBoxSize.x;
Real dy = ycell*periodicBoxSize.y;
Real dz = zcell*periodicBoxSize.z;
if (dx != 0.0f || dy != 0.0f || dz != 0.0f) {
molPos[i].x -= dx;
molPos[i].y -= dy;
molPos[i].z -= dz;
Real4 center = molPos[i];
int zcell = (int) floor(center.z*invPeriodicBoxSize.z);
center.x -= zcell*periodicBoxVecZ.x;
center.y -= zcell*periodicBoxVecZ.y;
center.z -= zcell*periodicBoxVecZ.z;
int ycell = (int) floor(center.y*invPeriodicBoxSize.y);
center.x -= ycell*periodicBoxVecY.x;
center.y -= ycell*periodicBoxVecY.y;
int xcell = (int) floor(center.x*invPeriodicBoxSize.x);
center.x -= xcell*periodicBoxVecX.x;
if (xcell != 0 || ycell != 0 || zcell != 0) {
Real dx = molPos[i].x-center.x;
Real dy = molPos[i].y-center.y;
Real dz = molPos[i].z-center.z;
molPos[i] = center;
for (int j = 0; j < (int) atoms.size(); j++) {
int atom = atoms[j]+mol.offsets[i];
Real4 p = oldPosq[atom];
......
......@@ -239,7 +239,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
if (derivOrder[0] == 0) {
out << "real x = " << getTempName(node.getChildren()[0], temps) << ";\n";
out << "if (x >= 0 && x < " << paramsInt[0] << ") {\n";
out << "int index = (int) round(x);\n";
out << "int index = (int) floor(x+0.5f);\n";
out << nodeNames[j] << " = " << functionNames[i].second << "[index];\n";
out << "}\n";
}
......@@ -249,8 +249,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
for (int j = 0; j < nodes.size(); j++) {
const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
if (derivOrder[0] == 0 && derivOrder[1] == 0) {
out << "int x = (int) round(" << getTempName(node.getChildren()[0], temps) << ");\n";
out << "int y = (int) round(" << getTempName(node.getChildren()[1], temps) << ");\n";
out << "int x = (int) floor(" << getTempName(node.getChildren()[0], temps) << "+0.5f);\n";
out << "int y = (int) floor(" << getTempName(node.getChildren()[1], temps) << "+0.5f);\n";
out << "int xsize = (int) " << paramsInt[0] << ";\n";
out << "int ysize = (int) " << paramsInt[1] << ";\n";
out << "int index = x+y*xsize;\n";
......@@ -263,9 +263,9 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
for (int j = 0; j < nodes.size(); j++) {
const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
if (derivOrder[0] == 0 && derivOrder[1] == 0 && derivOrder[2] == 0) {
out << "int x = (int) round(" << getTempName(node.getChildren()[0], temps) << ");\n";
out << "int y = (int) round(" << getTempName(node.getChildren()[1], temps) << ");\n";
out << "int z = (int) round(" << getTempName(node.getChildren()[2], temps) << ");\n";
out << "int x = (int) floor(" << getTempName(node.getChildren()[0], temps) << "+0.5f);\n";
out << "int y = (int) floor(" << getTempName(node.getChildren()[1], temps) << "+0.5f);\n";
out << "int z = (int) floor(" << getTempName(node.getChildren()[2], temps) << "+0.5f);\n";
out << "int xsize = (int) " << paramsInt[0] << ";\n";
out << "int ysize = (int) " << paramsInt[1] << ";\n";
out << "int zsize = (int) " << paramsInt[2] << ";\n";
......@@ -457,6 +457,12 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
case Operation::ABS:
out << "fabs(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::FLOOR:
out << "floor(" << getTempName(node.getChildren()[0], temps) << ")";
break;
case Operation::CEIL:
out << "ceil(" << getTempName(node.getChildren()[0], temps) << ")";
break;
default:
throw OpenMMException("Internal error: Unknown operation in user-defined expression: "+node.getOperation().getName());
}
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2014 Stanford University and the Authors. *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -27,10 +27,12 @@
#include "CudaIntegrationUtilities.h"
#include "CudaArray.h"
#include "CudaKernelSources.h"
#include "openmm/internal/OSRngSeed.h"
#include "openmm/HarmonicAngleForce.h"
#include "openmm/VirtualSite.h"
#include "quern.h"
#include "CudaExpressionUtilities.h"
#include "ReferenceCCMAAlgorithm.h"
#include <algorithm>
#include <cmath>
#include <cstdlib>
......@@ -303,157 +305,54 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
int numCCMA = (int) ccmaConstraints.size();
if (numCCMA > 0) {
vector<vector<int> > atomConstraints(context.getNumAtoms());
for (int i = 0; i < numCCMA; i++) {
atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
}
vector<vector<int> > linkedConstraints(numCCMA);
for (unsigned atom = 0; atom < atomConstraints.size(); atom++) {
for (unsigned i = 0; i < atomConstraints[atom].size(); i++)
for (unsigned j = 0; j < i; j++) {
int c1 = atomConstraints[atom][i];
int c2 = atomConstraints[atom][j];
linkedConstraints[c1].push_back(c2);
linkedConstraints[c2].push_back(c1);
}
}
int maxLinks = 0;
for (unsigned i = 0; i < linkedConstraints.size(); i++)
maxLinks = max(maxLinks, (int) linkedConstraints[i].size());
int maxAtomConstraints = 0;
for (unsigned i = 0; i < atomConstraints.size(); i++)
maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
// Record information needed by ReferenceCCMAAlgorithm.
// Compute the constraint coupling matrix
vector<vector<int> > atomAngles(numAtoms);
HarmonicAngleForce const* angleForce = NULL;
for (int i = 0; i < system.getNumForces() && angleForce == NULL; i++)
angleForce = dynamic_cast<HarmonicAngleForce const*>(&system.getForce(i));
if (angleForce != NULL)
for (int i = 0; i < angleForce->getNumAngles(); i++) {
int particle1, particle2, particle3;
vector<pair<int, int> > refIndices(numCCMA);
vector<RealOpenMM> refDistance(numCCMA);
for (int i = 0; i < numCCMA; i++) {
int index = ccmaConstraints[i];
refIndices[i] = make_pair(atom1[index], atom2[index]);
refDistance[i] = distance[index];
}
vector<RealOpenMM> refMasses(numAtoms);
for (int i = 0; i < numAtoms; ++i)
refMasses[i] = (RealOpenMM) system.getParticleMass(i);
// Look up angles for CCMA.
vector<ReferenceCCMAAlgorithm::AngleInfo> angles;
for (int i = 0; i < system.getNumForces(); i++) {
const HarmonicAngleForce* force = dynamic_cast<const HarmonicAngleForce*>(&system.getForce(i));
if (force != NULL) {
for (int j = 0; j < force->getNumAngles(); j++) {
int atom1, atom2, atom3;
double angle, k;
angleForce->getAngleParameters(i, particle1, particle2, particle3, angle, k);
atomAngles[particle2].push_back(i);
}
vector<vector<pair<int, double> > > matrix(numCCMA);
for (int j = 0; j < numCCMA; j++) {
for (int k = 0; k < numCCMA; k++) {
if (j == k) {
matrix[j].push_back(pair<int, double>(j, 1.0));
continue;
}
double scale;
int cj = ccmaConstraints[j];
int ck = ccmaConstraints[k];
int atomj0 = atom1[cj];
int atomj1 = atom2[cj];
int atomk0 = atom1[ck];
int atomk1 = atom2[ck];
int atoma, atomb, atomc;
double imj0 = 1.0/system.getParticleMass(atomj0);
double imj1 = 1.0/system.getParticleMass(atomj1);
if (atomj0 == atomk0) {
atoma = atomj1;
atomb = atomj0;
atomc = atomk1;
scale = imj0/(imj0+imj1);
}
else if (atomj1 == atomk1) {
atoma = atomj0;
atomb = atomj1;
atomc = atomk0;
scale = imj1/(imj0+imj1);
}
else if (atomj0 == atomk1) {
atoma = atomj1;
atomb = atomj0;
atomc = atomk0;
scale = imj0/(imj0+imj1);
}
else if (atomj1 == atomk0) {
atoma = atomj0;
atomb = atomj1;
atomc = atomk1;
scale = imj1/(imj0+imj1);
}
else
continue; // These constraints are not connected.
// Look for a third constraint forming a triangle with these two.
bool foundConstraint = false;
for (int m = 0; m < numCCMA; m++) {
int other = ccmaConstraints[m];
if ((atom1[other] == atoma && atom2[other] == atomc) || (atom1[other] == atomc && atom2[other] == atoma)) {
double d1 = distance[cj];
double d2 = distance[ck];
double d3 = distance[other];
matrix[j].push_back(pair<int, double>(k, scale*(d1*d1+d2*d2-d3*d3)/(2.0*d1*d2)));
foundConstraint = true;
break;
}
}
if (!foundConstraint && angleForce != NULL) {
// We didn't find one, so look for an angle force field term.
const vector<int>& angleCandidates = atomAngles[atomb];
for (vector<int>::const_iterator iter = angleCandidates.begin(); iter != angleCandidates.end(); iter++) {
int particle1, particle2, particle3;
double angle, ka;
angleForce->getAngleParameters(*iter, particle1, particle2, particle3, angle, ka);
if ((particle1 == atoma && particle3 == atomc) || (particle3 == atoma && particle1 == atomc)) {
matrix[j].push_back(pair<int, double>(k, scale*cos(angle)));
break;
}
force->getAngleParameters(j, atom1, atom2, atom3, angle, k);
angles.push_back(ReferenceCCMAAlgorithm::AngleInfo(atom1, atom2, atom3, (RealOpenMM) angle));
}
}
}
}
// Invert it using QR.
vector<int> matrixRowStart;
vector<int> matrixColIndex;
vector<double> matrixValue;
for (int i = 0; i < numCCMA; i++) {
matrixRowStart.push_back(matrixValue.size());
for (int j = 0; j < (int) matrix[i].size(); j++) {
pair<int, double> element = matrix[i][j];
matrixColIndex.push_back(element.first);
matrixValue.push_back(element.second);
}
}
matrixRowStart.push_back(matrixValue.size());
int *qRowStart, *qColIndex, *rRowStart, *rColIndex;
double *qValue, *rValue;
int result = QUERN_compute_qr(numCCMA, numCCMA, &matrixRowStart[0], &matrixColIndex[0], &matrixValue[0], NULL,
&qRowStart, &qColIndex, &qValue, &rRowStart, &rColIndex, &rValue);
vector<double> rhs(numCCMA);
matrix.clear();
matrix.resize(numCCMA);
for (int i = 0; i < numCCMA; i++) {
// Extract column i of the inverse matrix.
// Create a ReferenceCCMAAlgorithm. It will build and invert the constraint matrix for us.
for (int j = 0; j < numCCMA; j++)
rhs[j] = (i == j ? 1.0 : 0.0);
result = QUERN_multiply_with_q_transpose(numCCMA, qRowStart, qColIndex, qValue, &rhs[0]);
result = QUERN_solve_with_r(numCCMA, rRowStart, rColIndex, rValue, &rhs[0], &rhs[0]);
for (int j = 0; j < numCCMA; j++) {
double value = rhs[j]*distance[ccmaConstraints[i]]/distance[ccmaConstraints[j]];
if (abs(value) > 0.1)
matrix[j].push_back(pair<int, double>(i, value));
}
}
QUERN_free_result(qRowStart, qColIndex, qValue);
QUERN_free_result(rRowStart, rColIndex, rValue);
ReferenceCCMAAlgorithm ccma(numAtoms, numCCMA, refIndices, refDistance, refMasses, angles, 0.1);
vector<vector<pair<int, double> > > matrix = ccma.getMatrix();
int maxRowElements = 0;
for (unsigned i = 0; i < matrix.size(); i++)
maxRowElements = max(maxRowElements, (int) matrix[i].size());
maxRowElements++;
// Build the list of constraints for each atom.
vector<vector<int> > atomConstraints(context.getNumAtoms());
for (int i = 0; i < numCCMA; i++) {
atomConstraints[atom1[ccmaConstraints[i]]].push_back(i);
atomConstraints[atom2[ccmaConstraints[i]]].push_back(i);
}
int maxAtomConstraints = 0;
for (unsigned i = 0; i < atomConstraints.size(); i++)
maxAtomConstraints = max(maxAtomConstraints, (int) atomConstraints[i].size());
// Sort the constraints.
vector<int> constraintOrder(numCCMA);
......@@ -858,6 +757,7 @@ void CudaIntegrationUtilities::initRandomNumberGenerator(unsigned int randomNumb
vector<int4> seed(randomSeed->getSize());
unsigned int r = randomNumberSeed;
if (r == 0) r = (unsigned int) osrngseed();
for (int i = 0; i < randomSeed->getSize(); i++) {
seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2014 Stanford University and the Authors. *
* Portions copyright (c) 2008-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -36,6 +36,7 @@
#include "openmm/internal/CustomManyParticleForceImpl.h"
#include "openmm/internal/CustomNonbondedForceImpl.h"
#include "openmm/internal/NonbondedForceImpl.h"
#include "openmm/internal/OSRngSeed.h"
#include "CudaBondedUtilities.h"
#include "CudaExpressionUtilities.h"
#include "CudaIntegrationUtilities.h"
......@@ -103,7 +104,7 @@ void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool
nb.prepareInteractions();
}
double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups, bool& valid) {
cu.getBondedUtilities().computeInteractions(groups);
if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0)
cu.getNonbondedUtilities().computeInteractions();
......@@ -147,14 +148,15 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
const vector<int>& order = cu.getAtomIndex();
int numParticles = context.getSystem().getNumParticles();
positions.resize(numParticles);
double4 periodicBoxSize = cu.getPeriodicBoxSize();
Vec3 boxVectors[3];
cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
if (cu.getUseDoublePrecision()) {
double4* posq = (double4*) cu.getPinnedBuffer();
cu.getPosq().download(posq);
for (int i = 0; i < numParticles; ++i) {
double4 pos = posq[i];
int4 offset = cu.getPosCellOffsets()[i];
positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z);
positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
}
}
else if (cu.getUseMixedPrecision()) {
......@@ -166,7 +168,7 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
float4 pos1 = posq[i];
float4 pos2 = posCorrection[i];
int4 offset = cu.getPosCellOffsets()[i];
positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x-offset.x*periodicBoxSize.x, (double)pos1.y+(double)pos2.y-offset.y*periodicBoxSize.y, (double)pos1.z+(double)pos2.z-offset.z*periodicBoxSize.z);
positions[order[i]] = Vec3((double)pos1.x+(double)pos2.x, (double)pos1.y+(double)pos2.y, (double)pos1.z+(double)pos2.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
}
}
else {
......@@ -175,7 +177,7 @@ void CudaUpdateStateDataKernel::getPositions(ContextImpl& context, vector<Vec3>&
for (int i = 0; i < numParticles; ++i) {
float4 pos = posq[i];
int4 offset = cu.getPosCellOffsets()[i];
positions[order[i]] = Vec3(pos.x-offset.x*periodicBoxSize.x, pos.y-offset.y*periodicBoxSize.y, pos.z-offset.z*periodicBoxSize.z);
positions[order[i]] = Vec3(pos.x, pos.y, pos.z)-boxVectors[0]*offset.x-boxVectors[1]*offset.y-boxVectors[2]*offset.z;
}
}
}
......@@ -304,21 +306,18 @@ void CudaUpdateStateDataKernel::getForces(ContextImpl& context, vector<Vec3>& fo
}
void CudaUpdateStateDataKernel::getPeriodicBoxVectors(ContextImpl& context, Vec3& a, Vec3& b, Vec3& c) const {
double4 box = cu.getPeriodicBoxSize();
a = Vec3(box.x, 0, 0);
b = Vec3(0, box.y, 0);
c = Vec3(0, 0, box.z);
cu.getPeriodicBoxVectors(a, b, c);
}
void CudaUpdateStateDataKernel::setPeriodicBoxVectors(ContextImpl& context, const Vec3& a, const Vec3& b, const Vec3& c) const {
vector<CudaContext*>& contexts = cu.getPlatformData().contexts;
for (int i = 0; i < (int) contexts.size(); i++)
contexts[i]->setPeriodicBoxSize(a[0], b[1], c[2]);
contexts[i]->setPeriodicBoxVectors(a, b, c);
}
void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream& stream) {
cu.setAsCurrent();
int version = 1;
int version = 2;
stream.write((char*) &version, sizeof(int));
int precision = (cu.getUseDoublePrecision() ? 2 : cu.getUseMixedPrecision() ? 1 : 0);
stream.write((char*) &precision, sizeof(int));
......@@ -339,8 +338,9 @@ void CudaUpdateStateDataKernel::createCheckpoint(ContextImpl& context, ostream&
stream.write(buffer, cu.getVelm().getSize()*cu.getVelm().getElementSize());
stream.write((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
stream.write((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
double4 box = cu.getPeriodicBoxSize();
stream.write((char*) &box, sizeof(double4));
Vec3 boxVectors[3];
cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
stream.write((char*) boxVectors, 3*sizeof(Vec3));
cu.getIntegrationUtilities().createCheckpoint(stream);
SimTKOpenMMUtilities::createCheckpoint(stream);
}
......@@ -349,7 +349,7 @@ void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& st
cu.setAsCurrent();
int version;
stream.read((char*) &version, sizeof(int));
if (version != 1)
if (version != 2)
throw OpenMMException("Checkpoint was created with a different version of OpenMM");
int precision;
stream.read((char*) &precision, sizeof(int));
......@@ -379,10 +379,10 @@ void CudaUpdateStateDataKernel::loadCheckpoint(ContextImpl& context, istream& st
stream.read((char*) &cu.getAtomIndex()[0], sizeof(int)*cu.getAtomIndex().size());
cu.getAtomIndexArray().upload(cu.getAtomIndex());
stream.read((char*) &cu.getPosCellOffsets()[0], sizeof(int4)*cu.getPosCellOffsets().size());
double4 box;
stream.read((char*) &box, sizeof(double4));
Vec3 boxVectors[3];
stream.read((char*) &boxVectors, 3*sizeof(Vec3));
for (int i = 0; i < (int) contexts.size(); i++)
contexts[i]->setPeriodicBoxSize(box.x, box.y, box.z);
contexts[i]->setPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
cu.getIntegrationUtilities().loadCheckpoint(stream);
SimTKOpenMMUtilities::loadCheckpoint(stream);
for (int i = 0; i < cu.getReorderListeners().size(); i++)
......@@ -1127,7 +1127,7 @@ void CudaCalcCMAPTorsionForceKernel::initialize(const System& system, const CMAP
return;
int numMaps = force.getNumMaps();
vector<float4> coeffVec;
vector<int2> mapPositionsVec(numMaps);
mapPositionsVec.resize(numMaps);
vector<double> energy;
vector<vector<double> > c;
int currentPosition = 0;
......@@ -1166,6 +1166,49 @@ double CudaCalcCMAPTorsionForceKernel::execute(ContextImpl& context, bool includ
return 0.0;
}
void CudaCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force) {
int numMaps = force.getNumMaps();
int numContexts = cu.getPlatformData().contexts.size();
int startIndex = cu.getContextIndex()*force.getNumTorsions()/numContexts;
int endIndex = (cu.getContextIndex()+1)*force.getNumTorsions()/numContexts;
numTorsions = endIndex-startIndex;
if (mapPositions->getSize() != numMaps)
throw OpenMMException("updateParametersInContext: The number of maps has changed");
if (torsionMaps->getSize() != numTorsions)
throw OpenMMException("updateParametersInContext: The number of CMAP torsions has changed");
// Update the maps.
vector<float4> coeffVec;
vector<double> energy;
vector<vector<double> > c;
int currentPosition = 0;
for (int i = 0; i < numMaps; i++) {
int size;
force.getMapParameters(i, size, energy);
if (size != mapPositionsVec[i].y)
throw OpenMMException("updateParametersInContext: The size of a map has changed");
CMAPTorsionForceImpl::calcMapDerivatives(size, energy, c);
currentPosition += 4*size*size;
for (int j = 0; j < size*size; j++) {
coeffVec.push_back(make_float4((float) c[j][0], (float) c[j][1], (float) c[j][2], (float) c[j][3]));
coeffVec.push_back(make_float4((float) c[j][4], (float) c[j][5], (float) c[j][6], (float) c[j][7]));
coeffVec.push_back(make_float4((float) c[j][8], (float) c[j][9], (float) c[j][10], (float) c[j][11]));
coeffVec.push_back(make_float4((float) c[j][12], (float) c[j][13], (float) c[j][14], (float) c[j][15]));
}
}
coefficients->upload(coeffVec);
// Update the indices.
vector<int> torsionMapsVec(numTorsions);
for (int i = 0; i < numTorsions; i++) {
int index[8];
force.getTorsionParameters(i, torsionMapsVec[i], index[0], index[1], index[2], index[3], index[4], index[5], index[6], index[7]);
}
torsionMaps->upload(torsionMapsVec);
}
class CudaCustomTorsionForceInfo : public CudaForceInfo {
public:
CudaCustomTorsionForceInfo(const CustomTorsionForce& force) : force(force) {
......@@ -1377,8 +1420,8 @@ public:
PmePreComputation(CudaContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
Vec3 boxSize(cu.getPeriodicBoxSize().x, cu.getPeriodicBoxSize().y, cu.getPeriodicBoxSize().z);
pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxSize, includeEnergy);
Vec3 boxVectors[3] = {Vec3(cu.getPeriodicBoxSize().x, 0, 0), Vec3(0, cu.getPeriodicBoxSize().y, 0), Vec3(0, 0, cu.getPeriodicBoxSize().z)};
pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxVectors, includeEnergy);
}
private:
CudaContext& cu;
......@@ -1398,6 +1441,38 @@ private:
CalcPmeReciprocalForceKernel::IO& io;
};
class CudaCalcNonbondedForceKernel::SyncStreamPreComputation : public CudaContext::ForcePreComputation {
public:
SyncStreamPreComputation(CudaContext& cu, CUstream stream, CUevent event, int forceGroup) : cu(cu), stream(stream), event(event), forceGroup(forceGroup) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) {
cuEventRecord(event, cu.getCurrentStream());
cuStreamWaitEvent(stream, event, 0);
}
}
private:
CudaContext& cu;
CUstream stream;
CUevent event;
int forceGroup;
};
class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
public:
SyncStreamPostComputation(CudaContext& cu, CUevent event, int forceGroup) : cu(cu), event(event), forceGroup(forceGroup) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0)
cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
return 0.0;
}
private:
CudaContext& cu;
CUevent event;
int forceGroup;
};
CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
cu.setAsCurrent();
if (sigmaEpsilon != NULL)
......@@ -1427,6 +1502,10 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
if (hasInitializedFFT) {
cufftDestroy(fftForward);
cufftDestroy(fftBackward);
if (usePmeStream) {
cuStreamDestroy(pmeStream);
cuEventDestroy(pmeSyncEvent);
}
}
}
......@@ -1636,6 +1715,22 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
// Prepare for doing PME on its own stream.
int cufftVersion;
cufftGetVersion(&cufftVersion);
usePmeStream = (cu.getComputeCapability() < 5.0 && numParticles < 130000 && cufftVersion >= 6000); // Workarounds for various CUDA bugs
if (usePmeStream) {
cuStreamCreate(&pmeStream, CU_STREAM_NON_BLOCKING);
cufftSetStream(fftForward, pmeStream);
cufftSetStream(fftBackward, pmeStream);
CHECK_RESULT(cuEventCreate(&pmeSyncEvent, CU_EVENT_DISABLE_TIMING), "Error creating event for NonbondedForce");
int recipForceGroup = force.getReciprocalSpaceForceGroup();
if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, recipForceGroup));
}
hasInitializedFFT = true;
// Initialize the b-spline moduli.
......@@ -1752,13 +1847,45 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
}
if (directPmeGrid != NULL && includeReciprocal) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
if (usePmeStream)
cu.setCurrentStream(pmeStream);
// Invert the periodic box vectors.
Vec3 boxVectors[3];
cu.getPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
double determinant = boxVectors[0][0]*boxVectors[1][1]*boxVectors[2][2];
double scale = 1.0/determinant;
double3 recipBoxVectors[3];
recipBoxVectors[0] = make_double3(boxVectors[1][1]*boxVectors[2][2]*scale, 0, 0);
recipBoxVectors[1] = make_double3(-boxVectors[1][0]*boxVectors[2][2]*scale, boxVectors[0][0]*boxVectors[2][2]*scale, 0);
recipBoxVectors[2] = make_double3((boxVectors[1][0]*boxVectors[2][1]-boxVectors[1][1]*boxVectors[2][0])*scale, -boxVectors[0][0]*boxVectors[2][1]*scale, boxVectors[0][0]*boxVectors[1][1]*scale);
float3 recipBoxVectorsFloat[3];
void* recipBoxVectorPointer[3];
if (cu.getUseDoublePrecision()) {
recipBoxVectorPointer[0] = &recipBoxVectors[0];
recipBoxVectorPointer[1] = &recipBoxVectors[1];
recipBoxVectorPointer[2] = &recipBoxVectors[2];
}
else {
recipBoxVectorsFloat[0] = make_float3((float) recipBoxVectors[0].x, 0, 0);
recipBoxVectorsFloat[1] = make_float3((float) recipBoxVectors[1].x, (float) recipBoxVectors[1].y, 0);
recipBoxVectorsFloat[2] = make_float3((float) recipBoxVectors[2].x, (float) recipBoxVectors[2].y, (float) recipBoxVectors[2].z);
recipBoxVectorPointer[0] = &recipBoxVectorsFloat[0];
recipBoxVectorPointer[1] = &recipBoxVectorsFloat[1];
recipBoxVectorPointer[2] = &recipBoxVectorsFloat[2];
}
// Execute the reciprocal space kernels.
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(*pmeAtomGridIndex);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0) {
......@@ -1772,11 +1899,15 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cufftExecR2C(fftForward, (float*) directPmeGrid->getDevicePointer(), (float2*) reciprocalPmeGrid->getDevicePointer());
if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms());
}
void* convolutionArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
void* convolutionArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision())
......@@ -1786,9 +1917,12 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2], &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
if (usePmeStream) {
cuEventRecord(pmeSyncEvent, pmeStream);
cu.restoreDefaultStream();
}
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (dispersionCoefficient != 0.0 && includeDirect) {
......@@ -2327,6 +2461,9 @@ double CudaCalcCustomNonbondedForceKernel::execute(ContextImpl& context, bool in
interactionGroupArgs.push_back(&interactionGroupData->getDevicePointer());
interactionGroupArgs.push_back(cu.getPeriodicBoxSizePointer());
interactionGroupArgs.push_back(cu.getInvPeriodicBoxSizePointer());
interactionGroupArgs.push_back(cu.getPeriodicBoxVecXPointer());
interactionGroupArgs.push_back(cu.getPeriodicBoxVecYPointer());
interactionGroupArgs.push_back(cu.getPeriodicBoxVecZPointer());
for (int i = 0; i < (int) params->getBuffers().size(); i++)
interactionGroupArgs.push_back(&params->getBuffers()[i].getMemory());
if (globals != NULL)
......@@ -2435,6 +2572,7 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
posq.upload(&temp[0]);
params->upload(paramsVector);
prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
surfaceAreaFactor = -6.0*4*M_PI*force.getSurfaceAreaEnergy();
bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
bool usePeriodic = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff && force.getNonbondedMethod() != GBSAOBCForce::CutoffNonPeriodic);
string source = CudaKernelSources::gbsaObc2;
......@@ -2461,6 +2599,7 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines["CUTOFF_SQUARED"] = cu.doubleToString(nb.getCutoffDistance()*nb.getCutoffDistance());
defines["CUTOFF"] = cu.doubleToString(nb.getCutoffDistance());
defines["PREFACTOR"] = cu.doubleToString(prefactor);
defines["SURFACE_AREA_FACTOR"] = cu.doubleToString(surfaceAreaFactor);
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
......@@ -2484,6 +2623,9 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
computeSumArgs.push_back(&nb.getInteractionCount().getDevicePointer());
computeSumArgs.push_back(cu.getPeriodicBoxSizePointer());
computeSumArgs.push_back(cu.getInvPeriodicBoxSizePointer());
computeSumArgs.push_back(cu.getPeriodicBoxVecXPointer());
computeSumArgs.push_back(cu.getPeriodicBoxVecYPointer());
computeSumArgs.push_back(cu.getPeriodicBoxVecZPointer());
computeSumArgs.push_back(&maxTiles);
computeSumArgs.push_back(&nb.getBlockCenters().getDevicePointer());
computeSumArgs.push_back(&nb.getBlockBoundingBoxes().getDevicePointer());
......@@ -2503,6 +2645,9 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
force1Args.push_back(&nb.getInteractionCount().getDevicePointer());
force1Args.push_back(cu.getPeriodicBoxSizePointer());
force1Args.push_back(cu.getInvPeriodicBoxSizePointer());
force1Args.push_back(cu.getPeriodicBoxVecXPointer());
force1Args.push_back(cu.getPeriodicBoxVecYPointer());
force1Args.push_back(cu.getPeriodicBoxVecZPointer());
force1Args.push_back(&maxTiles);
force1Args.push_back(&nb.getBlockCenters().getDevicePointer());
force1Args.push_back(&nb.getBlockBoundingBoxes().getDevicePointer());
......@@ -2519,8 +2664,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
maxTiles = nb.getInteractingTiles().getSize();
computeSumArgs[3] = &nb.getInteractingTiles().getDevicePointer();
force1Args[5] = &nb.getInteractingTiles().getDevicePointer();
computeSumArgs[10] = &nb.getInteractingAtoms().getDevicePointer();
force1Args[12] = &nb.getInteractingAtoms().getDevicePointer();
computeSumArgs[13] = &nb.getInteractingAtoms().getDevicePointer();
force1Args[15] = &nb.getInteractingAtoms().getDevicePointer();
}
}
cu.executeKernel(computeBornSumKernel, &computeSumArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
......@@ -2979,7 +3124,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
pairEnergyDefines["USE_PERIODIC"] = "1";
if (anyExclusions)
pairEnergyDefines["USE_EXCLUSIONS"] = "1";
if (atomParamSize%2 == 0 && !cu.getUseDoublePrecision())
if (atomParamSize%2 != 0 && !cu.getUseDoublePrecision())
pairEnergyDefines["NEED_PADDING"] = "1";
pairEnergyDefines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
pairEnergyDefines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
......@@ -3289,6 +3434,9 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairValueArgs.push_back(&nb.getInteractionCount().getDevicePointer());
pairValueArgs.push_back(cu.getPeriodicBoxSizePointer());
pairValueArgs.push_back(cu.getInvPeriodicBoxSizePointer());
pairValueArgs.push_back(cu.getPeriodicBoxVecXPointer());
pairValueArgs.push_back(cu.getPeriodicBoxVecYPointer());
pairValueArgs.push_back(cu.getPeriodicBoxVecZPointer());
pairValueArgs.push_back(&maxTiles);
pairValueArgs.push_back(&nb.getBlockCenters().getDevicePointer());
pairValueArgs.push_back(&nb.getBlockBoundingBoxes().getDevicePointer());
......@@ -3324,6 +3472,9 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairEnergyArgs.push_back(&nb.getInteractionCount().getDevicePointer());
pairEnergyArgs.push_back(cu.getPeriodicBoxSizePointer());
pairEnergyArgs.push_back(cu.getInvPeriodicBoxSizePointer());
pairEnergyArgs.push_back(cu.getPeriodicBoxVecXPointer());
pairEnergyArgs.push_back(cu.getPeriodicBoxVecYPointer());
pairEnergyArgs.push_back(cu.getPeriodicBoxVecZPointer());
pairEnergyArgs.push_back(&maxTiles);
pairEnergyArgs.push_back(&nb.getBlockCenters().getDevicePointer());
pairEnergyArgs.push_back(&nb.getBlockBoundingBoxes().getDevicePointer());
......@@ -3389,8 +3540,8 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
maxTiles = nb.getInteractingTiles().getSize();
pairValueArgs[4] = &nb.getInteractingTiles().getDevicePointer();
pairEnergyArgs[5] = &nb.getInteractingTiles().getDevicePointer();
pairValueArgs[11] = &nb.getInteractingAtoms().getDevicePointer();
pairEnergyArgs[12] = &nb.getInteractingAtoms().getDevicePointer();
pairValueArgs[14] = &nb.getInteractingAtoms().getDevicePointer();
pairEnergyArgs[15] = &nb.getInteractingAtoms().getDevicePointer();
}
}
cu.executeKernel(pairValueKernel, &pairValueArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
......@@ -4006,6 +4157,9 @@ double CudaCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includ
donorArgs.push_back(&acceptors->getDevicePointer());
donorArgs.push_back(cu.getPeriodicBoxSizePointer());
donorArgs.push_back(cu.getInvPeriodicBoxSizePointer());
donorArgs.push_back(cu.getPeriodicBoxVecXPointer());
donorArgs.push_back(cu.getPeriodicBoxVecYPointer());
donorArgs.push_back(cu.getPeriodicBoxVecZPointer());
if (globals != NULL)
donorArgs.push_back(&globals->getDevicePointer());
for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
......@@ -4027,6 +4181,9 @@ double CudaCalcCustomHbondForceKernel::execute(ContextImpl& context, bool includ
acceptorArgs.push_back(&acceptors->getDevicePointer());
acceptorArgs.push_back(cu.getPeriodicBoxSizePointer());
acceptorArgs.push_back(cu.getInvPeriodicBoxSizePointer());
acceptorArgs.push_back(cu.getPeriodicBoxVecXPointer());
acceptorArgs.push_back(cu.getPeriodicBoxVecYPointer());
acceptorArgs.push_back(cu.getPeriodicBoxVecZPointer());
if (globals != NULL)
acceptorArgs.push_back(&globals->getDevicePointer());
for (int i = 0; i < (int) donorParams->getBuffers().size(); i++) {
......@@ -4629,7 +4786,7 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
const vector<int>& atoms = iter->second;
string deltaName = atomNames[atoms[0]]+atomNames[atoms[1]];
if (computedDeltas.count(deltaName) == 0) {
compute<<"real4 delta"<<deltaName<<" = delta("<<posNames[atoms[0]]<<", "<<posNames[atoms[1]]<<", periodicBoxSize, invPeriodicBoxSize);\n";
compute<<"real4 delta"<<deltaName<<" = delta("<<posNames[atoms[0]]<<", "<<posNames[atoms[1]]<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
computedDeltas.insert(deltaName);
}
compute<<"real r_"<<deltaName<<" = sqrt(delta"<<deltaName<<".w);\n";
......@@ -4643,11 +4800,11 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
string deltaName2 = atomNames[atoms[1]]+atomNames[atoms[2]];
string angleName = "angle_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]];
if (computedDeltas.count(deltaName1) == 0) {
compute<<"real4 delta"<<deltaName1<<" = delta("<<posNames[atoms[1]]<<", "<<posNames[atoms[0]]<<", periodicBoxSize, invPeriodicBoxSize);\n";
compute<<"real4 delta"<<deltaName1<<" = delta("<<posNames[atoms[1]]<<", "<<posNames[atoms[0]]<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
computedDeltas.insert(deltaName1);
}
if (computedDeltas.count(deltaName2) == 0) {
compute<<"real4 delta"<<deltaName2<<" = delta("<<posNames[atoms[1]]<<", "<<posNames[atoms[2]]<<", periodicBoxSize, invPeriodicBoxSize);\n";
compute<<"real4 delta"<<deltaName2<<" = delta("<<posNames[atoms[1]]<<", "<<posNames[atoms[2]]<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
computedDeltas.insert(deltaName2);
}
compute<<"real "<<angleName<<" = computeAngle(delta"<<deltaName1<<", delta"<<deltaName2<<");\n";
......@@ -4664,15 +4821,15 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
string crossName2 = "cross_"+deltaName2+"_"+deltaName3;
string dihedralName = "dihedral_"+atomNames[atoms[0]]+atomNames[atoms[1]]+atomNames[atoms[2]]+atomNames[atoms[3]];
if (computedDeltas.count(deltaName1) == 0) {
compute<<"real4 delta"<<deltaName1<<" = delta("<<posNames[atoms[0]]<<", "<<posNames[atoms[1]]<<", periodicBoxSize, invPeriodicBoxSize);\n";
compute<<"real4 delta"<<deltaName1<<" = delta("<<posNames[atoms[0]]<<", "<<posNames[atoms[1]]<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
computedDeltas.insert(deltaName1);
}
if (computedDeltas.count(deltaName2) == 0) {
compute<<"real4 delta"<<deltaName2<<" = delta("<<posNames[atoms[2]]<<", "<<posNames[atoms[1]]<<", periodicBoxSize, invPeriodicBoxSize);\n";
compute<<"real4 delta"<<deltaName2<<" = delta("<<posNames[atoms[2]]<<", "<<posNames[atoms[1]]<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
computedDeltas.insert(deltaName2);
}
if (computedDeltas.count(deltaName3) == 0) {
compute<<"real4 delta"<<deltaName3<<" = delta("<<posNames[atoms[2]]<<", "<<posNames[atoms[3]]<<", periodicBoxSize, invPeriodicBoxSize);\n";
compute<<"real4 delta"<<deltaName3<<" = delta("<<posNames[atoms[2]]<<", "<<posNames[atoms[3]]<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);\n";
computedDeltas.insert(deltaName3);
}
compute<<"real4 "<<crossName1<<" = computeCross(delta"<<deltaName1<<", delta"<<deltaName2<<");\n";
......@@ -4842,7 +4999,7 @@ void CudaCalcCustomManyParticleForceKernel::initialize(const System& system, con
if (!centralParticleMode) {
for (int i = 1; i < particlesPerSet; i++) {
for (int j = i+1; j < particlesPerSet; j++)
verifyCutoff<<"includeInteraction &= (delta(pos"<<(i+1)<<", pos"<<(j+1)<<", periodicBoxSize, invPeriodicBoxSize).w < CUTOFF_SQUARED);\n";
verifyCutoff<<"includeInteraction &= (delta(pos"<<(i+1)<<", pos"<<(j+1)<<", periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ).w < CUTOFF_SQUARED);\n";
}
}
}
......@@ -4920,6 +5077,9 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
forceArgs.push_back(&cu.getPosq().getDevicePointer());
forceArgs.push_back(cu.getPeriodicBoxSizePointer());
forceArgs.push_back(cu.getInvPeriodicBoxSizePointer());
forceArgs.push_back(cu.getPeriodicBoxVecXPointer());
forceArgs.push_back(cu.getPeriodicBoxVecYPointer());
forceArgs.push_back(cu.getPeriodicBoxVecZPointer());
if (nonbondedMethod != NoCutoff) {
forceArgs.push_back(&neighbors->getDevicePointer());
forceArgs.push_back(&neighborStartIndex->getDevicePointer());
......@@ -4945,6 +5105,9 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
blockBoundsArgs.push_back(cu.getPeriodicBoxSizePointer());
blockBoundsArgs.push_back(cu.getInvPeriodicBoxSizePointer());
blockBoundsArgs.push_back(cu.getPeriodicBoxVecXPointer());
blockBoundsArgs.push_back(cu.getPeriodicBoxVecYPointer());
blockBoundsArgs.push_back(cu.getPeriodicBoxVecZPointer());
blockBoundsArgs.push_back(&cu.getPosq().getDevicePointer());
blockBoundsArgs.push_back(&blockCenter->getDevicePointer());
blockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
......@@ -4954,6 +5117,9 @@ double CudaCalcCustomManyParticleForceKernel::execute(ContextImpl& context, bool
neighborsArgs.push_back(cu.getPeriodicBoxSizePointer());
neighborsArgs.push_back(cu.getInvPeriodicBoxSizePointer());
neighborsArgs.push_back(cu.getPeriodicBoxVecXPointer());
neighborsArgs.push_back(cu.getPeriodicBoxVecYPointer());
neighborsArgs.push_back(cu.getPeriodicBoxVecZPointer());
neighborsArgs.push_back(&cu.getPosq().getDevicePointer());
neighborsArgs.push_back(&blockCenter->getDevicePointer());
neighborsArgs.push_back(&blockBoundingBox->getDevicePointer());
......@@ -5908,7 +6074,11 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
uniformRandoms = CudaArray::create<float4>(cu, maxUniformRandoms, "uniformRandoms");
randomSeed = CudaArray::create<int4>(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
vector<int4> seed(randomSeed->getSize());
unsigned int r = integrator.getRandomNumberSeed()+1;
int rseed = integrator.getRandomNumberSeed();
// A random seed of 0 means use a unique one
if (rseed == 0)
rseed = osrngseed();
unsigned int r = (unsigned int) (rseed+1);
for (int i = 0; i < randomSeed->getSize(); i++) {
seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
......@@ -6358,6 +6528,7 @@ void CudaApplyMonteCarloBarostatKernel::scaleCoordinates(ContextImpl& context, d
float scalefY = (float) scaleY;
float scalefZ = (float) scaleZ;
void* args[] = {&scalefX, &scalefY, &scalefZ, &numMolecules, cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(),
cu.getPeriodicBoxVecXPointer(), cu.getPeriodicBoxVecYPointer(), cu.getPeriodicBoxVecZPointer(),
&cu.getPosq().getDevicePointer(), &moleculeAtoms->getDevicePointer(), &moleculeStartIndex->getDevicePointer()};
cu.executeKernel(kernel, args, cu.getNumAtoms());
for (int i = 0; i < (int) cu.getPosCellOffsets().size(); i++)
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -62,10 +62,10 @@ private:
bool useDouble;
};
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), usePadding(true),
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0) {
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0), forceRebuildNeighborList(true) {
// Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities";
......@@ -264,14 +264,6 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortedBlockCenter = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions");
if (context.getUseDoublePrecision()) {
vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
oldPositions->upload(oldPositionsVec);
}
else {
vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
oldPositions->upload(oldPositionsVec);
}
rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<unsigned int> count(1, 0);
......@@ -304,6 +296,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findBlockBoundsArgs.push_back(&numAtoms);
findBlockBoundsArgs.push_back(context.getPeriodicBoxSizePointer());
findBlockBoundsArgs.push_back(context.getInvPeriodicBoxSizePointer());
findBlockBoundsArgs.push_back(context.getPeriodicBoxVecXPointer());
findBlockBoundsArgs.push_back(context.getPeriodicBoxVecYPointer());
findBlockBoundsArgs.push_back(context.getPeriodicBoxVecZPointer());
findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
findBlockBoundsArgs.push_back(&blockCenter->getDevicePointer());
findBlockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
......@@ -319,9 +314,13 @@ void CudaNonbondedUtilities::initialize(const System& system) {
sortBoxDataArgs.push_back(&oldPositions->getDevicePointer());
sortBoxDataArgs.push_back(&interactionCount->getDevicePointer());
sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer());
sortBoxDataArgs.push_back(&forceRebuildNeighborList);
findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecXPointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecYPointer());
findInteractingBlocksArgs.push_back(context.getPeriodicBoxVecZPointer());
findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer());
......@@ -342,6 +341,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
void CudaNonbondedUtilities::prepareInteractions() {
if (!useCutoff)
return;
if (numTiles == 0)
return;
if (usePeriodic) {
double4 box = context.getPeriodicBoxSize();
double minAllowedSize = 1.999999*cutoff;
......@@ -355,6 +356,7 @@ void CudaNonbondedUtilities::prepareInteractions() {
blockSorter->sort(*sortedBlocks);
context.executeKernel(sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
forceRebuildNeighborList = false;
}
void CudaNonbondedUtilities::computeInteractions() {
......@@ -388,18 +390,11 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
if (forceArgs.size() > 0)
forceArgs[7] = &interactingTiles->getDevicePointer();
findInteractingBlocksArgs[3] = &interactingTiles->getDevicePointer();
findInteractingBlocksArgs[6] = &interactingTiles->getDevicePointer();
if (forceArgs.size() > 0)
forceArgs[14] = &interactingAtoms->getDevicePointer();
findInteractingBlocksArgs[4] = &interactingAtoms->getDevicePointer();
if (context.getUseDoublePrecision()) {
vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
oldPositions->upload(oldPositionsVec);
}
else {
vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
oldPositions->upload(oldPositionsVec);
}
forceArgs[17] = &interactingAtoms->getDevicePointer();
findInteractingBlocksArgs[7] = &interactingAtoms->getDevicePointer();
forceRebuildNeighborList = true;
}
void CudaNonbondedUtilities::setUsePadding(bool padding) {
......@@ -411,8 +406,9 @@ void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endF
startBlockIndex = (int) (startFraction*numAtomBlocks);
numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
startTileIndex = (int) (startFraction*totalTiles);;
startTileIndex = (int) (startFraction*totalTiles);
numTiles = (int) (endFraction*totalTiles)-startTileIndex;
forceRebuildNeighborList = true;
}
CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
......@@ -625,6 +621,9 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
forceArgs.push_back(&interactionCount->getDevicePointer());
forceArgs.push_back(context.getPeriodicBoxSizePointer());
forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
forceArgs.push_back(context.getPeriodicBoxVecXPointer());
forceArgs.push_back(context.getPeriodicBoxVecYPointer());
forceArgs.push_back(context.getPeriodicBoxVecZPointer());
forceArgs.push_back(&maxTiles);
forceArgs.push_back(&blockCenter->getDevicePointer());
forceArgs.push_back(&blockBoundingBox->getDevicePointer());
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Portions copyright (c) 2011-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -63,23 +63,21 @@ if (result != CUDA_SUCCESS) { \
class CudaParallelCalcForcesAndEnergyKernel::BeginComputationTask : public CudaContext::WorkTask {
public:
BeginComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory) : context(context), cu(cu), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory) {
bool includeForce, bool includeEnergy, int groups, void* pinnedMemory, CUevent event, int& numTiles) : context(context), cu(cu), kernel(kernel),
includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), pinnedMemory(pinnedMemory), event(event), numTiles(numTiles) {
}
void execute() {
// Copy coordinates over to this device and execute the kernel.
cu.setAsCurrent();
if (cu.getContextIndex() > 0) {
if (cu.getPlatformData().peerAccessSupported && cu.getPlatformData().contexts.size() < 3) {
CudaContext& context0 = *cu.getPlatformData().contexts[0];
int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
CHECK_RESULT(cuMemcpyPeerAsync(cu.getPosq().getDevicePointer(), cu.getContext(), context0.getPosq().getDevicePointer(), context0.getContext(), numBytes, 0), "Error copying positions");
}
else
cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
if (!cu.getPlatformData().peerAccessSupported)
cu.getPosq().upload(pinnedMemory, false);
}
kernel.beginComputation(context, includeForce, includeEnergy, groups);
if (cu.getNonbondedUtilities().getUsePeriodic())
cu.getNonbondedUtilities().getInteractionCount().download(&numTiles, false);
}
private:
ContextImpl& context;
......@@ -88,19 +86,27 @@ private:
bool includeForce, includeEnergy;
int groups;
void* pinnedMemory;
CUevent event;
int& numTiles;
};
class CudaParallelCalcForcesAndEnergyKernel::FinishComputationTask : public CudaContext::WorkTask {
public:
FinishComputationTask(ContextImpl& context, CudaContext& cu, CudaCalcForcesAndEnergyKernel& kernel,
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces) :
bool includeForce, bool includeEnergy, int groups, double& energy, long long& completionTime, long long* pinnedMemory, CudaArray& contextForces, bool& valid, int& numTiles) :
context(context), cu(cu), kernel(kernel), includeForce(includeForce), includeEnergy(includeEnergy), groups(groups), energy(energy),
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces) {
completionTime(completionTime), pinnedMemory(pinnedMemory), contextForces(contextForces), valid(valid), numTiles(numTiles) {
}
void execute() {
// Execute the kernel, then download forces.
energy += kernel.finishComputation(context, includeForce, includeEnergy, groups);
energy += kernel.finishComputation(context, includeForce, includeEnergy, groups, valid);
if (cu.getComputeForceCount() < 200) {
// Record timing information for load balancing. Since this takes time, only do it at the start of the simulation.
CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
completionTime = getTime();
}
if (includeForce) {
if (cu.getContextIndex() > 0) {
int numAtoms = cu.getPaddedNumAtoms();
......@@ -108,16 +114,16 @@ public:
int numBytes = numAtoms*3*sizeof(long long);
int offset = (cu.getContextIndex()-1)*numBytes;
CudaContext& context0 = *cu.getPlatformData().contexts[0];
CHECK_RESULT(cuMemcpyPeer(contextForces.getDevicePointer()+offset, context0.getContext(), cu.getForce().getDevicePointer(), cu.getContext(), numBytes), "Error copying forces");
CHECK_RESULT(cuMemcpy(contextForces.getDevicePointer()+offset, cu.getForce().getDevicePointer(), numBytes), "Error copying forces");
}
else
cu.getForce().download(&pinnedMemory[(cu.getContextIndex()-1)*numAtoms*3]);
}
else {
CHECK_RESULT(cuCtxSynchronize(), "Error synchronizing CUDA context");
}
if (cu.getNonbondedUtilities().getUsePeriodic() && numTiles > cu.getNonbondedUtilities().getInteractingTiles().getSize()) {
valid = false;
cu.getNonbondedUtilities().updateNeighborListSize();
}
completionTime = getTime();
}
private:
ContextImpl& context;
......@@ -129,11 +135,13 @@ private:
long long& completionTime;
long long* pinnedMemory;
CudaArray& contextForces;
bool& valid;
int& numTiles;
};
CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), contextForces(NULL),
pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()),
tileCounts(NULL), contextForces(NULL), pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
}
......@@ -146,6 +154,10 @@ CudaParallelCalcForcesAndEnergyKernel::~CudaParallelCalcForcesAndEnergyKernel()
cuMemFreeHost(pinnedPositionBuffer);
if (pinnedForceBuffer != NULL)
cuMemFreeHost(pinnedForceBuffer);
cuEventDestroy(event);
cuStreamDestroy(peerCopyStream);
if (tileCounts != NULL)
cuMemFreeHost(tileCounts);
}
void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
......@@ -153,10 +165,14 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
cu.setAsCurrent();
CUmodule module = cu.createModule(CudaKernelSources::parallel);
sumKernel = cu.getKernel(module, "sumForces");
for (int i = 0; i < (int) kernels.size(); i++)
int numContexts = data.contexts.size();
for (int i = 0; i < numContexts; i++)
getKernel(i).initialize(system);
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
for (int i = 0; i < numContexts; i++)
contextNonbondedFractions[i] = 1/(double) numContexts;
CHECK_RESULT(cuEventCreate(&event, 0), "Error creating event");
CHECK_RESULT(cuStreamCreate(&peerCopyStream, CU_STREAM_NON_BLOCKING), "Error creating stream");
CHECK_RESULT(cuMemHostAlloc((void**) &tileCounts, numContexts*sizeof(int), 0), "Error creating tile count buffer");
}
void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
......@@ -170,27 +186,37 @@ void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& contex
// Copy coordinates over to each device and execute the kernel.
if (!(cu.getPlatformData().peerAccessSupported && cu.getPlatformData().contexts.size() < 3))
cu.getPosq().download(pinnedPositionBuffer);
if (!cu.getPlatformData().peerAccessSupported) {
cu.getPosq().download(pinnedPositionBuffer, false);
cuEventRecord(event, cu.getCurrentStream());
}
else {
int numBytes = cu.getPosq().getSize()*cu.getPosq().getElementSize();
cuEventRecord(event, cu.getCurrentStream());
cuStreamWaitEvent(peerCopyStream, event, 0);
for (int i = 1; i < (int) data.contexts.size(); i++)
CHECK_RESULT(cuMemcpyAsync(data.contexts[i]->getPosq().getDevicePointer(), cu.getPosq().getDevicePointer(), numBytes, peerCopyStream), "Error copying positions");
cuEventRecord(event, peerCopyStream);
}
for (int i = 0; i < (int) data.contexts.size(); i++) {
data.contextEnergy[i] = 0.0;
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer));
thread.addTask(new BeginComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, pinnedPositionBuffer, event, tileCounts[i]));
}
}
double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups, bool& valid) {
for (int i = 0; i < (int) data.contexts.size(); i++) {
CudaContext& cu = *data.contexts[i];
CudaContext::WorkThread& thread = cu.getWorkThread();
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces));
thread.addTask(new FinishComputationTask(context, cu, getKernel(i), includeForce, includeEnergy, groups, data.contextEnergy[i], completionTimes[i], pinnedForceBuffer, *contextForces, valid, tileCounts[i]));
}
data.syncContexts();
double energy = 0.0;
for (int i = 0; i < (int) data.contextEnergy.size(); i++)
energy += data.contextEnergy[i];
if (includeForce) {
if (includeForce && valid) {
// Sum the forces from all devices.
CudaContext& cu = *data.contexts[0];
......@@ -204,6 +230,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
if (cu.getComputeForceCount() < 200) {
int firstIndex = 0, lastIndex = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex])
......@@ -211,7 +238,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
if (completionTimes[i] > completionTimes[lastIndex])
lastIndex = i;
}
double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
double fractionToTransfer = min(0.01, contextNonbondedFractions[lastIndex]);
contextNonbondedFractions[firstIndex] += fractionToTransfer;
contextNonbondedFractions[lastIndex] -= fractionToTransfer;
double startFraction = 0.0;
......@@ -223,6 +250,7 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
startFraction = endFraction;
}
}
}
return energy;
}
......@@ -508,6 +536,11 @@ double CudaParallelCalcCMAPTorsionForceKernel::execute(ContextImpl& context, boo
return 0.0;
}
void CudaParallelCalcCMAPTorsionForceKernel::copyParametersToContext(ContextImpl& context, const CMAPTorsionForce& force) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).copyParametersToContext(context, force);
}
class CudaParallelCalcCustomTorsionForceKernel::Task : public CudaContext::WorkTask {
public:
Task(ContextImpl& context, CudaCalcCustomTorsionForceKernel& kernel, bool includeForce,
......
......@@ -229,22 +229,13 @@ CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& sys
// Determine whether peer-to-peer copying is supported, and enable it if so.
peerAccessSupported = false; // Disable until I figure out why it usually makes things slower
// peerAccessSupported = true;
// for (int i = 1; i < contexts.size(); i++) {
// int canAccess;
// cuDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
// if (!canAccess) {
// peerAccessSupported = false;
// break;
// }
// }
if (peerAccessSupported) {
peerAccessSupported = true;
for (int i = 1; i < contexts.size(); i++) {
contexts[0]->setAsCurrent();
CHECK_RESULT(cuCtxEnablePeerAccess(contexts[i]->getContext(), 0), "Error enabling peer access");
contexts[i]->setAsCurrent();
CHECK_RESULT(cuCtxEnablePeerAccess(contexts[0]->getContext(), 0), "Error enabling peer access");
int canAccess;
cuDeviceCanAccessPeer(&canAccess, contexts[i]->getDevice(), contexts[0]->getDevice());
if (!canAccess) {
peerAccessSupported = false;
break;
}
}
}
......
......@@ -112,13 +112,12 @@ void CudaSort::sort(CudaArray& data) {
else {
// Compute the range of data values.
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer()};
unsigned int numBuckets = bucketOffset->getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets.
unsigned int numBuckets = bucketOffset->getSize();
context.clearBuffer(*bucketOffset);
void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(),
&bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());
......
{
#if USE_EWALD
bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
unsigned int includeInteraction = ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection);
const real alphaR = EWALD_ALPHA*r;
const real expAlphaRSqr = EXP(-alphaR*alphaR);
const real prefactor = 138.935456f*posq1.w*posq2.w*invR;
......@@ -44,16 +45,14 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
}
#endif
tempForce += prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += ljEnergy + prefactor*erfcAlphaR;
tempEnergy += includeInteraction ? ljEnergy + prefactor*erfcAlphaR : 0;
#else
tempForce = prefactor*(erfcAlphaR+alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += prefactor*erfcAlphaR;
tempEnergy += includeInteraction ? prefactor*erfcAlphaR : 0;
#endif
}
dEdR += tempForce*invR*invR;
}
dEdR += includeInteraction ? tempForce*invR*invR : 0;
#else
{
#ifdef USE_CUTOFF
unsigned int includeInteraction = (!isExcluded && r2 < CUTOFF_SQUARED);
#else
......@@ -91,5 +90,5 @@ if ((!isExcluded && r2 < CUTOFF_SQUARED) || needCorrection) {
#endif
#endif
dEdR += includeInteraction ? tempForce*invR*invR : 0;
}
#endif
}
......@@ -2,7 +2,7 @@
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
typedef struct {
real4 posq;
real3 pos;
real3 force;
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
......@@ -17,7 +17,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
#ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
unsigned int maxTiles, const real4* __restrict__ blockCenter, const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
#else
unsigned int numTiles
#endif
......@@ -40,7 +41,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real4 pos1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
......@@ -49,16 +50,14 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1;
localData[localAtomIndex].pos = make_real3(pos1.x, pos1.y, pos1.z);
LOAD_LOCAL_PARAMETERS_FROM_1
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......@@ -95,7 +94,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
localData[localAtomIndex].posq = posq[j];
real4 tempPosq = posq[j];
localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
......@@ -105,12 +105,10 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......@@ -231,7 +229,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// Load atom data for this tile.
real4 posq1 = posq[atom1];
real4 pos1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
......@@ -241,7 +239,8 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
#endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
localData[localAtomIndex].posq = posq[j];
real4 tempPosq = posq[j];
localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
......@@ -252,17 +251,13 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x].pos, blockCenterX)
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
......@@ -301,12 +296,10 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......
typedef struct {
real4 posq;
real value, temp;
real3 pos;
real value;
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
float padding;
......@@ -14,7 +14,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
const ushort2* __restrict__ exclusionTiles, unsigned long long* __restrict__ global_value,
#ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
unsigned int maxTiles, const real4* __restrict__ blockCenter, const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
#else
unsigned int numTiles
#endif
......@@ -35,7 +36,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
const unsigned int y = tileIndices.y;
real value = 0;
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real4 pos1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
......@@ -44,16 +45,14 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1;
localData[localAtomIndex].pos = make_real3(pos1.x, pos1.y, pos1.z);
LOAD_LOCAL_PARAMETERS_FROM_1
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......@@ -87,7 +86,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
localData[localAtomIndex].posq = posq[j];
real4 tempPosq = posq[j];
localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].value = 0;
#ifdef USE_EXCLUSIONS
......@@ -96,12 +96,10 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......@@ -207,7 +205,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
// Load atom data for this tile.
real4 posq1 = posq[atom1];
real4 pos1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
......@@ -217,7 +215,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
#endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
localData[localAtomIndex].posq = posq[j];
real4 tempPosq = posq[j];
localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].value = 0;
}
......@@ -227,17 +226,13 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x].pos, blockCenterX)
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
......@@ -263,12 +258,10 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......
......@@ -25,12 +25,10 @@ inline __device__ real4 delta(real4 vec1, real4 vec2) {
* Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude.
*/
inline __device__ real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
inline __device__ real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
#ifdef USE_PERIODIC
result.x -= floor(result.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
result.y -= floor(result.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
result.z -= floor(result.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(result)
#endif
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
......@@ -69,7 +67,8 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
* Compute forces on donors.
*/
extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize
const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
extern __shared__ real4 posBuffer[];
real energy = 0;
......@@ -116,7 +115,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
real4 a1 = posBuffer[3*index];
real4 a2 = posBuffer[3*index+1];
real4 a3 = posBuffer[3*index+2];
real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif
......@@ -157,7 +156,8 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
* Compute forces on acceptors.
*/
extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize
const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
extern __shared__ real4 posBuffer[];
real3 f1 = make_real3(0);
......@@ -203,7 +203,7 @@ extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict_
real4 d1 = posBuffer[3*index];
real4 d2 = posBuffer[3*index+1];
real4 d3 = posBuffer[3*index+2];
real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif
......
......@@ -18,12 +18,10 @@ inline __device__ real3 trim(real4 v) {
* Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude.
*/
inline __device__ real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
inline __device__ real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
#ifdef USE_PERIODIC
result.x -= floor(result.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
result.y -= floor(result.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
result.z -= floor(result.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(result)
#endif
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
......@@ -81,7 +79,7 @@ __constant__ float globals[NUM_GLOBALS];
*/
extern "C" __global__ void computeInteraction(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
real4 periodicBoxSize, real4 invPeriodicBoxSize
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#ifdef USE_CUTOFF
, const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
#endif
......@@ -144,16 +142,14 @@ extern "C" __global__ void computeInteraction(
/**
* Find a bounding box for the atoms in each block.
*/
extern "C" __global__ void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ numNeighborPairs) {
extern "C" __global__ void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ numNeighborPairs) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
int base = index*TILE_SIZE;
while (base < NUM_ATOMS) {
real4 pos = posq[base];
#ifdef USE_PERIODIC
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS(pos)
#endif
real4 minPos = pos;
real4 maxPos = pos;
......@@ -162,9 +158,7 @@ extern "C" __global__ void findBlockBounds(real4 periodicBoxSize, real4 invPerio
pos = posq[i];
#ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos);
pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
#endif
minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
......@@ -182,8 +176,8 @@ extern "C" __global__ void findBlockBounds(real4 periodicBoxSize, real4 invPerio
/**
* Find a list of neighbors for each atom.
*/
extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
const real4* __restrict__ blockCenter, const real4* __restrict__ blockBoundingBox, int2* __restrict__ neighborPairs,
extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
const real4* __restrict__ posq, const real4* __restrict__ blockCenter, const real4* __restrict__ blockBoundingBox, int2* __restrict__ neighborPairs,
int* __restrict__ numNeighborPairs, int* __restrict__ numNeighborsForAtom, int maxNeighborPairs
#ifdef USE_EXCLUSIONS
, int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
......@@ -216,9 +210,7 @@ extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodi
real4 blockSize2 = blockBoundingBox[block2];
real4 blockDelta = blockCenter1-blockCenter2;
#ifdef USE_PERIODIC
blockDelta.x -= floor(blockDelta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
blockDelta.y -= floor(blockDelta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
blockDelta.z -= floor(blockDelta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSize1.x-blockSize2.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSize1.y-blockSize2.y);
......@@ -247,7 +239,7 @@ extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodi
// Decide whether to include this atom pair in the neighbor list.
real4 atomDelta = delta(pos1, pos2, periodicBoxSize, invPeriodicBoxSize);
real4 atomDelta = delta(pos1, pos2, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CENTRAL_PARTICLE
bool includeAtom = (atom2 != atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
#else
......
......@@ -10,7 +10,7 @@ typedef struct {
extern "C" __global__ void computeInteractionGroups(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
......@@ -47,9 +47,7 @@ extern "C" __global__ void computeInteractionGroups(
posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......
......@@ -4,16 +4,15 @@
/**
* Find a bounding box for the atoms in each block.
*/
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList, real2* __restrict__ sortedBlocks) {
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList,
real2* __restrict__ sortedBlocks) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
int base = index*TILE_SIZE;
while (base < numAtoms) {
real4 pos = posq[base];
#ifdef USE_PERIODIC
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS(pos)
#endif
real4 minPos = pos;
real4 maxPos = pos;
......@@ -22,9 +21,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
pos = posq[i];
#ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos);
pos.x -= floor((pos.x-center.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos.y -= floor((pos.y-center.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos.z -= floor((pos.z-center.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
#endif
minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
......@@ -46,7 +43,7 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList) {
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList, bool forceRebuild) {
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index];
......@@ -55,7 +52,7 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
bool rebuild = false;
bool rebuild = forceRebuild;
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 delta = oldPositions[i]-posq[i];
if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
......@@ -116,11 +113,11 @@ extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, co
* [in] rebuildNeighbourList - whether or not to execute this kernel
*
*/
extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int* __restrict__ interactionCount,
int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startBlockIndex,
unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const real4* __restrict__ sortedBlockBoundingBox,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, real4* __restrict__ oldPositions,
const int* __restrict__ rebuildNeighborList) {
extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
unsigned int* __restrict__ interactionCount, int* __restrict__ interactingTiles, unsigned int* __restrict__ interactingAtoms, const real4* __restrict__ posq,
unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter,
const real4* __restrict__ sortedBlockBoundingBox, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
real4* __restrict__ oldPositions, const int* __restrict__ rebuildNeighborList) {
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
......@@ -157,9 +154,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
pos1.x -= floor((pos1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos1.y -= floor((pos1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos1.z -= floor((pos1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
}
#endif
posBuffer[threadIdx.x] = pos1;
......@@ -185,9 +180,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
real4 blockSizeY = (block2 < NUM_BLOCKS ? sortedBlockBoundingBox[block2] : make_real4(0));
real4 blockDelta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
blockDelta.x -= floor(blockDelta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
blockDelta.y -= floor(blockDelta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
blockDelta.z -= floor(blockDelta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSizeX.x-blockSizeY.x);
blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSizeX.y-blockSizeY.y);
......@@ -215,9 +208,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
real3 pos2 = trimTo3(posq[atom2]);
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
pos2.x -= floor((pos2.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos2.y -= floor((pos2.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos2.z -= floor((pos2.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos2, blockCenterX)
}
#endif
bool interacts = false;
......@@ -226,9 +217,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
if (!singlePeriodicCopy) {
for (int j = 0; j < TILE_SIZE; j++) {
real3 delta = pos2-posBuffer[warpStart+j];
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
}
......@@ -256,7 +245,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
if (indexInWarp == 0)
tileStartIndex = atomicAdd(interactionCount, tilesToStore);
int newTileStartIndex = tileStartIndex;
if (newTileStartIndex+tilesToStore < maxTiles) {
if (newTileStartIndex+tilesToStore <= maxTiles) {
if (indexInWarp < tilesToStore)
interactingTiles[newTileStartIndex+indexInWarp] = x;
for (int j = 0; j < tilesToStore; j++)
......@@ -275,7 +264,7 @@ extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, rea
if (indexInWarp == 0)
tileStartIndex = atomicAdd(interactionCount, tilesToStore);
int newTileStartIndex = tileStartIndex;
if (newTileStartIndex+tilesToStore < maxTiles) {
if (newTileStartIndex+tilesToStore <= maxTiles) {
if (indexInWarp < tilesToStore)
interactingTiles[newTileStartIndex+indexInWarp] = x;
for (int j = 0; j < tilesToStore; j++)
......
#define DIELECTRIC_OFFSET 0.009f
#define PROBE_RADIUS 0.14f
#define SURFACE_AREA_FACTOR -170.351730667551f //-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
/**
......@@ -70,7 +69,8 @@ typedef struct {
extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ global_bornSum, const real4* __restrict__ posq, const float2* __restrict__ global_params,
#ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
unsigned int maxTiles, const real4* __restrict__ blockCenter, const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms,
#else
unsigned int numTiles,
#endif
......@@ -105,9 +105,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......@@ -152,9 +150,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......@@ -292,12 +288,8 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].x -= floor((localData[threadIdx.x].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].y -= floor((localData[threadIdx.x].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].z -= floor((localData[threadIdx.x].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
......@@ -343,9 +335,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[tbx+tj];
......@@ -415,7 +405,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii,
#ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
unsigned int maxTiles, const real4* __restrict__ blockCenter, const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms,
#else
unsigned int numTiles,
#endif
......@@ -452,9 +443,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real4 posq2 = make_real4(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......@@ -509,9 +498,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......@@ -657,12 +644,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].x -= floor((localData[threadIdx.x].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].y -= floor((localData[threadIdx.x].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].z -= floor((localData[threadIdx.x].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj];
......@@ -714,9 +697,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
......
......@@ -90,7 +90,7 @@ extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAto
while (index < numAtoms) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass;
index += blockDim.x*gridDim.x;
}
error[threadIdx.x] = err;
......
......@@ -2,7 +2,8 @@
* Scale the particle positions with each axis independent
*/
extern "C" __global__ void scalePositions(float scaleX, float scaleY, float scaleZ, int numMolecules, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4* __restrict__ posq,
extern "C" __global__ void scalePositions(float scaleX, float scaleY, float scaleZ, int numMolecules, real4 periodicBoxSize,
real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, real4* __restrict__ posq,
const int* __restrict__ moleculeAtoms, const int* __restrict__ moleculeStartIndex) {
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numMolecules; index += blockDim.x*gridDim.x) {
int first = moleculeStartIndex[index];
......@@ -25,13 +26,9 @@ extern "C" __global__ void scalePositions(float scaleX, float scaleY, float scal
// Move it into the first periodic box.
int xcell = (int) floor(center.x*invPeriodicBoxSize.x);
int ycell = (int) floor(center.y*invPeriodicBoxSize.y);
int zcell = (int) floor(center.z*invPeriodicBoxSize.z);
real3 delta = make_real3(xcell*periodicBoxSize.x, ycell*periodicBoxSize.y, zcell*periodicBoxSize.z);
center.x -= delta.x;
center.y -= delta.y;
center.z -= delta.z;
real3 oldCenter = center;
APPLY_PERIODIC_TO_POS(center)
real3 delta = make_real3(oldCenter.x-center.x, oldCenter.y-center.y, oldCenter.z-center.z);
// Now scale the position of the molecule center.
......
......@@ -103,8 +103,9 @@ extern "C" __global__ void computeNonbonded(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF
, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
unsigned int maxTiles, const real4* __restrict__ blockCenter, const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount,real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
#endif
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
......@@ -155,9 +156,7 @@ extern "C" __global__ void computeNonbonded(
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
......@@ -223,14 +222,9 @@ extern "C" __global__ void computeNonbonded(
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
......@@ -276,9 +270,6 @@ extern "C" __global__ void computeNonbonded(
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
......@@ -404,22 +395,25 @@ extern "C" __global__ void computeNonbonded(
#endif
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
else {
#ifdef ENABLE_SHUFFLE
shflPosq = make_real4(0, 0, 0, 0);
#else
localData[threadIdx.x].x = 0;
localData[threadIdx.x].y = 0;
localData[threadIdx.x].z = 0;
#endif
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
#ifdef ENABLE_SHUFFLE
shflPosq.x -= floor((shflPosq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
shflPosq.y -= floor((shflPosq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
shflPosq.z -= floor((shflPosq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(shflPosq, blockCenterX)
#else
localData[threadIdx.x].x -= floor((localData[threadIdx.x].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].y -= floor((localData[threadIdx.x].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].z -= floor((localData[threadIdx.x].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
#endif
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
......@@ -431,7 +425,6 @@ extern "C" __global__ void computeNonbonded(
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
......@@ -477,7 +470,6 @@ extern "C" __global__ void computeNonbonded(
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC
}
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
......@@ -498,14 +490,9 @@ extern "C" __global__ void computeNonbonded(
#endif
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
......@@ -551,9 +538,6 @@ extern "C" __global__ void computeNonbonded(
localData[tbx+tj].fz += dEdR2.z;
#endif
#endif // end USE_SYMMETRIC
#ifdef USE_CUTOFF
}
#endif
#ifdef ENABLE_SHUFFLE
SHUFFLE_WARP_DATA
#endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment