Commit 2d2f05ce authored by Andy Simmonett's avatar Andy Simmonett
Browse files

Merge branch 'master' of github.com:pandegroup/openmm into genpt

parents 94823d84 4d32047c
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#include "CpuNonbondedForce.h" #include "CpuNonbondedForce.h"
#include "ReferenceForce.h" #include "ReferenceForce.h"
#include "ReferencePME.h" #include "ReferencePME.h"
#include "gmx_atomic.h" #include "openmm/internal/gmx_atomic.h"
#include <algorithm> #include <algorithm>
// In case we're using some primitive version of Visual Studio this will // In case we're using some primitive version of Visual Studio this will
...@@ -322,6 +322,14 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const ...@@ -322,6 +322,14 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
threads.execute(task); threads.execute(task);
threads.waitForThreads(); threads.waitForThreads();
// Signal the threads to subtract the exclusions.
if (ewald || pme) {
gmx_atomic_set(&counter, 0);
threads.resumeThreads();
threads.waitForThreads();
}
// Combine the energies from all the threads. // Combine the energies from all the threads.
if (totalEnergy != NULL) { if (totalEnergy != NULL) {
...@@ -354,28 +362,37 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex ...@@ -354,28 +362,37 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum. // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
for (int i = threadIndex; i < numberOfAtoms; i += numThreads) { threads.syncThreads();
fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f); const int groupSize = max(1, numberOfAtoms/(10*numThreads));
for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) { while (true) {
if (*iter > i) { int start = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), groupSize);
int j = *iter; if (start >= numberOfAtoms)
fvec4 deltaR; break;
fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f); int end = min(start+groupSize, numberOfAtoms);
float r2; for (int i = start; i < end; i++) {
getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize); fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
float r = sqrtf(r2); float scaledChargeI = (float) (ONE_4PI_EPS0*posq[4*i+3]);
float inverseR = 1/r; for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3]; if (*iter > i) {
float alphaR = alphaEwald*r; int j = *iter;
float erfAlphaR = erf(alphaR); fvec4 deltaR;
if (erfAlphaR > 1e-6f) { fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR); float r2;
dEdR = (float) (dEdR * (erfAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR))); getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
fvec4 result = deltaR*dEdR; float r = sqrtf(r2);
(fvec4(forces+4*i)-result).store(forces+4*i); float alphaR = alphaEwald*r;
(fvec4(forces+4*j)+result).store(forces+4*j); float erfAlphaR = erf(alphaR);
if (includeEnergy) if (erfAlphaR > 1e-6f) {
threadEnergy[threadIndex] -= chargeProd*inverseR*erfAlphaR; float inverseR = 1/r;
float chargeProdOverR = scaledChargeI*posq[4*j+3]*inverseR;
float dEdR = chargeProdOverR*inverseR*inverseR;
dEdR = dEdR * (erfAlphaR-(float)TWO_OVER_SQRT_PI*alphaR*(float)exp(-alphaR*alphaR));
fvec4 result = deltaR*dEdR;
(fvec4(forces+4*i)-result).store(forces+4*i);
(fvec4(forces+4*j)+result).store(forces+4*j);
if (includeEnergy)
threadEnergy[threadIndex] -= chargeProdOverR*erfAlphaR;
}
} }
} }
} }
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2013 Stanford University and the Authors. * * Portions copyright (c) 2013-2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "CpuSETTLE.h" #include "CpuSETTLE.h"
#include "openmm/internal/gmx_atomic.h"
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
...@@ -39,10 +40,14 @@ public: ...@@ -39,10 +40,14 @@ public:
ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses, ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP), RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) { inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
gmx_atomic_set(&atomicCounter, 0);
} }
void execute(ThreadPool& threads, int threadIndex) { void execute(ThreadPool& threads, int threadIndex) {
if (threadIndex < threadSettle.size()) { while (true) {
threadSettle[threadIndex]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance); int index = gmx_atomic_fetch_add(&atomicCounter, 1);
if (index >= threadSettle.size())
break;
threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
} }
} }
vector<OpenMM::RealVec>& atomCoordinates; vector<OpenMM::RealVec>& atomCoordinates;
...@@ -50,6 +55,7 @@ public: ...@@ -50,6 +55,7 @@ public:
vector<RealOpenMM>& inverseMasses; vector<RealOpenMM>& inverseMasses;
RealOpenMM tolerance; RealOpenMM tolerance;
vector<ReferenceSETTLEAlgorithm*>& threadSettle; vector<ReferenceSETTLEAlgorithm*>& threadSettle;
gmx_atomic_t atomicCounter;
}; };
class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task { class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
...@@ -57,10 +63,14 @@ public: ...@@ -57,10 +63,14 @@ public:
ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses, ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities), RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) { inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
gmx_atomic_set(&atomicCounter, 0);
} }
void execute(ThreadPool& threads, int threadIndex) { void execute(ThreadPool& threads, int threadIndex) {
if (threadIndex < threadSettle.size()) { while (true) {
threadSettle[threadIndex]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance); int index = gmx_atomic_fetch_add(&atomicCounter, 1);
if (index >= threadSettle.size())
break;
threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
} }
} }
vector<OpenMM::RealVec>& atomCoordinates; vector<OpenMM::RealVec>& atomCoordinates;
...@@ -68,17 +78,18 @@ public: ...@@ -68,17 +78,18 @@ public:
vector<RealOpenMM>& inverseMasses; vector<RealOpenMM>& inverseMasses;
RealOpenMM tolerance; RealOpenMM tolerance;
vector<ReferenceSETTLEAlgorithm*>& threadSettle; vector<ReferenceSETTLEAlgorithm*>& threadSettle;
gmx_atomic_t atomicCounter;
}; };
CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) { CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
int numThreads = threads.getNumThreads(); int numBlocks = 10*threads.getNumThreads();
int numClusters = settle.getNumClusters(); int numClusters = settle.getNumClusters();
vector<RealOpenMM> mass(system.getNumParticles()); vector<RealOpenMM> mass(system.getNumParticles());
for (int i = 0; i < system.getNumParticles(); i++) for (int i = 0; i < system.getNumParticles(); i++)
mass[i] = system.getParticleMass(i); mass[i] = system.getParticleMass(i);
for (int i = 0; i < numThreads; i++) { for (int i = 0; i < numBlocks; i++) {
int start = i*numClusters/numThreads; int start = i*numClusters/numBlocks;
int end = (i+1)*numClusters/numThreads; int end = (i+1)*numClusters/numBlocks;
if (start != end) { if (start != end) {
int numThreadClusters = end-start; int numThreadClusters = end-start;
vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters); vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters);
......
#ifndef OPENMM_GBVIFORCE_PROXY_H_
#define OPENMM_GBVIFORCE_PROXY_H_
/* -------------------------------------------------------------------------- * /* -------------------------------------------------------------------------- *
* OpenMM * * OpenMM *
* -------------------------------------------------------------------------- * * -------------------------------------------------------------------------- *
...@@ -9,7 +6,7 @@ ...@@ -9,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2010 Stanford University and the Authors. * * Portions copyright (c) 2015 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -32,22 +29,8 @@ ...@@ -32,22 +29,8 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. * * USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "openmm/internal/windowsExport.h" #include "CpuTests.h"
#include "openmm/serialization/SerializationProxy.h" #include "TestCompoundIntegrator.h"
namespace OpenMM {
/**
* This is a proxy for serializing GBVIForce objects.
*/
class OPENMM_EXPORT GBVIForceProxy : public SerializationProxy {
public:
GBVIForceProxy();
void serialize(const void* object, SerializationNode& node) const;
void* deserialize(const SerializationNode& node) const;
};
} // namespace OpenMM
#endif /*OPENMM_GBVIFORCE_PROXY_H_*/ void runPlatformTests() {
}
...@@ -13,9 +13,9 @@ ...@@ -13,9 +13,9 @@
#---------------------------------------------------- #----------------------------------------------------
set(OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases") set(OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases")
if(OPENMM_BUILD_CUDA_TESTS) if(BUILD_TESTING AND OPENMM_BUILD_CUDA_TESTS)
SUBDIRS (tests) SUBDIRS (tests)
endif(OPENMM_BUILD_CUDA_TESTS) endif(BUILD_TESTING AND OPENMM_BUILD_CUDA_TESTS)
# The source is organized into subdirectories, but we handle them all from # The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS. # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include <map> #include <map>
#include <queue> #include <queue>
#include <string> #include <string>
#include <utility>
#define __CL_ENABLE_EXCEPTIONS #define __CL_ENABLE_EXCEPTIONS
#ifdef _MSC_VER #ifdef _MSC_VER
// Prevent Windows from defining macros that interfere with other code. // Prevent Windows from defining macros that interfere with other code.
...@@ -538,6 +539,11 @@ public: ...@@ -538,6 +539,11 @@ public:
*/ */
void invalidateMolecules(); void invalidateMolecules();
private: private:
/**
* Compute a sorted list of device indices in decreasing order of desirability
*/
std::vector<int> getDevicePrecedence();
struct Molecule; struct Molecule;
struct MoleculeGroup; struct MoleculeGroup;
class VirtualSiteInfo; class VirtualSiteInfo;
......
...@@ -62,6 +62,14 @@ public: ...@@ -62,6 +62,14 @@ public:
CudaArray& getStepSize() { CudaArray& getStepSize() {
return *stepSize; return *stepSize;
} }
/**
* Set the size to use for the next step.
*/
void setNextStepSize(double size);
/**
* Get the size that was used for the last step.
*/
double getLastStepSize();
/** /**
* Apply constraints to the atom positions. * Apply constraints to the atom positions.
* *
...@@ -154,6 +162,7 @@ private: ...@@ -154,6 +162,7 @@ private:
CudaArray* vsiteLocalCoordsParams; CudaArray* vsiteLocalCoordsParams;
int randomPos; int randomPos;
int lastSeed, numVsites; int lastSeed, numVsites;
double2 lastStepSize;
struct ShakeCluster; struct ShakeCluster;
struct ConstraintOrderer; struct ConstraintOrderer;
}; };
......
...@@ -592,7 +592,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel { ...@@ -592,7 +592,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public: public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform), CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL), cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL), pmeio(NULL) { pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
} }
~CudaCalcNonbondedForceKernel(); ~CudaCalcNonbondedForceKernel();
/** /**
...@@ -657,6 +657,7 @@ private: ...@@ -657,6 +657,7 @@ private:
CudaArray* pmeBsplineModuliZ; CudaArray* pmeBsplineModuliZ;
CudaArray* pmeAtomRange; CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex; CudaArray* pmeAtomGridIndex;
CudaArray* pmeEnergyBuffer;
CudaSort* sort; CudaSort* sort;
Kernel cpuPme; Kernel cpuPme;
PmeIO* pmeio; PmeIO* pmeio;
...@@ -1123,7 +1124,6 @@ public: ...@@ -1123,7 +1124,6 @@ public:
double computeKineticEnergy(ContextImpl& context, const VerletIntegrator& integrator); double computeKineticEnergy(ContextImpl& context, const VerletIntegrator& integrator);
private: private:
CudaContext& cu; CudaContext& cu;
double prevStepSize;
CUfunction kernel1, kernel2; CUfunction kernel1, kernel2;
}; };
...@@ -1354,7 +1354,7 @@ private: ...@@ -1354,7 +1354,7 @@ private:
void recordChangedParameters(ContextImpl& context); void recordChangedParameters(ContextImpl& context);
bool evaluateCondition(int step); bool evaluateCondition(int step);
CudaContext& cu; CudaContext& cu;
double prevStepSize, energy; double energy;
float energyFloat; float energyFloat;
int numGlobalVariables; int numGlobalVariables;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints; bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints;
......
...@@ -121,7 +121,7 @@ public: ...@@ -121,7 +121,7 @@ public:
ContextImpl* context; ContextImpl* context;
std::vector<CudaContext*> contexts; std::vector<CudaContext*> contexts;
std::vector<double> contextEnergy; std::vector<double> contextEnergy;
bool removeCM, peerAccessSupported, useCpuPme; bool hasInitializedContexts, removeCM, peerAccessSupported, useCpuPme;
int cmMotionFrequency; int cmMotionFrequency;
int stepCount, computeForceCount; int stepCount, computeForceCount;
double time; double time;
......
...@@ -99,7 +99,7 @@ void CudaBondedUtilities::initialize(const System& system) { ...@@ -99,7 +99,7 @@ void CudaBondedUtilities::initialize(const System& system) {
s<<CudaKernelSources::vectorOps; s<<CudaKernelSources::vectorOps;
for (int i = 0; i < (int) prefixCode.size(); i++) for (int i = 0; i < (int) prefixCode.size(); i++)
s<<prefixCode[i]; s<<prefixCode[i];
s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ"; s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
for (int force = 0; force < numForces; force++) { for (int force = 0; force < numForces; force++) {
for (int i = 0; i < (int) atomIndices[force].size(); i++) { for (int i = 0; i < (int) atomIndices[force].size(); i++) {
int indexWidth = atomIndices[force][i]->getElementSize()/4; int indexWidth = atomIndices[force][i]->getElementSize()/4;
...@@ -110,7 +110,7 @@ void CudaBondedUtilities::initialize(const System& system) { ...@@ -110,7 +110,7 @@ void CudaBondedUtilities::initialize(const System& system) {
for (int i = 0; i < (int) arguments.size(); i++) for (int i = 0; i < (int) arguments.size(); i++)
s<<", "<<argTypes[i]<<"* customArg"<<(i+1); s<<", "<<argTypes[i]<<"* customArg"<<(i+1);
s<<") {\n"; s<<") {\n";
s<<"real energy = 0;\n"; s<<"mixed energy = 0;\n";
for (int force = 0; force < numForces; force++) for (int force = 0; force < numForces; force++)
s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]); s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n"; s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n";
......
...@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT(cuDeviceGetCount(&numDevices)); CHECK_RESULT(cuDeviceGetCount(&numDevices));
if (deviceIndex < -1 || deviceIndex >= numDevices) if (deviceIndex < -1 || deviceIndex >= numDevices)
throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex)); throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
vector<int> devicePrecedence;
if (deviceIndex == -1) { if (deviceIndex == -1) {
// Try to figure out which device is the fastest. devicePrecedence = getDevicePrecedence();
} else {
int bestSpeed = -1; devicePrecedence.push_back(deviceIndex);
int bestCompute = -1; }
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&device, i)); this->deviceIndex = -1;
int major, minor, clock, multiprocessors; for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device)); int trialDeviceIndex = devicePrecedence[i];
if (major == 1 && minor < 2) CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
continue; // 1.0 and 1.1 are not supported defaultOptimizationOptions = "--use_fast_math";
CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device)); unsigned int flags = CU_CTX_MAP_HOST;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device)); if (useBlockingSync)
int speed = clock*multiprocessors; flags += CU_CTX_SCHED_BLOCKING_SYNC;
if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) { else
deviceIndex = i; flags += CU_CTX_SCHED_SPIN;
bestSpeed = speed;
bestCompute = major; if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
} this->deviceIndex = trialDeviceIndex;
break;
} }
} }
if (deviceIndex == -1) if (this->deviceIndex == -1)
throw OpenMMException("No compatible CUDA device is available"); if (deviceIndex != -1)
CHECK_RESULT(cuDeviceGet(&device, deviceIndex)); throw OpenMMException("The requested CUDA device could not be loaded");
this->deviceIndex = deviceIndex; else
throw OpenMMException("No compatible CUDA device is available");
int major, minor; int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device)); CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
// This is a workaround to support GTX 980 with CUDA 6.5. It reports its compute capability #if __CUDA_API_VERSION < 7000
// as 5.2, but the compiler doesn't support anything beyond 5.0. We can remove this once // This is a workaround to support GTX 980 with CUDA 6.5. It reports
// CUDA 7.0 is released. // its compute capability as 5.2, but the compiler doesn't support
if (major == 5) // anything beyond 5.0.
minor = 0; if (major == 5)
minor = 0;
#endif
gpuArchitecture = intToString(major)+intToString(minor); gpuArchitecture = intToString(major)+intToString(minor);
computeCapability = major+0.1*minor; computeCapability = major+0.1*minor;
if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
throw OpenMMException("This device does not support double precision");
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
CHECK_RESULT(cuCtxCreate(&context, flags, device));
contextIsValid = true; contextIsValid = true;
CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED)); CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
if (contextIndex > 0) { if (contextIndex > 0) {
...@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf"; compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff"; compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf"; compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
// Set defines for applying periodic boundary conditions. // Set defines for applying periodic boundary conditions.
Vec3 boxVectors[3]; Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]); system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 || boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
...@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking ...@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
} }
// Create the work thread used for parallelization when running on multiple devices. // Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread(); thread = new WorkThread();
// Create utilities objects. // Create utilities objects.
bonded = new CudaBondedUtilities(*this); bonded = new CudaBondedUtilities(*this);
nonbonded = new CudaNonbondedUtilities(*this); nonbonded = new CudaNonbondedUtilities(*this);
integration = new CudaIntegrationUtilities(*this, system); integration = new CudaIntegrationUtilities(*this, system);
...@@ -368,7 +367,7 @@ void CudaContext::initialize() { ...@@ -368,7 +367,7 @@ void CudaContext::initialize() {
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
else if (useMixedPrecision) { else if (useMixedPrecision) {
energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer"); energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers); int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0)); CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
} }
...@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri ...@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
if (index != result.npos) { if (index != result.npos) {
if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) { if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
// We have found a complete symbol, not part of a longer symbol. // We have found a complete symbol, not part of a longer symbol.
result.replace(index, size, iter->second); result.replace(index, size, iter->second);
index += iter->second.size(); index += iter->second.size();
} }
...@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) { ...@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
return -1; return -1;
} }
WaitForSingleObject(pi.hProcess, INFINITE); WaitForSingleObject(pi.hProcess, INFINITE);
DWORD exitCode = -1; DWORD exitCode = -1;
if(!GetExitCodeProcess(pi.hProcess, &exitCode)) { if(!GetExitCodeProcess(pi.hProcess, &exitCode)) {
throw(OpenMMException("Could not get nvcc.exe's exit code\n")); throw(OpenMMException("Could not get nvcc.exe's exit code\n"));
} else { } else {
if(exitCode == 0) if(exitCode == 0)
return 0; return 0;
else else
return -1; return -1;
...@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
if (!defines.empty()) if (!defines.empty())
src << endl; src << endl;
src << source << endl; src << source << endl;
// See whether we already have PTX for this kernel cached. // See whether we already have PTX for this kernel cached.
CSHA1 sha1; CSHA1 sha1;
sha1.Update((const UINT_8*) src.str().c_str(), src.str().size()); sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
sha1.Final(); sha1.Final();
...@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUmodule module; CUmodule module;
if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS) if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
return module; return module;
// Select names for the various temporary files. // Select names for the various temporary files.
stringstream tempFileName; stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions. tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
#ifdef WIN32 #ifdef WIN32
...@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
int res = 0; int res = 0;
// If the runtime compiler plugin is available, use it. // If the runtime compiler plugin is available, use it.
if (hasCompilerKernel) { if (hasCompilerKernel) {
string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this); string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
// If possible, write the PTX out to a temporary file so we can cache it for later use. // If possible, write the PTX out to a temporary file so we can cache it for later use.
bool wroteCache = false; bool wroteCache = false;
try { try {
ofstream out(outputFile.c_str()); ofstream out(outputFile.c_str());
...@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
} }
if (!wroteCache) { if (!wroteCache) {
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly. // An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module"); CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
return module; return module;
} }
...@@ -883,7 +882,7 @@ private: ...@@ -883,7 +882,7 @@ private:
void CudaContext::findMoleculeGroups() { void CudaContext::findMoleculeGroups() {
// The first time this is called, we need to identify all the molecules in the system. // The first time this is called, we need to identify all the molecules in the system.
if (moleculeGroups.size() == 0) { if (moleculeGroups.size() == 0) {
// Add a ForceInfo that makes sure reordering doesn't break virtual sites. // Add a ForceInfo that makes sure reordering doesn't break virtual sites.
...@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() { ...@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i])) if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
identical = false; identical = false;
} }
// See if the constraints are identical. // See if the constraints are identical.
for (int i = 0; i < (int) mol.constraints.size() && identical; i++) { for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
...@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() { ...@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
} }
if (valid) if (valid)
return; return;
// The list of which molecules are identical is no longer valid. We need to restore the // The list of which molecules are identical is no longer valid. We need to restore the
// atoms to their original order, rebuild the list of identical molecules, and sort them // atoms to their original order, rebuild the list of identical molecules, and sort them
// again. // again.
vector<int4> newCellOffsets(numAtoms); vector<int4> newCellOffsets(numAtoms);
if (useDoublePrecision) { if (useDoublePrecision) {
vector<double4> oldPosq(paddedNumAtoms); vector<double4> oldPosq(paddedNumAtoms);
...@@ -1196,6 +1195,8 @@ void CudaContext::reorderAtomsImpl() { ...@@ -1196,6 +1195,8 @@ void CudaContext::reorderAtomsImpl() {
molPos[i].x *= invNumAtoms; molPos[i].x *= invNumAtoms;
molPos[i].y *= invNumAtoms; molPos[i].y *= invNumAtoms;
molPos[i].z *= invNumAtoms; molPos[i].z *= invNumAtoms;
if (molPos[i].x != molPos[i].x)
throw OpenMMException("Particle coordinate is nan");
} }
if (nonbonded->getUsePeriodic()) { if (nonbonded->getUsePeriodic()) {
// Move each molecule position into the same box. // Move each molecule position into the same box.
...@@ -1391,3 +1392,41 @@ void CudaContext::WorkThread::flush() { ...@@ -1391,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
pthread_cond_wait(&queueEmptyCondition, &queueLock); pthread_cond_wait(&queueEmptyCondition, &queueLock);
pthread_mutex_unlock(&queueLock); pthread_mutex_unlock(&queueLock);
} }
vector<int> CudaContext::getDevicePrecedence() {
int numDevices;
CUdevice thisDevice;
string errorMessage = "Error initializing Context";
vector<pair<pair<int, int>, int> > devices;
CHECK_RESULT(cuDeviceGetCount(&numDevices));
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&thisDevice, i));
int major, minor, clock, multiprocessors, speed;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
if (major == 1 && minor < 2)
continue;
if ((useDoublePrecision || useMixedPrecision) && (major+0.1*minor < 1.3))
continue;
CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
speed = clock*multiprocessors;
pair<int, int> deviceProperties = std::make_pair(major, speed);
devices.push_back(std::make_pair(deviceProperties, -i));
}
// sort first by compute capability (higher is better), then speed
// (higher is better), and finally device index (lower is better)
std::sort(devices.begin(), devices.end());
std::reverse(devices.begin(), devices.end());
vector<int> precedence;
for (int i = 0; i < static_cast<int>(devices.size()); i++) {
precedence.push_back(-devices[i].second);
}
return precedence;
}
...@@ -109,7 +109,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express ...@@ -109,7 +109,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
} }
out << ");\n"; out << ");\n";
out << "APPLY_PERIODIC_TO_DELTA(periodicDistance_delta)\n"; out << "APPLY_PERIODIC_TO_DELTA(periodicDistance_delta)\n";
out << tempType << " periodicDistance_rinv = RSQRT(periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z);\n"; out << tempType << " periodicDistance_r2 = periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z;\n";
out << tempType << " periodicDistance_rinv = RSQRT(periodicDistance_r2);\n";
for (int j = 0; j < nodes.size(); j++) { for (int j = 0; j < nodes.size(); j++) {
const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder(); const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
int argIndex = -1; int argIndex = -1;
...@@ -123,17 +124,17 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express ...@@ -123,17 +124,17 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
if (argIndex == -1) if (argIndex == -1)
out << nodeNames[j] << " = RECIP(periodicDistance_rinv);\n"; out << nodeNames[j] << " = RECIP(periodicDistance_rinv);\n";
else if (argIndex == 0) else if (argIndex == 0)
out << nodeNames[j] << " = periodicDistance_delta.x*periodicDistance_rinv;\n"; out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.x*periodicDistance_rinv : 0);\n";
else if (argIndex == 1) else if (argIndex == 1)
out << nodeNames[j] << " = periodicDistance_delta.y*periodicDistance_rinv;\n"; out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.y*periodicDistance_rinv : 0);\n";
else if (argIndex == 2) else if (argIndex == 2)
out << nodeNames[j] << " = periodicDistance_delta.z*periodicDistance_rinv;\n"; out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.z*periodicDistance_rinv : 0);\n";
else if (argIndex == 3) else if (argIndex == 3)
out << nodeNames[j] << " = -periodicDistance_delta.x*periodicDistance_rinv;\n"; out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.x*periodicDistance_rinv : 0);\n";
else if (argIndex == 4) else if (argIndex == 4)
out << nodeNames[j] << " = -periodicDistance_delta.y*periodicDistance_rinv;\n"; out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.y*periodicDistance_rinv : 0);\n";
else if (argIndex == 5) else if (argIndex == 5)
out << nodeNames[j] << " = -periodicDistance_delta.z*periodicDistance_rinv;\n"; out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.z*periodicDistance_rinv : 0);\n";
} }
} }
else { else {
......
...@@ -106,21 +106,21 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -106,21 +106,21 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsAtoms(NULL), vsiteLocalCoordsParams(NULL) { vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsAtoms(NULL), vsiteLocalCoordsParams(NULL) {
// Create workspace arrays. // Create workspace arrays.
lastStepSize = make_double2(0.0, 0.0);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) { if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
posDelta = CudaArray::create<double4>(context, context.getPaddedNumAtoms(), "posDelta"); posDelta = CudaArray::create<double4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0)); vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
posDelta->upload(deltas); posDelta->upload(deltas);
stepSize = CudaArray::create<double2>(context, 1, "stepSize"); stepSize = CudaArray::create<double2>(context, 1, "stepSize");
vector<double2> step(1, make_double2(0.0, 0.0)); stepSize->upload(&lastStepSize);
stepSize->upload(step);
} }
else { else {
posDelta = CudaArray::create<float4>(context, context.getPaddedNumAtoms(), "posDelta"); posDelta = CudaArray::create<float4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<float4> deltas(posDelta->getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f)); vector<float4> deltas(posDelta->getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f));
posDelta->upload(deltas); posDelta->upload(deltas);
stepSize = CudaArray::create<float2>(context, 1, "stepSize"); stepSize = CudaArray::create<float2>(context, 1, "stepSize");
vector<float2> step(1, make_float2(0.0f, 0.0f)); float2 lastStepSizeFloat = make_float2(0.0f, 0.0f);
stepSize->upload(step); stepSize->upload(&lastStepSizeFloat);
} }
// Record the set of constraints and how many constraints each atom is involved in. // Record the set of constraints and how many constraints each atom is involved in.
...@@ -650,6 +650,29 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() { ...@@ -650,6 +650,29 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete vsiteLocalCoordsParams; delete vsiteLocalCoordsParams;
} }
void CudaIntegrationUtilities::setNextStepSize(double size) {
if (size != lastStepSize.x || size != lastStepSize.y) {
lastStepSize = make_double2(size, size);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
stepSize->upload(&lastStepSize);
else {
float2 lastStepSizeFloat = make_float2((float) size, (float) size);
stepSize->upload(&lastStepSizeFloat);
}
}
}
double CudaIntegrationUtilities::getLastStepSize() {
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
stepSize->download(&lastStepSize);
else {
float2 lastStepSizeFloat;
stepSize->download(&lastStepSizeFloat);
lastStepSize = make_double2(lastStepSizeFloat.x, lastStepSizeFloat.y);
}
return lastStepSize.y;
}
void CudaIntegrationUtilities::applyConstraints(double tol) { void CudaIntegrationUtilities::applyConstraints(double tol) {
applyConstraints(false, tol); applyConstraints(false, tol);
} }
......
...@@ -112,7 +112,7 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo ...@@ -112,7 +112,7 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
cu.getIntegrationUtilities().distributeForcesFromVirtualSites(); cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
if (includeEnergy) { if (includeEnergy) {
CudaArray& energyArray = cu.getEnergyBuffer(); CudaArray& energyArray = cu.getEnergyBuffer();
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double* energy = (double*) cu.getPinnedBuffer(); double* energy = (double*) cu.getPinnedBuffer();
energyArray.download(energy); energyArray.download(energy);
for (int i = 0; i < energyArray.getSize(); i++) for (int i = 0; i < energyArray.getSize(); i++)
...@@ -1458,16 +1458,24 @@ private: ...@@ -1458,16 +1458,24 @@ private:
class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation { class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
public: public:
SyncStreamPostComputation(CudaContext& cu, CUevent event, int forceGroup) : cu(cu), event(event), forceGroup(forceGroup) { SyncStreamPostComputation(CudaContext& cu, CUevent event, CUfunction addEnergyKernel, CudaArray& pmeEnergyBuffer, int forceGroup) : cu(cu), event(event),
addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
} }
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) { double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0) if ((groups&(1<<forceGroup)) != 0)
cuStreamWaitEvent(cu.getCurrentStream(), event, 0); cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
if (includeEnergy) {
int bufferSize = pmeEnergyBuffer.getSize();
void* args[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
cu.executeKernel(addEnergyKernel, args, bufferSize);
}
return 0.0; return 0.0;
} }
private: private:
CudaContext& cu; CudaContext& cu;
CUevent event; CUevent event;
CUfunction addEnergyKernel;
CudaArray& pmeEnergyBuffer;
int forceGroup; int forceGroup;
}; };
...@@ -1493,6 +1501,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1493,6 +1501,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete pmeAtomRange; delete pmeAtomRange;
if (pmeAtomGridIndex != NULL) if (pmeAtomGridIndex != NULL)
delete pmeAtomGridIndex; delete pmeAtomGridIndex;
if (pmeEnergyBuffer != NULL)
delete pmeEnergyBuffer;
if (sort != NULL) if (sort != NULL)
delete sort; delete sort;
if (fft != NULL) if (fft != NULL)
...@@ -1681,6 +1691,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1681,6 +1691,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
pmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
cu.clearBuffer(*pmeEnergyBuffer);
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms()); sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
int cufftVersion; int cufftVersion;
cufftGetVersion(&cufftVersion); cufftGetVersion(&cufftVersion);
...@@ -1714,7 +1727,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1714,7 +1727,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if (recipForceGroup < 0) if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup(); recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup)); cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, recipForceGroup)); cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), *pmeEnergyBuffer, recipForceGroup));
} }
hasInitializedFFT = true; hasInitializedFFT = true;
...@@ -1889,7 +1902,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1889,7 +1902,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
} }
if (includeEnergy) { if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), usePmeStream ? &pmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]}; cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms()); cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms());
...@@ -5674,7 +5687,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle ...@@ -5674,7 +5687,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, ""); CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, "");
kernel1 = cu.getKernel(module, "integrateVerletPart1"); kernel1 = cu.getKernel(module, "integrateVerletPart1");
kernel2 = cu.getKernel(module, "integrateVerletPart2"); kernel2 = cu.getKernel(module, "integrateVerletPart2");
prevStepSize = -1.0;
} }
void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIntegrator& integrator) { void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIntegrator& integrator) {
...@@ -5683,19 +5695,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn ...@@ -5683,19 +5695,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
int numAtoms = cu.getNumAtoms(); int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms(); int paddedNumAtoms = cu.getPaddedNumAtoms();
double dt = integrator.getStepSize(); double dt = integrator.getStepSize();
if (dt != prevStepSize) { cu.getIntegrationUtilities().setNextStepSize(dt);
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
vector<double2> stepSizeVec(1);
stepSizeVec[0] = make_double2(dt, dt);
cu.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
}
else {
vector<float2> stepSizeVec(1);
stepSizeVec[0] = make_float2((float) dt, (float) dt);
cu.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
}
prevStepSize = dt;
}
// Call the first integration kernel. // Call the first integration kernel.
...@@ -5752,6 +5752,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -5752,6 +5752,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
double temperature = integrator.getTemperature(); double temperature = integrator.getTemperature();
double friction = integrator.getFriction(); double friction = integrator.getFriction();
double stepSize = integrator.getStepSize(); double stepSize = integrator.getStepSize();
cu.getIntegrationUtilities().setNextStepSize(stepSize);
if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) { if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
// Calculate the integration parameters. // Calculate the integration parameters.
...@@ -5766,8 +5767,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -5766,8 +5767,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
p[1] = fscale; p[1] = fscale;
p[2] = noisescale; p[2] = noisescale;
params->upload(p); params->upload(p);
double2 ss = make_double2(0, stepSize);
integration.getStepSize().upload(&ss);
} }
else { else {
vector<float> p(params->getSize()); vector<float> p(params->getSize());
...@@ -5775,8 +5774,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev ...@@ -5775,8 +5774,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
p[1] = (float) fscale; p[1] = (float) fscale;
p[2] = (float) noisescale; p[2] = (float) noisescale;
params->upload(p); params->upload(p);
float2 ss = make_float2(0, (float) stepSize);
integration.getStepSize().upload(&ss);
} }
prevTemp = temperature; prevTemp = temperature;
prevFriction = friction; prevFriction = friction;
...@@ -5929,20 +5926,13 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons ...@@ -5929,20 +5926,13 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
// Update the time and step count. // Update the time and step count.
double dt, time; double dt = cu.getIntegrationUtilities().getLastStepSize();
double time = cu.getTime()+dt;
if (useDouble) { if (useDouble) {
double2 stepSize;
cu.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cu.getTime()+dt;
if (dt == maxStepSize) if (dt == maxStepSize)
time = maxTime; // Avoid round-off error time = maxTime; // Avoid round-off error
} }
else { else {
float2 stepSize;
cu.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cu.getTime()+dt;
if (dt == maxStepSizeFloat) if (dt == maxStepSizeFloat)
time = maxTime; // Avoid round-off error time = maxTime; // Avoid round-off error
} }
...@@ -6023,20 +6013,13 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co ...@@ -6023,20 +6013,13 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
// Update the time and step count. // Update the time and step count.
double dt, time; double dt = cu.getIntegrationUtilities().getLastStepSize();
double time = cu.getTime()+dt;
if (useDouble) { if (useDouble) {
double2 stepSize;
cu.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cu.getTime()+dt;
if (dt == maxStepSize) if (dt == maxStepSize)
time = maxTime; // Avoid round-off error time = maxTime; // Avoid round-off error
} }
else { else {
float2 stepSize;
cu.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cu.getTime()+dt;
if (dt == maxStepSizeFloat) if (dt == maxStepSizeFloat)
time = maxTime; // Avoid round-off error time = maxTime; // Avoid round-off error
} }
...@@ -6139,7 +6122,6 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo ...@@ -6139,7 +6122,6 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
summedValue = new CudaArray(cu, 1, elementSize, "summedValue"); summedValue = new CudaArray(cu, 1, elementSize, "summedValue");
perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision() || cu.getUseMixedPrecision()); perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent)); cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
prevStepSize = -1.0;
SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed()); SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
} }
...@@ -6553,9 +6535,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -6553,9 +6535,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
} }
localValuesAreCurrent = false; localValuesAreCurrent = false;
double stepSize = integrator.getStepSize(); double stepSize = integrator.getStepSize();
if (stepSize != prevStepSize) { recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex));
recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex));
}
for (int i = 0; i < (int) parameterNames.size(); i++) { for (int i = 0; i < (int) parameterNames.size(); i++) {
double value = context.getParameter(parameterNames[i]); double value = context.getParameter(parameterNames[i]);
if (value != globalValuesDouble[parameterVariableIndex[i]]) { if (value != globalValuesDouble[parameterVariableIndex[i]]) {
...@@ -6760,17 +6740,10 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context, ...@@ -6760,17 +6740,10 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
void CudaIntegrateCustomStepKernel::recordGlobalValue(double value, GlobalTarget target) { void CudaIntegrateCustomStepKernel::recordGlobalValue(double value, GlobalTarget target) {
switch (target.type) { switch (target.type) {
case DT: case DT:
if (value != globalValuesDouble[dtVariableIndex])
deviceGlobalsAreCurrent = false;
globalValuesDouble[dtVariableIndex] = value; globalValuesDouble[dtVariableIndex] = value;
deviceGlobalsAreCurrent = false; cu.getIntegrationUtilities().setNextStepSize(value);
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double size[] = {0, value};
cu.getIntegrationUtilities().getStepSize().upload(size);
}
else {
float size[] = {0, (float) value};
cu.getIntegrationUtilities().getStepSize().upload(size);
}
prevStepSize = value;
break; break;
case VARIABLE: case VARIABLE:
case PARAMETER: case PARAMETER:
......
...@@ -179,7 +179,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const { ...@@ -179,7 +179,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
} }
CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty, CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0) { const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false) {
bool blocking = (blockingProperty == "true"); bool blocking = (blockingProperty == "true");
vector<string> devices; vector<string> devices;
size_t searchPos = 0, nextPos; size_t searchPos = 0, nextPos;
...@@ -247,8 +247,11 @@ CudaPlatform::PlatformData::~PlatformData() { ...@@ -247,8 +247,11 @@ CudaPlatform::PlatformData::~PlatformData() {
} }
void CudaPlatform::PlatformData::initializeContexts(const System& system) { void CudaPlatform::PlatformData::initializeContexts(const System& system) {
if (hasInitializedContexts)
return;
for (int i = 0; i < (int) contexts.size(); i++) for (int i = 0; i < (int) contexts.size(); i++)
contexts[i]->initialize(); contexts[i]->initialize();
hasInitializedContexts = true;
} }
void CudaPlatform::PlatformData::syncContexts() { void CudaPlatform::PlatformData::syncContexts() {
......
...@@ -104,10 +104,10 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) { ...@@ -104,10 +104,10 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
/** /**
* Compute the forces on groups based on the bonds. * Compute the forces on groups based on the bonds.
*/ */
extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, real* __restrict__ energyBuffer, const real4* __restrict__ centerPositions, extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, mixed* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
const int* __restrict__ bondGroups const int* __restrict__ bondGroups
EXTRA_ARGS) { EXTRA_ARGS) {
real energy = 0; mixed energy = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) {
COMPUTE_FORCE COMPUTE_FORCE
} }
......
...@@ -13,7 +13,7 @@ typedef struct { ...@@ -13,7 +13,7 @@ typedef struct {
/** /**
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles, const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
...@@ -27,7 +27,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -27,7 +27,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx; const unsigned int tbx = threadIdx.x - tgx;
real energy = 0; mixed energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE]; __shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
......
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms. * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/ */
extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
real energy = 0; mixed energy = 0;
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Load the derivatives // Load the derivatives
......
...@@ -66,12 +66,12 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) { ...@@ -66,12 +66,12 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
/** /**
* Compute forces on donors. * Compute forces on donors.
*/ */
extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq, extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
extern __shared__ real4 posBuffer[]; extern __shared__ real4 posBuffer[];
real energy = 0; mixed energy = 0;
real3 f1 = make_real3(0); real3 f1 = make_real3(0);
real3 f2 = make_real3(0); real3 f2 = make_real3(0);
real3 f3 = make_real3(0); real3 f3 = make_real3(0);
...@@ -155,7 +155,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f ...@@ -155,7 +155,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
/** /**
* Compute forces on acceptors. * Compute forces on acceptors.
*/ */
extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq, extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
......
...@@ -78,7 +78,7 @@ __constant__ float globals[NUM_GLOBALS]; ...@@ -78,7 +78,7 @@ __constant__ float globals[NUM_GLOBALS];
* Compute the interaction. * Compute the interaction.
*/ */
extern "C" __global__ void computeInteraction( extern "C" __global__ void computeInteraction(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex , const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
...@@ -90,7 +90,7 @@ extern "C" __global__ void computeInteraction( ...@@ -90,7 +90,7 @@ extern "C" __global__ void computeInteraction(
, int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex , int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
real energy = 0.0f; mixed energy = 0;
// Loop over particles to be the first one in the set. // Loop over particles to be the first one in the set.
......
...@@ -9,14 +9,14 @@ typedef struct { ...@@ -9,14 +9,14 @@ typedef struct {
} AtomData; } AtomData;
extern "C" __global__ void computeInteractionGroups( extern "C" __global__ void computeInteractionGroups(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData, unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
real energy = 0.0f; mixed energy = 0;
__shared__ AtomData localData[LOCAL_MEMORY_SIZE]; __shared__ AtomData localData[LOCAL_MEMORY_SIZE];
const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps; const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment