Commit 2d2f05ce authored by Andy Simmonett's avatar Andy Simmonett
Browse files

Merge branch 'master' of github.com:pandegroup/openmm into genpt

parents 94823d84 4d32047c
......@@ -28,7 +28,7 @@
#include "CpuNonbondedForce.h"
#include "ReferenceForce.h"
#include "ReferencePME.h"
#include "gmx_atomic.h"
#include "openmm/internal/gmx_atomic.h"
#include <algorithm>
// In case we're using some primitive version of Visual Studio this will
......@@ -322,6 +322,14 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
threads.execute(task);
threads.waitForThreads();
// Signal the threads to subtract the exclusions.
if (ewald || pme) {
gmx_atomic_set(&counter, 0);
threads.resumeThreads();
threads.waitForThreads();
}
// Combine the energies from all the threads.
if (totalEnergy != NULL) {
......@@ -354,28 +362,37 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
for (int i = threadIndex; i < numberOfAtoms; i += numThreads) {
fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
if (*iter > i) {
int j = *iter;
fvec4 deltaR;
fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
float r2;
getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
float r = sqrtf(r2);
float inverseR = 1/r;
float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
float alphaR = alphaEwald*r;
float erfAlphaR = erf(alphaR);
if (erfAlphaR > 1e-6f) {
float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
dEdR = (float) (dEdR * (erfAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
fvec4 result = deltaR*dEdR;
(fvec4(forces+4*i)-result).store(forces+4*i);
(fvec4(forces+4*j)+result).store(forces+4*j);
if (includeEnergy)
threadEnergy[threadIndex] -= chargeProd*inverseR*erfAlphaR;
threads.syncThreads();
const int groupSize = max(1, numberOfAtoms/(10*numThreads));
while (true) {
int start = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), groupSize);
if (start >= numberOfAtoms)
break;
int end = min(start+groupSize, numberOfAtoms);
for (int i = start; i < end; i++) {
fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
float scaledChargeI = (float) (ONE_4PI_EPS0*posq[4*i+3]);
for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
if (*iter > i) {
int j = *iter;
fvec4 deltaR;
fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
float r2;
getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
float r = sqrtf(r2);
float alphaR = alphaEwald*r;
float erfAlphaR = erf(alphaR);
if (erfAlphaR > 1e-6f) {
float inverseR = 1/r;
float chargeProdOverR = scaledChargeI*posq[4*j+3]*inverseR;
float dEdR = chargeProdOverR*inverseR*inverseR;
dEdR = dEdR * (erfAlphaR-(float)TWO_OVER_SQRT_PI*alphaR*(float)exp(-alphaR*alphaR));
fvec4 result = deltaR*dEdR;
(fvec4(forces+4*i)-result).store(forces+4*i);
(fvec4(forces+4*j)+result).store(forces+4*j);
if (includeEnergy)
threadEnergy[threadIndex] -= chargeProdOverR*erfAlphaR;
}
}
}
}
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Portions copyright (c) 2013-2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -30,6 +30,7 @@
* -------------------------------------------------------------------------- */
#include "CpuSETTLE.h"
#include "openmm/internal/gmx_atomic.h"
using namespace OpenMM;
using namespace std;
......@@ -39,10 +40,14 @@ public:
ApplyToPositionsTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& atomCoordinatesP, vector<RealOpenMM>& inverseMasses,
RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), atomCoordinatesP(atomCoordinatesP),
inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
gmx_atomic_set(&atomicCounter, 0);
}
void execute(ThreadPool& threads, int threadIndex) {
if (threadIndex < threadSettle.size()) {
threadSettle[threadIndex]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
while (true) {
int index = gmx_atomic_fetch_add(&atomicCounter, 1);
if (index >= threadSettle.size())
break;
threadSettle[index]->apply(atomCoordinates, atomCoordinatesP, inverseMasses, tolerance);
}
}
vector<OpenMM::RealVec>& atomCoordinates;
......@@ -50,6 +55,7 @@ public:
vector<RealOpenMM>& inverseMasses;
RealOpenMM tolerance;
vector<ReferenceSETTLEAlgorithm*>& threadSettle;
gmx_atomic_t atomicCounter;
};
class CpuSETTLE::ApplyToVelocitiesTask : public ThreadPool::Task {
......@@ -57,10 +63,14 @@ public:
ApplyToVelocitiesTask(vector<OpenMM::RealVec>& atomCoordinates, vector<OpenMM::RealVec>& velocities, vector<RealOpenMM>& inverseMasses,
RealOpenMM tolerance, vector<ReferenceSETTLEAlgorithm*>& threadSettle) : atomCoordinates(atomCoordinates), velocities(velocities),
inverseMasses(inverseMasses), tolerance(tolerance), threadSettle(threadSettle) {
gmx_atomic_set(&atomicCounter, 0);
}
void execute(ThreadPool& threads, int threadIndex) {
if (threadIndex < threadSettle.size()) {
threadSettle[threadIndex]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
while (true) {
int index = gmx_atomic_fetch_add(&atomicCounter, 1);
if (index >= threadSettle.size())
break;
threadSettle[index]->applyToVelocities(atomCoordinates, velocities, inverseMasses, tolerance);
}
}
vector<OpenMM::RealVec>& atomCoordinates;
......@@ -68,17 +78,18 @@ public:
vector<RealOpenMM>& inverseMasses;
RealOpenMM tolerance;
vector<ReferenceSETTLEAlgorithm*>& threadSettle;
gmx_atomic_t atomicCounter;
};
CpuSETTLE::CpuSETTLE(const System& system, const ReferenceSETTLEAlgorithm& settle, ThreadPool& threads) : threads(threads) {
int numThreads = threads.getNumThreads();
int numBlocks = 10*threads.getNumThreads();
int numClusters = settle.getNumClusters();
vector<RealOpenMM> mass(system.getNumParticles());
for (int i = 0; i < system.getNumParticles(); i++)
mass[i] = system.getParticleMass(i);
for (int i = 0; i < numThreads; i++) {
int start = i*numClusters/numThreads;
int end = (i+1)*numClusters/numThreads;
for (int i = 0; i < numBlocks; i++) {
int start = i*numClusters/numBlocks;
int end = (i+1)*numClusters/numBlocks;
if (start != end) {
int numThreadClusters = end-start;
vector<int> atom1(numThreadClusters), atom2(numThreadClusters), atom3(numThreadClusters);
......
#ifndef OPENMM_GBVIFORCE_PROXY_H_
#define OPENMM_GBVIFORCE_PROXY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
......@@ -9,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2010 Stanford University and the Authors. *
* Portions copyright (c) 2015 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -32,22 +29,8 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "openmm/internal/windowsExport.h"
#include "openmm/serialization/SerializationProxy.h"
namespace OpenMM {
/**
* This is a proxy for serializing GBVIForce objects.
*/
class OPENMM_EXPORT GBVIForceProxy : public SerializationProxy {
public:
GBVIForceProxy();
void serialize(const void* object, SerializationNode& node) const;
void* deserialize(const SerializationNode& node) const;
};
} // namespace OpenMM
#include "CpuTests.h"
#include "TestCompoundIntegrator.h"
#endif /*OPENMM_GBVIFORCE_PROXY_H_*/
void runPlatformTests() {
}
......@@ -13,9 +13,9 @@
#----------------------------------------------------
set(OPENMM_BUILD_CUDA_TESTS TRUE CACHE BOOL "Whether to build CUDA test cases")
if(OPENMM_BUILD_CUDA_TESTS)
if(BUILD_TESTING AND OPENMM_BUILD_CUDA_TESTS)
SUBDIRS (tests)
endif(OPENMM_BUILD_CUDA_TESTS)
endif(BUILD_TESTING AND OPENMM_BUILD_CUDA_TESTS)
# The source is organized into subdirectories, but we handle them all from
# this CMakeLists file rather than letting CMake visit them as SUBDIRS.
......
......@@ -30,6 +30,7 @@
#include <map>
#include <queue>
#include <string>
#include <utility>
#define __CL_ENABLE_EXCEPTIONS
#ifdef _MSC_VER
// Prevent Windows from defining macros that interfere with other code.
......@@ -538,6 +539,11 @@ public:
*/
void invalidateMolecules();
private:
/**
* Compute a sorted list of device indices in decreasing order of desirability
*/
std::vector<int> getDevicePrecedence();
struct Molecule;
struct MoleculeGroup;
class VirtualSiteInfo;
......
......@@ -62,6 +62,14 @@ public:
CudaArray& getStepSize() {
return *stepSize;
}
/**
* Set the size to use for the next step.
*/
void setNextStepSize(double size);
/**
* Get the size that was used for the last step.
*/
double getLastStepSize();
/**
* Apply constraints to the atom positions.
*
......@@ -154,6 +162,7 @@ private:
CudaArray* vsiteLocalCoordsParams;
int randomPos;
int lastSeed, numVsites;
double2 lastStepSize;
struct ShakeCluster;
struct ConstraintOrderer;
};
......
......@@ -592,7 +592,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, const System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), pmeEnergyBuffer(NULL), sort(NULL), fft(NULL), pmeio(NULL) {
}
~CudaCalcNonbondedForceKernel();
/**
......@@ -657,6 +657,7 @@ private:
CudaArray* pmeBsplineModuliZ;
CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex;
CudaArray* pmeEnergyBuffer;
CudaSort* sort;
Kernel cpuPme;
PmeIO* pmeio;
......@@ -1123,7 +1124,6 @@ public:
double computeKineticEnergy(ContextImpl& context, const VerletIntegrator& integrator);
private:
CudaContext& cu;
double prevStepSize;
CUfunction kernel1, kernel2;
};
......@@ -1354,7 +1354,7 @@ private:
void recordChangedParameters(ContextImpl& context);
bool evaluateCondition(int step);
CudaContext& cu;
double prevStepSize, energy;
double energy;
float energyFloat;
int numGlobalVariables;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints;
......
......@@ -121,7 +121,7 @@ public:
ContextImpl* context;
std::vector<CudaContext*> contexts;
std::vector<double> contextEnergy;
bool removeCM, peerAccessSupported, useCpuPme;
bool hasInitializedContexts, removeCM, peerAccessSupported, useCpuPme;
int cmMotionFrequency;
int stepCount, computeForceCount;
double time;
......
......@@ -99,7 +99,7 @@ void CudaBondedUtilities::initialize(const System& system) {
s<<CudaKernelSources::vectorOps;
for (int i = 0; i < (int) prefixCode.size(); i++)
s<<prefixCode[i];
s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, real* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
s<<"extern \"C\" __global__ void computeBondedForces(unsigned long long* __restrict__ forceBuffer, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, int groups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ";
for (int force = 0; force < numForces; force++) {
for (int i = 0; i < (int) atomIndices[force].size(); i++) {
int indexWidth = atomIndices[force][i]->getElementSize()/4;
......@@ -110,7 +110,7 @@ void CudaBondedUtilities::initialize(const System& system) {
for (int i = 0; i < (int) arguments.size(); i++)
s<<", "<<argTypes[i]<<"* customArg"<<(i+1);
s<<") {\n";
s<<"real energy = 0;\n";
s<<"mixed energy = 0;\n";
for (int force = 0; force < numForces; force++)
s<<createForceSource(force, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
s<<"energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;\n";
......
......@@ -122,49 +122,48 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
CHECK_RESULT(cuDeviceGetCount(&numDevices));
if (deviceIndex < -1 || deviceIndex >= numDevices)
throw OpenMMException("Illegal value for CudaDeviceIndex: "+intToString(deviceIndex));
vector<int> devicePrecedence;
if (deviceIndex == -1) {
// Try to figure out which device is the fastest.
int bestSpeed = -1;
int bestCompute = -1;
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&device, i));
int major, minor, clock, multiprocessors;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
if (major == 1 && minor < 2)
continue; // 1.0 and 1.1 are not supported
CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
int speed = clock*multiprocessors;
if (major > bestCompute || (major == bestCompute && speed > bestSpeed)) {
deviceIndex = i;
bestSpeed = speed;
bestCompute = major;
}
devicePrecedence = getDevicePrecedence();
} else {
devicePrecedence.push_back(deviceIndex);
}
this->deviceIndex = -1;
for (int i = 0; i < static_cast<int>(devicePrecedence.size()); i++) {
int trialDeviceIndex = devicePrecedence[i];
CHECK_RESULT(cuDeviceGet(&device, trialDeviceIndex));
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
if (cuCtxCreate(&context, flags, device) == CUDA_SUCCESS) {
this->deviceIndex = trialDeviceIndex;
break;
}
}
if (deviceIndex == -1)
throw OpenMMException("No compatible CUDA device is available");
CHECK_RESULT(cuDeviceGet(&device, deviceIndex));
this->deviceIndex = deviceIndex;
if (this->deviceIndex == -1)
if (deviceIndex != -1)
throw OpenMMException("The requested CUDA device could not be loaded");
else
throw OpenMMException("No compatible CUDA device is available");
int major, minor;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, device));
// This is a workaround to support GTX 980 with CUDA 6.5. It reports its compute capability
// as 5.2, but the compiler doesn't support anything beyond 5.0. We can remove this once
// CUDA 7.0 is released.
if (major == 5)
minor = 0;
#if __CUDA_API_VERSION < 7000
// This is a workaround to support GTX 980 with CUDA 6.5. It reports
// its compute capability as 5.2, but the compiler doesn't support
// anything beyond 5.0.
if (major == 5)
minor = 0;
#endif
gpuArchitecture = intToString(major)+intToString(minor);
computeCapability = major+0.1*minor;
if ((useDoublePrecision || useMixedPrecision) && computeCapability < 1.3)
throw OpenMMException("This device does not support double precision");
defaultOptimizationOptions = "--use_fast_math";
unsigned int flags = CU_CTX_MAP_HOST;
if (useBlockingSync)
flags += CU_CTX_SCHED_BLOCKING_SYNC;
else
flags += CU_CTX_SCHED_SPIN;
CHECK_RESULT(cuCtxCreate(&context, flags, device));
contextIsValid = true;
CHECK_RESULT(cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED));
if (contextIndex > 0) {
......@@ -245,9 +244,9 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["ATAN"] = useDoublePrecision ? "atan" : "atanf";
compilationDefines["ERF"] = useDoublePrecision ? "erf" : "erff";
compilationDefines["ERFC"] = useDoublePrecision ? "erfc" : "erfcf";
// Set defines for applying periodic boundary conditions.
Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
boxIsTriclinic = (boxVectors[0][1] != 0.0 || boxVectors[0][2] != 0.0 ||
......@@ -307,11 +306,11 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
}
// Create the work thread used for parallelization when running on multiple devices.
thread = new WorkThread();
// Create utilities objects.
bonded = new CudaBondedUtilities(*this);
nonbonded = new CudaNonbondedUtilities(*this);
integration = new CudaIntegrationUtilities(*this, system);
......@@ -368,7 +367,7 @@ void CudaContext::initialize() {
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
}
else if (useMixedPrecision) {
energyBuffer = CudaArray::create<float>(*this, numEnergyBuffers, "energyBuffer");
energyBuffer = CudaArray::create<double>(*this, numEnergyBuffers, "energyBuffer");
int pinnedBufferSize = max(paddedNumAtoms*4, numEnergyBuffers);
CHECK_RESULT(cuMemHostAlloc(&pinnedBuffer, pinnedBufferSize*sizeof(double), 0));
}
......@@ -427,7 +426,7 @@ string CudaContext::replaceStrings(const string& input, const std::map<std::stri
if (index != result.npos) {
if ((index == 0 || symbolChars.find(result[index-1]) == symbolChars.end()) && (index == result.size()-size || symbolChars.find(result[index+size]) == symbolChars.end())) {
// We have found a complete symbol, not part of a longer symbol.
result.replace(index, size, iter->second);
index += iter->second.size();
}
......@@ -462,11 +461,11 @@ static bool compileInWindows(const string &command) {
return -1;
}
WaitForSingleObject(pi.hProcess, INFINITE);
DWORD exitCode = -1;
DWORD exitCode = -1;
if(!GetExitCodeProcess(pi.hProcess, &exitCode)) {
throw(OpenMMException("Could not get nvcc.exe's exit code\n"));
} else {
if(exitCode == 0)
if(exitCode == 0)
return 0;
else
return -1;
......@@ -522,9 +521,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
if (!defines.empty())
src << endl;
src << source << endl;
// See whether we already have PTX for this kernel cached.
CSHA1 sha1;
sha1.Update((const UINT_8*) src.str().c_str(), src.str().size());
sha1.Final();
......@@ -539,9 +538,9 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
CUmodule module;
if (cuModuleLoad(&module, cacheFile.str().c_str()) == CUDA_SUCCESS)
return module;
// Select names for the various temporary files.
stringstream tempFileName;
tempFileName << "openmmTempKernel" << this; // Include a pointer to this context as part of the filename to avoid collisions.
#ifdef WIN32
......@@ -555,12 +554,12 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
int res = 0;
// If the runtime compiler plugin is available, use it.
if (hasCompilerKernel) {
string ptx = compilerKernel.getAs<CudaCompilerKernel>().createModule(src.str(), "-arch=compute_"+gpuArchitecture+" "+options, *this);
// If possible, write the PTX out to a temporary file so we can cache it for later use.
bool wroteCache = false;
try {
ofstream out(outputFile.c_str());
......@@ -574,7 +573,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
}
if (!wroteCache) {
// An error occurred. Possibly we don't have permission to write to the temp directory. Just try to load the module directly.
CHECK_RESULT2(cuModuleLoadDataEx(&module, &ptx[0], 0, NULL, NULL), "Error loading CUDA module");
return module;
}
......@@ -883,7 +882,7 @@ private:
void CudaContext::findMoleculeGroups() {
// The first time this is called, we need to identify all the molecules in the system.
if (moleculeGroups.size() == 0) {
// Add a ForceInfo that makes sure reordering doesn't break virtual sites.
......@@ -966,7 +965,7 @@ void CudaContext::findMoleculeGroups() {
if (!forces[k]->areParticlesIdentical(mol.atoms[i], mol2.atoms[i]))
identical = false;
}
// See if the constraints are identical.
for (int i = 0; i < (int) mol.constraints.size() && identical; i++) {
......@@ -1047,11 +1046,11 @@ void CudaContext::invalidateMolecules() {
}
if (valid)
return;
// The list of which molecules are identical is no longer valid. We need to restore the
// atoms to their original order, rebuild the list of identical molecules, and sort them
// again.
vector<int4> newCellOffsets(numAtoms);
if (useDoublePrecision) {
vector<double4> oldPosq(paddedNumAtoms);
......@@ -1196,6 +1195,8 @@ void CudaContext::reorderAtomsImpl() {
molPos[i].x *= invNumAtoms;
molPos[i].y *= invNumAtoms;
molPos[i].z *= invNumAtoms;
if (molPos[i].x != molPos[i].x)
throw OpenMMException("Particle coordinate is nan");
}
if (nonbonded->getUsePeriodic()) {
// Move each molecule position into the same box.
......@@ -1391,3 +1392,41 @@ void CudaContext::WorkThread::flush() {
pthread_cond_wait(&queueEmptyCondition, &queueLock);
pthread_mutex_unlock(&queueLock);
}
vector<int> CudaContext::getDevicePrecedence() {
int numDevices;
CUdevice thisDevice;
string errorMessage = "Error initializing Context";
vector<pair<pair<int, int>, int> > devices;
CHECK_RESULT(cuDeviceGetCount(&numDevices));
for (int i = 0; i < numDevices; i++) {
CHECK_RESULT(cuDeviceGet(&thisDevice, i));
int major, minor, clock, multiprocessors, speed;
CHECK_RESULT(cuDeviceComputeCapability(&major, &minor, thisDevice));
if (major == 1 && minor < 2)
continue;
if ((useDoublePrecision || useMixedPrecision) && (major+0.1*minor < 1.3))
continue;
CHECK_RESULT(cuDeviceGetAttribute(&clock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, thisDevice));
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, thisDevice));
speed = clock*multiprocessors;
pair<int, int> deviceProperties = std::make_pair(major, speed);
devices.push_back(std::make_pair(deviceProperties, -i));
}
// sort first by compute capability (higher is better), then speed
// (higher is better), and finally device index (lower is better)
std::sort(devices.begin(), devices.end());
std::reverse(devices.begin(), devices.end());
vector<int> precedence;
for (int i = 0; i < static_cast<int>(devices.size()); i++) {
precedence.push_back(-devices[i].second);
}
return precedence;
}
......@@ -109,7 +109,8 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
}
out << ");\n";
out << "APPLY_PERIODIC_TO_DELTA(periodicDistance_delta)\n";
out << tempType << " periodicDistance_rinv = RSQRT(periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z);\n";
out << tempType << " periodicDistance_r2 = periodicDistance_delta.x*periodicDistance_delta.x + periodicDistance_delta.y*periodicDistance_delta.y + periodicDistance_delta.z*periodicDistance_delta.z;\n";
out << tempType << " periodicDistance_rinv = RSQRT(periodicDistance_r2);\n";
for (int j = 0; j < nodes.size(); j++) {
const vector<int>& derivOrder = dynamic_cast<const Operation::Custom*>(&nodes[j]->getOperation())->getDerivOrder();
int argIndex = -1;
......@@ -123,17 +124,17 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
if (argIndex == -1)
out << nodeNames[j] << " = RECIP(periodicDistance_rinv);\n";
else if (argIndex == 0)
out << nodeNames[j] << " = periodicDistance_delta.x*periodicDistance_rinv;\n";
out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.x*periodicDistance_rinv : 0);\n";
else if (argIndex == 1)
out << nodeNames[j] << " = periodicDistance_delta.y*periodicDistance_rinv;\n";
out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.y*periodicDistance_rinv : 0);\n";
else if (argIndex == 2)
out << nodeNames[j] << " = periodicDistance_delta.z*periodicDistance_rinv;\n";
out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? periodicDistance_delta.z*periodicDistance_rinv : 0);\n";
else if (argIndex == 3)
out << nodeNames[j] << " = -periodicDistance_delta.x*periodicDistance_rinv;\n";
out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.x*periodicDistance_rinv : 0);\n";
else if (argIndex == 4)
out << nodeNames[j] << " = -periodicDistance_delta.y*periodicDistance_rinv;\n";
out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.y*periodicDistance_rinv : 0);\n";
else if (argIndex == 5)
out << nodeNames[j] << " = -periodicDistance_delta.z*periodicDistance_rinv;\n";
out << nodeNames[j] << " = (periodicDistance_r2 > 0 ? -periodicDistance_delta.z*periodicDistance_rinv : 0);\n";
}
}
else {
......
......@@ -106,21 +106,21 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), vsiteLocalCoordsAtoms(NULL), vsiteLocalCoordsParams(NULL) {
// Create workspace arrays.
lastStepSize = make_double2(0.0, 0.0);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision()) {
posDelta = CudaArray::create<double4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<double4> deltas(posDelta->getSize(), make_double4(0.0, 0.0, 0.0, 0.0));
posDelta->upload(deltas);
stepSize = CudaArray::create<double2>(context, 1, "stepSize");
vector<double2> step(1, make_double2(0.0, 0.0));
stepSize->upload(step);
stepSize->upload(&lastStepSize);
}
else {
posDelta = CudaArray::create<float4>(context, context.getPaddedNumAtoms(), "posDelta");
vector<float4> deltas(posDelta->getSize(), make_float4(0.0f, 0.0f, 0.0f, 0.0f));
posDelta->upload(deltas);
stepSize = CudaArray::create<float2>(context, 1, "stepSize");
vector<float2> step(1, make_float2(0.0f, 0.0f));
stepSize->upload(step);
float2 lastStepSizeFloat = make_float2(0.0f, 0.0f);
stepSize->upload(&lastStepSizeFloat);
}
// Record the set of constraints and how many constraints each atom is involved in.
......@@ -650,6 +650,29 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete vsiteLocalCoordsParams;
}
void CudaIntegrationUtilities::setNextStepSize(double size) {
if (size != lastStepSize.x || size != lastStepSize.y) {
lastStepSize = make_double2(size, size);
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
stepSize->upload(&lastStepSize);
else {
float2 lastStepSizeFloat = make_float2((float) size, (float) size);
stepSize->upload(&lastStepSizeFloat);
}
}
}
double CudaIntegrationUtilities::getLastStepSize() {
if (context.getUseDoublePrecision() || context.getUseMixedPrecision())
stepSize->download(&lastStepSize);
else {
float2 lastStepSizeFloat;
stepSize->download(&lastStepSizeFloat);
lastStepSize = make_double2(lastStepSizeFloat.x, lastStepSizeFloat.y);
}
return lastStepSize.y;
}
void CudaIntegrationUtilities::applyConstraints(double tol) {
applyConstraints(false, tol);
}
......
......@@ -112,7 +112,7 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
if (includeEnergy) {
CudaArray& energyArray = cu.getEnergyBuffer();
if (cu.getUseDoublePrecision()) {
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double* energy = (double*) cu.getPinnedBuffer();
energyArray.download(energy);
for (int i = 0; i < energyArray.getSize(); i++)
......@@ -1458,16 +1458,24 @@ private:
class CudaCalcNonbondedForceKernel::SyncStreamPostComputation : public CudaContext::ForcePostComputation {
public:
SyncStreamPostComputation(CudaContext& cu, CUevent event, int forceGroup) : cu(cu), event(event), forceGroup(forceGroup) {
SyncStreamPostComputation(CudaContext& cu, CUevent event, CUfunction addEnergyKernel, CudaArray& pmeEnergyBuffer, int forceGroup) : cu(cu), event(event),
addEnergyKernel(addEnergyKernel), pmeEnergyBuffer(pmeEnergyBuffer), forceGroup(forceGroup) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
if ((groups&(1<<forceGroup)) != 0)
cuStreamWaitEvent(cu.getCurrentStream(), event, 0);
if (includeEnergy) {
int bufferSize = pmeEnergyBuffer.getSize();
void* args[] = {&pmeEnergyBuffer.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &bufferSize};
cu.executeKernel(addEnergyKernel, args, bufferSize);
}
return 0.0;
}
private:
CudaContext& cu;
CUevent event;
CUfunction addEnergyKernel;
CudaArray& pmeEnergyBuffer;
int forceGroup;
};
......@@ -1493,6 +1501,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete pmeAtomRange;
if (pmeAtomGridIndex != NULL)
delete pmeAtomGridIndex;
if (pmeEnergyBuffer != NULL)
delete pmeEnergyBuffer;
if (sort != NULL)
delete sort;
if (fft != NULL)
......@@ -1681,6 +1691,9 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
int energyElementSize = (cu.getUseDoublePrecision() || cu.getUseMixedPrecision() ? sizeof(double) : sizeof(float));
pmeEnergyBuffer = new CudaArray(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, energyElementSize, "pmeEnergyBuffer");
cu.clearBuffer(*pmeEnergyBuffer);
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
int cufftVersion;
cufftGetVersion(&cufftVersion);
......@@ -1714,7 +1727,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if (recipForceGroup < 0)
recipForceGroup = force.getForceGroup();
cu.addPreComputation(new SyncStreamPreComputation(cu, pmeStream, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, recipForceGroup));
cu.addPostComputation(new SyncStreamPostComputation(cu, pmeSyncEvent, cu.getKernel(module, "addEnergy"), *pmeEnergyBuffer, recipForceGroup));
}
hasInitializedFFT = true;
......@@ -1889,7 +1902,7 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
}
if (includeEnergy) {
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
void* computeEnergyArgs[] = {&reciprocalPmeGrid->getDevicePointer(), usePmeStream ? &pmeEnergyBuffer->getDevicePointer() : &cu.getEnergyBuffer().getDevicePointer(),
&pmeBsplineModuliX->getDevicePointer(), &pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), recipBoxVectorPointer[0], recipBoxVectorPointer[1], recipBoxVectorPointer[2]};
cu.executeKernel(pmeEvalEnergyKernel, computeEnergyArgs, cu.getNumAtoms());
......@@ -5674,7 +5687,6 @@ void CudaIntegrateVerletStepKernel::initialize(const System& system, const Verle
CUmodule module = cu.createModule(CudaKernelSources::verlet, defines, "");
kernel1 = cu.getKernel(module, "integrateVerletPart1");
kernel2 = cu.getKernel(module, "integrateVerletPart2");
prevStepSize = -1.0;
}
void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIntegrator& integrator) {
......@@ -5683,19 +5695,7 @@ void CudaIntegrateVerletStepKernel::execute(ContextImpl& context, const VerletIn
int numAtoms = cu.getNumAtoms();
int paddedNumAtoms = cu.getPaddedNumAtoms();
double dt = integrator.getStepSize();
if (dt != prevStepSize) {
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
vector<double2> stepSizeVec(1);
stepSizeVec[0] = make_double2(dt, dt);
cu.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
}
else {
vector<float2> stepSizeVec(1);
stepSizeVec[0] = make_float2((float) dt, (float) dt);
cu.getIntegrationUtilities().getStepSize().upload(stepSizeVec);
}
prevStepSize = dt;
}
cu.getIntegrationUtilities().setNextStepSize(dt);
// Call the first integration kernel.
......@@ -5752,6 +5752,7 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
double temperature = integrator.getTemperature();
double friction = integrator.getFriction();
double stepSize = integrator.getStepSize();
cu.getIntegrationUtilities().setNextStepSize(stepSize);
if (temperature != prevTemp || friction != prevFriction || stepSize != prevStepSize) {
// Calculate the integration parameters.
......@@ -5766,8 +5767,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
p[1] = fscale;
p[2] = noisescale;
params->upload(p);
double2 ss = make_double2(0, stepSize);
integration.getStepSize().upload(&ss);
}
else {
vector<float> p(params->getSize());
......@@ -5775,8 +5774,6 @@ void CudaIntegrateLangevinStepKernel::execute(ContextImpl& context, const Langev
p[1] = (float) fscale;
p[2] = (float) noisescale;
params->upload(p);
float2 ss = make_float2(0, (float) stepSize);
integration.getStepSize().upload(&ss);
}
prevTemp = temperature;
prevFriction = friction;
......@@ -5929,20 +5926,13 @@ double CudaIntegrateVariableVerletStepKernel::execute(ContextImpl& context, cons
// Update the time and step count.
double dt, time;
double dt = cu.getIntegrationUtilities().getLastStepSize();
double time = cu.getTime()+dt;
if (useDouble) {
double2 stepSize;
cu.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cu.getTime()+dt;
if (dt == maxStepSize)
time = maxTime; // Avoid round-off error
}
else {
float2 stepSize;
cu.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cu.getTime()+dt;
if (dt == maxStepSizeFloat)
time = maxTime; // Avoid round-off error
}
......@@ -6023,20 +6013,13 @@ double CudaIntegrateVariableLangevinStepKernel::execute(ContextImpl& context, co
// Update the time and step count.
double dt, time;
double dt = cu.getIntegrationUtilities().getLastStepSize();
double time = cu.getTime()+dt;
if (useDouble) {
double2 stepSize;
cu.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cu.getTime()+dt;
if (dt == maxStepSize)
time = maxTime; // Avoid round-off error
}
else {
float2 stepSize;
cu.getIntegrationUtilities().getStepSize().download(&stepSize);
dt = stepSize.y;
time = cu.getTime()+dt;
if (dt == maxStepSizeFloat)
time = maxTime; // Avoid round-off error
}
......@@ -6139,7 +6122,6 @@ void CudaIntegrateCustomStepKernel::initialize(const System& system, const Custo
summedValue = new CudaArray(cu, 1, elementSize, "summedValue");
perDofValues = new CudaParameterSet(cu, integrator.getNumPerDofVariables(), 3*system.getNumParticles(), "perDofVariables", false, cu.getUseDoublePrecision() || cu.getUseMixedPrecision());
cu.addReorderListener(new ReorderListener(cu, *perDofValues, localPerDofValuesFloat, localPerDofValuesDouble, deviceValuesAreCurrent));
prevStepSize = -1.0;
SimTKOpenMMUtilities::setRandomNumberSeed(integrator.getRandomNumberSeed());
}
......@@ -6553,9 +6535,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
}
localValuesAreCurrent = false;
double stepSize = integrator.getStepSize();
if (stepSize != prevStepSize) {
recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex));
}
recordGlobalValue(stepSize, GlobalTarget(DT, dtVariableIndex));
for (int i = 0; i < (int) parameterNames.size(); i++) {
double value = context.getParameter(parameterNames[i]);
if (value != globalValuesDouble[parameterVariableIndex[i]]) {
......@@ -6760,17 +6740,10 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
void CudaIntegrateCustomStepKernel::recordGlobalValue(double value, GlobalTarget target) {
switch (target.type) {
case DT:
if (value != globalValuesDouble[dtVariableIndex])
deviceGlobalsAreCurrent = false;
globalValuesDouble[dtVariableIndex] = value;
deviceGlobalsAreCurrent = false;
if (cu.getUseDoublePrecision() || cu.getUseMixedPrecision()) {
double size[] = {0, value};
cu.getIntegrationUtilities().getStepSize().upload(size);
}
else {
float size[] = {0, (float) value};
cu.getIntegrationUtilities().getStepSize().upload(size);
}
prevStepSize = value;
cu.getIntegrationUtilities().setNextStepSize(value);
break;
case VARIABLE:
case PARAMETER:
......
......@@ -179,7 +179,7 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
}
CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty, const string& hostCompilerProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0), hasInitializedContexts(false) {
bool blocking = (blockingProperty == "true");
vector<string> devices;
size_t searchPos = 0, nextPos;
......@@ -247,8 +247,11 @@ CudaPlatform::PlatformData::~PlatformData() {
}
void CudaPlatform::PlatformData::initializeContexts(const System& system) {
if (hasInitializedContexts)
return;
for (int i = 0; i < (int) contexts.size(); i++)
contexts[i]->initialize();
hasInitializedContexts = true;
}
void CudaPlatform::PlatformData::syncContexts() {
......
......@@ -104,10 +104,10 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
/**
* Compute the forces on groups based on the bonds.
*/
extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, real* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, mixed* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
const int* __restrict__ bondGroups
EXTRA_ARGS) {
real energy = 0;
mixed energy = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) {
COMPUTE_FORCE
}
......
......@@ -13,7 +13,7 @@ typedef struct {
/**
* Compute a force based on pair interactions.
*/
extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
#ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
......@@ -27,7 +27,7 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0;
mixed energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
......
......@@ -2,9 +2,9 @@
* Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
*/
extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq
extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq
PARAMETER_ARGUMENTS) {
real energy = 0;
mixed energy = 0;
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Load the derivatives
......
......@@ -66,12 +66,12 @@ inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
/**
* Compute forces on donors.
*/
extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
extern __shared__ real4 posBuffer[];
real energy = 0;
mixed energy = 0;
real3 f1 = make_real3(0);
real3 f2 = make_real3(0);
real3 f3 = make_real3(0);
......@@ -155,7 +155,7 @@ extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ f
/**
* Compute forces on acceptors.
*/
extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
......
......@@ -78,7 +78,7 @@ __constant__ float globals[NUM_GLOBALS];
* Compute the interaction.
*/
extern "C" __global__ void computeInteraction(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq,
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#ifdef USE_CUTOFF
, const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
......@@ -90,7 +90,7 @@ extern "C" __global__ void computeInteraction(
, int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
#endif
PARAMETER_ARGUMENTS) {
real energy = 0.0f;
mixed energy = 0;
// Loop over particles to be the first one in the set.
......
......@@ -9,14 +9,14 @@ typedef struct {
} AtomData;
extern "C" __global__ void computeInteractionGroups(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
real energy = 0.0f;
mixed energy = 0;
__shared__ AtomData localData[LOCAL_MEMORY_SIZE];
const unsigned int startTile = FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment