Commit 3862202e authored by Justin MacCallum's avatar Justin MacCallum
Browse files

Merge branch 'upstream' into fork

parents e1a4e015 73882ac5
...@@ -181,7 +181,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express ...@@ -181,7 +181,7 @@ void CudaExpressionUtilities::processExpression(stringstream& out, const Express
out << "ASIN(" << getTempName(node.getChildren()[0], temps) << ")"; out << "ASIN(" << getTempName(node.getChildren()[0], temps) << ")";
break; break;
case Operation::ACOS: case Operation::ACOS:
out << "ACSO(" << getTempName(node.getChildren()[0], temps) << ")"; out << "ACOS(" << getTempName(node.getChildren()[0], temps) << ")";
break; break;
case Operation::ATAN: case Operation::ATAN:
out << "ATAN(" << getTempName(node.getChildren()[0], temps) << ")"; out << "ATAN(" << getTempName(node.getChildren()[0], temps) << ")";
......
...@@ -44,8 +44,8 @@ ...@@ -44,8 +44,8 @@
#include "lepton/Operation.h" #include "lepton/Operation.h"
#include "lepton/Parser.h" #include "lepton/Parser.h"
#include "lepton/ParsedExpression.h" #include "lepton/ParsedExpression.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "../src/SimTKUtilities/SimTKOpenMMUtilities.h" #include "SimTKOpenMMUtilities.h"
#include <cmath> #include <cmath>
#include <set> #include <set>
...@@ -84,10 +84,12 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -84,10 +84,12 @@ void CudaCalcForcesAndEnergyKernel::initialize(const System& system) {
void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) { void CudaCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForces, bool includeEnergy, int groups) {
cu.setAsCurrent(); cu.setAsCurrent();
cu.clearAutoclearBuffers();
for (vector<CudaContext::ForcePreComputation*>::iterator iter = cu.getPreComputations().begin(); iter != cu.getPreComputations().end(); ++iter)
(*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
CudaNonbondedUtilities& nb = cu.getNonbondedUtilities(); CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0); bool includeNonbonded = ((groups&(1<<nb.getForceGroup())) != 0);
cu.setComputeForceCount(cu.getComputeForceCount()+1); cu.setComputeForceCount(cu.getComputeForceCount()+1);
cu.clearAutoclearBuffers();
if (includeNonbonded) if (includeNonbonded)
nb.prepareInteractions(); nb.prepareInteractions();
} }
...@@ -96,8 +98,10 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo ...@@ -96,8 +98,10 @@ double CudaCalcForcesAndEnergyKernel::finishComputation(ContextImpl& context, bo
cu.getBondedUtilities().computeInteractions(groups); cu.getBondedUtilities().computeInteractions(groups);
if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0) if ((groups&(1<<cu.getNonbondedUtilities().getForceGroup())) != 0)
cu.getNonbondedUtilities().computeInteractions(); cu.getNonbondedUtilities().computeInteractions();
cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
double sum = 0.0; double sum = 0.0;
for (vector<CudaContext::ForcePostComputation*>::iterator iter = cu.getPostComputations().begin(); iter != cu.getPostComputations().end(); ++iter)
sum += (*iter)->computeForceAndEnergy(includeForces, includeEnergy, groups);
cu.getIntegrationUtilities().distributeForcesFromVirtualSites();
if (includeEnergy) { if (includeEnergy) {
CudaArray& energyArray = cu.getEnergyBuffer(); CudaArray& energyArray = cu.getEnergyBuffer();
if (cu.getUseDoublePrecision()) { if (cu.getUseDoublePrecision()) {
...@@ -1330,6 +1334,58 @@ private: ...@@ -1330,6 +1334,58 @@ private:
const NonbondedForce& force; const NonbondedForce& force;
}; };
class CudaCalcNonbondedForceKernel::PmeIO : public CalcPmeReciprocalForceKernel::IO {
public:
PmeIO(CudaContext& cu, CUfunction addForcesKernel) : cu(cu), addForcesKernel(addForcesKernel), forceTemp(NULL) {
forceTemp = CudaArray::create<float4>(cu, cu.getNumAtoms(), "PmeForce");
}
~PmeIO() {
if (forceTemp != NULL)
delete forceTemp;
}
float* getPosq() {
cu.setAsCurrent();
cu.getPosq().download(posq);
return (float*) &posq[0];
}
void setForce(float* force) {
forceTemp->upload(force);
void* args[] = {&forceTemp->getDevicePointer(), &cu.getForce().getDevicePointer()};
cu.executeKernel(addForcesKernel, args, cu.getNumAtoms());
}
private:
CudaContext& cu;
vector<float4> posq;
CudaArray* forceTemp;
CUfunction addForcesKernel;
};
class CudaCalcNonbondedForceKernel::PmePreComputation : public CudaContext::ForcePreComputation {
public:
PmePreComputation(CudaContext& cu, Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : cu(cu), pme(pme), io(io) {
}
void computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
Vec3 boxSize(cu.getPeriodicBoxSize().x, cu.getPeriodicBoxSize().y, cu.getPeriodicBoxSize().z);
pme.getAs<CalcPmeReciprocalForceKernel>().beginComputation(io, boxSize, includeEnergy);
}
private:
CudaContext& cu;
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
class CudaCalcNonbondedForceKernel::PmePostComputation : public CudaContext::ForcePostComputation {
public:
PmePostComputation(Kernel& pme, CalcPmeReciprocalForceKernel::IO& io) : pme(pme), io(io) {
}
double computeForceAndEnergy(bool includeForces, bool includeEnergy, int groups) {
return pme.getAs<CalcPmeReciprocalForceKernel>().finishComputation(io);
}
private:
Kernel pme;
CalcPmeReciprocalForceKernel::IO& io;
};
CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
cu.setAsCurrent(); cu.setAsCurrent();
if (sigmaEpsilon != NULL) if (sigmaEpsilon != NULL)
...@@ -1354,6 +1410,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1354,6 +1410,8 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete pmeAtomGridIndex; delete pmeAtomGridIndex;
if (sort != NULL) if (sort != NULL)
delete sort; delete sort;
if (pmeio != NULL)
delete pmeio;
if (hasInitializedFFT) { if (hasInitializedFFT) {
cufftDestroy(fftForward); cufftDestroy(fftForward);
cufftDestroy(fftBackward); cufftDestroy(fftBackward);
...@@ -1457,7 +1515,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1457,7 +1515,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
else else
dispersionCoefficient = 0.0; dispersionCoefficient = 0.0;
alpha = 0; alpha = 0;
if (force.getNonbondedMethod() == NonbondedForce::Ewald) { if (force.getNonbondedMethod() == NonbondedForce::Ewald && cu.getContextIndex() == 0) {
// Compute the Ewald parameters. // Compute the Ewald parameters.
int kmaxx, kmaxy, kmaxz; int kmaxx, kmaxy, kmaxz;
...@@ -1465,7 +1523,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1465,7 +1523,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
defines["EWALD_ALPHA"] = cu.doubleToString(alpha); defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI)); defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1"; defines["USE_EWALD"] = "1";
ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0); ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
// Create the reciprocal space kernels. // Create the reciprocal space kernels.
...@@ -1484,7 +1542,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1484,7 +1542,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2)); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double2) : sizeof(float2));
cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums"); cosSinSums = new CudaArray(cu, (2*kmaxx-1)*(2*kmaxy-1)*(2*kmaxz-1), elementSize, "cosSinSums");
} }
else if (force.getNonbondedMethod() == NonbondedForce::PME) { else if (force.getNonbondedMethod() == NonbondedForce::PME && cu.getContextIndex() == 0) {
// Compute the PME parameters. // Compute the PME parameters.
int gridSizeX, gridSizeY, gridSizeZ; int gridSizeX, gridSizeY, gridSizeZ;
...@@ -1497,7 +1555,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1497,7 +1555,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
defines["EWALD_ALPHA"] = cu.doubleToString(alpha); defines["EWALD_ALPHA"] = cu.doubleToString(alpha);
defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI)); defines["TWO_OVER_SQRT_PI"] = cu.doubleToString(2.0/sqrt(M_PI));
defines["USE_EWALD"] = "1"; defines["USE_EWALD"] = "1";
ewaldSelfEnergy = (cu.getContextIndex() == 0 ? -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI) : 0.0); ewaldSelfEnergy = -ONE_4PI_EPS0*alpha*sumSquaredCharges/sqrt(M_PI);
pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder); pmeDefines["PME_ORDER"] = cu.intToString(PmeOrder);
pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles); pmeDefines["NUM_ATOMS"] = cu.intToString(numParticles);
pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); pmeDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
...@@ -1510,111 +1568,128 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1510,111 +1568,128 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
pmeDefines["USE_DOUBLE_PRECISION"] = "1"; pmeDefines["USE_DOUBLE_PRECISION"] = "1";
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex"); if (cu.getPlatformData().useCpuPme) {
pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge"); // Create the CPU PME kernel.
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce"); try {
pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy"); cpuPme = getPlatform().createKernel(CalcPmeReciprocalForceKernel::Name(), *cu.getPlatformData().context);
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge"); cpuPme.getAs<CalcPmeReciprocalForceKernel>().initialize(gridSizeX, gridSizeY, gridSizeZ, numParticles, alpha);
cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1); CUfunction addForcesKernel = cu.getKernel(module, "addForces");
cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1); pmeio = new PmeIO(cu, addForcesKernel);
cu.addPreComputation(new PmePreComputation(cu, cpuPme, *pmeio));
// Create required data structures. cu.addPostComputation(new PmePostComputation(cpuPme, *pmeio));
}
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float)); catch (OpenMMException& ex) {
// The CPU PME plugin isn't available.
directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid"); }
reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid"); }
if (pmeio == NULL) {
cu.addAutoclearBuffer(*directPmeGrid); pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex"); cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms()); cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C); // Create required data structures.
if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result)); int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
if (result != CUFFT_SUCCESS) directPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, cu.getComputeCapability() >= 2.0 ? elementSize : sizeof(long long), "originalPmeGrid");
throw OpenMMException("Error initializing FFT: "+cu.intToString(result)); reciprocalPmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*(gridSizeZ/2+1), 2*elementSize, "reciprocalPmeGrid");
cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE); cu.addAutoclearBuffer(*directPmeGrid);
cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
cufftResult result = cufftPlan3d(&fftForward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_D2Z : CUFFT_R2C);
if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
result = cufftPlan3d(&fftBackward, gridSizeX, gridSizeY, gridSizeZ, cu.getUseDoublePrecision() ? CUFFT_Z2D : CUFFT_C2R);
if (result != CUFFT_SUCCESS)
throw OpenMMException("Error initializing FFT: "+cu.intToString(result));
cufftSetCompatibilityMode(fftForward, CUFFT_COMPATIBILITY_NATIVE);
cufftSetCompatibilityMode(fftBackward, CUFFT_COMPATIBILITY_NATIVE);
hasInitializedFFT = true;
// Initialize the b-spline moduli.
int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ);
vector<double> data(PmeOrder);
vector<double> ddata(PmeOrder);
vector<double> bsplines_data(maxSize);
data[PmeOrder-1] = 0.0;
data[1] = 0.0;
data[0] = 1.0;
for (int i = 3; i < PmeOrder; i++) {
double div = 1.0/(i-1.0);
data[i-1] = 0.0;
for (int j = 1; j < (i-1); j++)
data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
data[0] = div*data[0];
}
hasInitializedFFT = true; // Differentiate.
// Initialize the b-spline moduli. ddata[0] = -data[0];
for (int i = 1; i < PmeOrder; i++)
int maxSize = max(max(gridSizeX, gridSizeY), gridSizeZ); ddata[i] = data[i-1]-data[i];
vector<double> data(PmeOrder); double div = 1.0/(PmeOrder-1);
vector<double> ddata(PmeOrder); data[PmeOrder-1] = 0.0;
vector<double> bsplines_data(maxSize); for (int i = 1; i < (PmeOrder-1); i++)
data[PmeOrder-1] = 0.0; data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]);
data[1] = 0.0;
data[0] = 1.0;
for (int i = 3; i < PmeOrder; i++) {
double div = 1.0/(i-1.0);
data[i-1] = 0.0;
for (int j = 1; j < (i-1); j++)
data[i-j-1] = div*(j*data[i-j-2]+(i-j)*data[i-j-1]);
data[0] = div*data[0]; data[0] = div*data[0];
} for (int i = 0; i < maxSize; i++)
bsplines_data[i] = 0.0;
// Differentiate. for (int i = 1; i <= PmeOrder; i++)
bsplines_data[i] = data[i-1];
ddata[0] = -data[0];
for (int i = 1; i < PmeOrder; i++) // Evaluate the actual bspline moduli for X/Y/Z.
ddata[i] = data[i-1]-data[i];
double div = 1.0/(PmeOrder-1); for(int dim = 0; dim < 3; dim++) {
data[PmeOrder-1] = 0.0; int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
for (int i = 1; i < (PmeOrder-1); i++) vector<double> moduli(ndata);
data[PmeOrder-i-1] = div*(i*data[PmeOrder-i-2]+(PmeOrder-i)*data[PmeOrder-i-1]); for (int i = 0; i < ndata; i++) {
data[0] = div*data[0]; double sc = 0.0;
for (int i = 0; i < maxSize; i++) double ss = 0.0;
bsplines_data[i] = 0.0; for (int j = 0; j < ndata; j++) {
for (int i = 1; i <= PmeOrder; i++) double arg = (2.0*M_PI*i*j)/ndata;
bsplines_data[i] = data[i-1]; sc += bsplines_data[j]*cos(arg);
ss += bsplines_data[j]*sin(arg);
// Evaluate the actual bspline moduli for X/Y/Z. }
moduli[i] = sc*sc+ss*ss;
for(int dim = 0; dim < 3; dim++) {
int ndata = (dim == 0 ? gridSizeX : dim == 1 ? gridSizeY : gridSizeZ);
vector<double> moduli(ndata);
for (int i = 0; i < ndata; i++) {
double sc = 0.0;
double ss = 0.0;
for (int j = 0; j < ndata; j++) {
double arg = (2.0*M_PI*i*j)/ndata;
sc += bsplines_data[j]*cos(arg);
ss += bsplines_data[j]*sin(arg);
} }
moduli[i] = sc*sc+ss*ss;
}
for (int i = 0; i < ndata; i++)
if (moduli[i] < 1.0e-7)
moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
if (cu.getUseDoublePrecision()) {
if (dim == 0)
pmeBsplineModuliX->upload(moduli);
else if (dim == 1)
pmeBsplineModuliY->upload(moduli);
else
pmeBsplineModuliZ->upload(moduli);
}
else {
vector<float> modulif(ndata);
for (int i = 0; i < ndata; i++) for (int i = 0; i < ndata; i++)
modulif[i] = (float) moduli[i]; if (moduli[i] < 1.0e-7)
if (dim == 0) moduli[i] = (moduli[i-1]+moduli[i+1])*0.5;
pmeBsplineModuliX->upload(modulif); if (cu.getUseDoublePrecision()) {
else if (dim == 1) if (dim == 0)
pmeBsplineModuliY->upload(modulif); pmeBsplineModuliX->upload(moduli);
else else if (dim == 1)
pmeBsplineModuliZ->upload(modulif); pmeBsplineModuliY->upload(moduli);
else
pmeBsplineModuliZ->upload(moduli);
}
else {
vector<float> modulif(ndata);
for (int i = 0; i < ndata; i++)
modulif[i] = (float) moduli[i];
if (dim == 0)
pmeBsplineModuliX->upload(modulif);
else if (dim == 1)
pmeBsplineModuliY->upload(modulif);
else
pmeBsplineModuliZ->upload(modulif);
}
} }
} }
} }
...@@ -1654,13 +1729,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1654,13 +1729,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
} }
double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) { double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy, bool includeDirect, bool includeReciprocal) {
if (cosSinSums != NULL && cu.getContextIndex() == 0 && includeReciprocal) { if (cosSinSums != NULL && includeReciprocal) {
void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()}; void* sumsArgs[] = {&cu.getEnergyBuffer().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums->getSize()); cu.executeKernel(ewaldSumsKernel, sumsArgs, cosSinSums->getSize());
void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()}; void* forcesArgs[] = {&cu.getForce().getDevicePointer(), &cu.getPosq().getDevicePointer(), &cosSinSums->getDevicePointer(), cu.getPeriodicBoxSizePointer()};
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms()); cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
} }
if (directPmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) { if (directPmeGrid != NULL && includeReciprocal) {
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms()); cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
...@@ -4677,22 +4752,6 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -4677,22 +4752,6 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
defines["SUM_BUFFER_SIZE"] = "0"; defines["SUM_BUFFER_SIZE"] = "0";
defines["SUM_OUTPUT_INDEX"] = "0"; defines["SUM_OUTPUT_INDEX"] = "0";
// Initialize the random number generator.
uniformRandoms = CudaArray::create<float4>(cu, cu.getNumAtoms(), "uniformRandoms");
randomSeed = CudaArray::create<int4>(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
vector<int4> seed(randomSeed->getSize());
unsigned int r = integrator.getRandomNumberSeed()+1;
for (int i = 0; i < randomSeed->getSize(); i++) {
seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
}
randomSeed->upload(seed);
CUmodule randomProgram = cu.createModule(CudaKernelSources::customIntegrator, defines);
randomKernel = cu.getKernel(randomProgram, "generateRandomNumbers");
// Build a list of all variables that affect the forces, so we can tell which // Build a list of all variables that affect the forces, so we can tell which
// steps invalidate them. // steps invalidate them.
...@@ -4783,10 +4842,10 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -4783,10 +4842,10 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
for (int step = 1; step < numSteps; step++) { for (int step = 1; step < numSteps; step++) {
if (needsForces[step] || needsEnergy[step]) if (needsForces[step] || needsEnergy[step])
continue; continue;
if (stepType[step-1] == CustomIntegrator::ComputeGlobal && stepType[step] == CustomIntegrator::ComputeGlobal) if (stepType[step-1] == CustomIntegrator::ComputeGlobal && stepType[step] == CustomIntegrator::ComputeGlobal &&
!usesVariable(expression[step], "uniform") && !usesVariable(expression[step], "gaussian"))
merged[step] = true; merged[step] = true;
if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof && if (stepType[step-1] == CustomIntegrator::ComputePerDof && stepType[step] == CustomIntegrator::ComputePerDof)
!usesVariable(expression[step], "uniform"))
merged[step] = true; merged[step] = true;
} }
...@@ -4805,7 +4864,13 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -4805,7 +4864,13 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
} }
int numGaussian = 0, numUniform = 0; int numGaussian = 0, numUniform = 0;
for (int j = step; j < numSteps && (j == step || merged[j]); j++) { for (int j = step; j < numSteps && (j == step || merged[j]); j++) {
numGaussian += numAtoms*usesVariable(expression[j], "gaussian");
numUniform += numAtoms*usesVariable(expression[j], "uniform");
compute << "{\n"; compute << "{\n";
if (numGaussian > 0)
compute << "float4 gaussian = gaussianValues[gaussianIndex+index];\n";
if (numUniform > 0)
compute << "float4 uniform = uniformValues[uniformIndex+index];\n";
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j], i, integrator, forceName[j], energyName[j]); compute << createPerDofComputation(stepType[j] == CustomIntegrator::ComputePerDof ? variable[j] : "", expression[j], i, integrator, forceName[j], energyName[j]);
if (variable[j] == "x") { if (variable[j] == "x") {
...@@ -4824,9 +4889,11 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -4824,9 +4889,11 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
compute << "perDofValues"<<cu.intToString(i+1)<<"[3*index+2] = perDofz"<<cu.intToString(i+1)<<";\n"; compute << "perDofValues"<<cu.intToString(i+1)<<"[3*index+2] = perDofz"<<cu.intToString(i+1)<<";\n";
} }
} }
if (numGaussian > 0)
compute << "gaussianIndex += NUM_ATOMS;\n";
if (numUniform > 0)
compute << "uniformIndex += NUM_ATOMS;\n";
compute << "}\n"; compute << "}\n";
numGaussian += numAtoms*usesVariable(expression[j], "gaussian");
numUniform += numAtoms*usesVariable(expression[j], "uniform");
} }
map<string, string> replacements; map<string, string> replacements;
replacements["COMPUTE_STEP"] = compute.str(); replacements["COMPUTE_STEP"] = compute.str();
...@@ -4856,9 +4923,9 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -4856,9 +4923,9 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
args1.push_back(&globalValues->getDevicePointer()); args1.push_back(&globalValues->getDevicePointer());
args1.push_back(&contextParameterValues->getDevicePointer()); args1.push_back(&contextParameterValues->getDevicePointer());
args1.push_back(&sumBuffer->getDevicePointer()); args1.push_back(&sumBuffer->getDevicePointer());
args1.push_back(&integration.getRandom().getDevicePointer());
args1.push_back(NULL); args1.push_back(NULL);
args1.push_back(&uniformRandoms->getDevicePointer()); args1.push_back(NULL);
args1.push_back(NULL);
args1.push_back(&potentialEnergy->getDevicePointer()); args1.push_back(&potentialEnergy->getDevicePointer());
for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++) for (int i = 0; i < (int) perDofValues->getBuffers().size(); i++)
args1.push_back(&perDofValues->getBuffers()[i].getMemory()); args1.push_back(&perDofValues->getBuffers()[i].getMemory());
...@@ -4925,6 +4992,25 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -4925,6 +4992,25 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
} }
} }
// Initialize the random number generator.
int maxUniformRandoms = 1;
for (int i = 0; i < (int) requiredUniform.size(); i++)
maxUniformRandoms = max(maxUniformRandoms, requiredUniform[i]);
uniformRandoms = CudaArray::create<float4>(cu, maxUniformRandoms, "uniformRandoms");
randomSeed = CudaArray::create<int4>(cu, cu.getNumThreadBlocks()*CudaContext::ThreadBlockSize, "randomSeed");
vector<int4> seed(randomSeed->getSize());
unsigned int r = integrator.getRandomNumberSeed()+1;
for (int i = 0; i < randomSeed->getSize(); i++) {
seed[i].x = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].y = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].z = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
seed[i].w = r = (1664525*r + 1013904223) & 0xFFFFFFFF;
}
randomSeed->upload(seed);
CUmodule randomProgram = cu.createModule(CudaKernelSources::customIntegrator, defines);
randomKernel = cu.getKernel(randomProgram, "generateRandomNumbers");
// Create the kernel for summing the potential energy. // Create the kernel for summing the potential energy.
defines["SUM_OUTPUT_INDEX"] = "0"; defines["SUM_OUTPUT_INDEX"] = "0";
...@@ -4967,7 +5053,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context, ...@@ -4967,7 +5053,7 @@ void CudaIntegrateCustomStepKernel::prepareForComputation(ContextImpl& context,
kineticEnergyArgs.push_back(&globalValues->getDevicePointer()); kineticEnergyArgs.push_back(&globalValues->getDevicePointer());
kineticEnergyArgs.push_back(&contextParameterValues->getDevicePointer()); kineticEnergyArgs.push_back(&contextParameterValues->getDevicePointer());
kineticEnergyArgs.push_back(&sumBuffer->getDevicePointer()); kineticEnergyArgs.push_back(&sumBuffer->getDevicePointer());
kineticEnergyArgs.push_back(&integration.getRandom().getDevicePointer()); kineticEnergyArgs.push_back(NULL);
kineticEnergyArgs.push_back(NULL); kineticEnergyArgs.push_back(NULL);
kineticEnergyArgs.push_back(&uniformRandoms->getDevicePointer()); kineticEnergyArgs.push_back(&uniformRandoms->getDevicePointer());
kineticEnergyArgs.push_back(&potentialEnergy->getDevicePointer()); kineticEnergyArgs.push_back(&potentialEnergy->getDevicePointer());
...@@ -5037,7 +5123,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -5037,7 +5123,8 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
// Loop over computation steps in the integrator and execute them. // Loop over computation steps in the integrator and execute them.
void* randomArgs[] = {&uniformRandoms->getDevicePointer(), &randomSeed->getDevicePointer()}; int maxUniformRandoms = uniformRandoms->getSize();
void* randomArgs[] = {&maxUniformRandoms, &uniformRandoms->getDevicePointer(), &randomSeed->getDevicePointer()};
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
for (int i = 0; i < numSteps; i++) { for (int i = 0; i < numSteps; i++) {
int lastForceGroups = context.getLastForceGroups(); int lastForceGroups = context.getLastForceGroups();
...@@ -5086,7 +5173,9 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -5086,7 +5173,9 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) { if (stepType[i] == CustomIntegrator::ComputePerDof && !merged[i]) {
int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]); int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]);
kernelArgs[i][0][1] = &posCorrection; kernelArgs[i][0][1] = &posCorrection;
kernelArgs[i][0][9] = &integration.getRandom().getDevicePointer();
kernelArgs[i][0][10] = &randomIndex; kernelArgs[i][0][10] = &randomIndex;
kernelArgs[i][0][11] = &uniformRandoms->getDevicePointer();
if (requiredUniform[i] > 0) if (requiredUniform[i] > 0)
cu.executeKernel(randomKernel, &randomArgs[0], numAtoms); cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms); cu.executeKernel(kernels[i][0], &kernelArgs[i][0][0], numAtoms);
...@@ -5101,7 +5190,9 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat ...@@ -5101,7 +5190,9 @@ void CudaIntegrateCustomStepKernel::execute(ContextImpl& context, CustomIntegrat
else if (stepType[i] == CustomIntegrator::ComputeSum) { else if (stepType[i] == CustomIntegrator::ComputeSum) {
int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]); int randomIndex = integration.prepareRandomNumbers(requiredGaussian[i]);
kernelArgs[i][0][1] = &posCorrection; kernelArgs[i][0][1] = &posCorrection;
kernelArgs[i][0][9] = &integration.getRandom().getDevicePointer();
kernelArgs[i][0][10] = &randomIndex; kernelArgs[i][0][10] = &randomIndex;
kernelArgs[i][0][11] = &uniformRandoms->getDevicePointer();
if (requiredUniform[i] > 0) if (requiredUniform[i] > 0)
cu.executeKernel(randomKernel, &randomArgs[0], numAtoms); cu.executeKernel(randomKernel, &randomArgs[0], numAtoms);
cu.clearBuffer(*sumBuffer); cu.clearBuffer(*sumBuffer);
...@@ -5152,6 +5243,7 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context, ...@@ -5152,6 +5243,7 @@ double CudaIntegrateCustomStepKernel::computeKineticEnergy(ContextImpl& context,
CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0); CUdeviceptr posCorrection = (cu.getUseMixedPrecision() ? cu.getPosqCorrection().getDevicePointer() : 0);
int randomIndex = 0; int randomIndex = 0;
kineticEnergyArgs[1] = &posCorrection; kineticEnergyArgs[1] = &posCorrection;
kineticEnergyArgs[9] = &cu.getIntegrationUtilities().getRandom().getDevicePointer();
kineticEnergyArgs[10] = &randomIndex; kineticEnergyArgs[10] = &randomIndex;
cu.clearBuffer(*sumBuffer); cu.clearBuffer(*sumBuffer);
cu.executeKernel(kineticEnergyKernel, &kineticEnergyArgs[0], cu.getNumAtoms()); cu.executeKernel(kineticEnergyKernel, &kineticEnergyArgs[0], cu.getNumAtoms());
......
...@@ -457,7 +457,9 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -457,7 +457,9 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
} }
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str(); replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
bool useShuffle = (context.getComputeCapability() >= 3.0); int cudaVersion;
cuDriverGetVersion(&cudaVersion);
bool useShuffle = (context.getComputeCapability() >= 3.0 && cudaVersion >= 5050);
// Part 1. Defines for on diagonal exclusion tiles // Part 1. Defines for on diagonal exclusion tiles
stringstream loadLocal1; stringstream loadLocal1;
......
...@@ -87,12 +87,14 @@ CudaPlatform::CudaPlatform() { ...@@ -87,12 +87,14 @@ CudaPlatform::CudaPlatform() {
platformProperties.push_back(CudaDeviceName()); platformProperties.push_back(CudaDeviceName());
platformProperties.push_back(CudaUseBlockingSync()); platformProperties.push_back(CudaUseBlockingSync());
platformProperties.push_back(CudaPrecision()); platformProperties.push_back(CudaPrecision());
platformProperties.push_back(CudaUseCpuPme());
platformProperties.push_back(CudaCompiler()); platformProperties.push_back(CudaCompiler());
platformProperties.push_back(CudaTempDirectory()); platformProperties.push_back(CudaTempDirectory());
setPropertyDefaultValue(CudaDeviceIndex(), ""); setPropertyDefaultValue(CudaDeviceIndex(), "");
setPropertyDefaultValue(CudaDeviceName(), ""); setPropertyDefaultValue(CudaDeviceName(), "");
setPropertyDefaultValue(CudaUseBlockingSync(), "true"); setPropertyDefaultValue(CudaUseBlockingSync(), "true");
setPropertyDefaultValue(CudaPrecision(), "single"); setPropertyDefaultValue(CudaPrecision(), "single");
setPropertyDefaultValue(CudaUseCpuPme(), "false");
#ifdef _MSC_VER #ifdef _MSC_VER
char* bindir = getenv("CUDA_BIN_PATH"); char* bindir = getenv("CUDA_BIN_PATH");
string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe"); string nvcc = (bindir == NULL ? "nvcc.exe" : string(bindir)+"\\nvcc.exe");
...@@ -141,13 +143,20 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string ...@@ -141,13 +143,20 @@ void CudaPlatform::contextCreated(ContextImpl& context, const map<string, string
getPropertyDefaultValue(CudaUseBlockingSync()) : properties.find(CudaUseBlockingSync())->second); getPropertyDefaultValue(CudaUseBlockingSync()) : properties.find(CudaUseBlockingSync())->second);
string precisionPropValue = (properties.find(CudaPrecision()) == properties.end() ? string precisionPropValue = (properties.find(CudaPrecision()) == properties.end() ?
getPropertyDefaultValue(CudaPrecision()) : properties.find(CudaPrecision())->second); getPropertyDefaultValue(CudaPrecision()) : properties.find(CudaPrecision())->second);
string cpuPmePropValue = (properties.find(CudaUseCpuPme()) == properties.end() ?
getPropertyDefaultValue(CudaUseCpuPme()) : properties.find(CudaUseCpuPme())->second);
const string& compilerPropValue = (properties.find(CudaCompiler()) == properties.end() ? const string& compilerPropValue = (properties.find(CudaCompiler()) == properties.end() ?
getPropertyDefaultValue(CudaCompiler()) : properties.find(CudaCompiler())->second); getPropertyDefaultValue(CudaCompiler()) : properties.find(CudaCompiler())->second);
const string& tempPropValue = (properties.find(CudaTempDirectory()) == properties.end() ? const string& tempPropValue = (properties.find(CudaTempDirectory()) == properties.end() ?
getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second); getPropertyDefaultValue(CudaTempDirectory()) : properties.find(CudaTempDirectory())->second);
transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower); transform(blockingPropValue.begin(), blockingPropValue.end(), blockingPropValue.begin(), ::tolower);
transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower); transform(precisionPropValue.begin(), precisionPropValue.end(), precisionPropValue.begin(), ::tolower);
context.setPlatformData(new PlatformData(context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, compilerPropValue, tempPropValue)); transform(cpuPmePropValue.begin(), cpuPmePropValue.end(), cpuPmePropValue.begin(), ::tolower);
vector<string> pmeKernelName;
pmeKernelName.push_back(CalcPmeReciprocalForceKernel::Name());
if (!supportsKernels(pmeKernelName))
cpuPmePropValue = "false";
context.setPlatformData(new PlatformData(&context, context.getSystem(), devicePropValue, blockingPropValue, precisionPropValue, cpuPmePropValue, compilerPropValue, tempPropValue));
} }
void CudaPlatform::contextDestroyed(ContextImpl& context) const { void CudaPlatform::contextDestroyed(ContextImpl& context) const {
...@@ -155,8 +164,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const { ...@@ -155,8 +164,8 @@ void CudaPlatform::contextDestroyed(ContextImpl& context) const {
delete data; delete data;
} }
CudaPlatform::PlatformData::PlatformData(const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty, CudaPlatform::PlatformData::PlatformData(ContextImpl* context, const System& system, const string& deviceIndexProperty, const string& blockingProperty, const string& precisionProperty,
const string& compilerProperty, const string& tempProperty) : removeCM(false), stepCount(0), computeForceCount(0), time(0.0) { const string& cpuPmeProperty, const string& compilerProperty, const string& tempProperty) : context(context), removeCM(false), stepCount(0), computeForceCount(0), time(0.0) {
bool blocking = (blockingProperty == "true"); bool blocking = (blockingProperty == "true");
vector<string> devices; vector<string> devices;
size_t searchPos = 0, nextPos; size_t searchPos = 0, nextPos;
...@@ -185,10 +194,12 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev ...@@ -185,10 +194,12 @@ CudaPlatform::PlatformData::PlatformData(const System& system, const string& dev
CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name"); CHECK_RESULT(cuDeviceGetName(name, 1000, contexts[i]->getDevice()), "Error querying device name");
deviceName << name; deviceName << name;
} }
useCpuPme = (cpuPmeProperty == "true" && !contexts[0]->getUseDoublePrecision());
propertyValues[CudaPlatform::CudaDeviceIndex()] = deviceIndex.str(); propertyValues[CudaPlatform::CudaDeviceIndex()] = deviceIndex.str();
propertyValues[CudaPlatform::CudaDeviceName()] = deviceName.str(); propertyValues[CudaPlatform::CudaDeviceName()] = deviceName.str();
propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false"; propertyValues[CudaPlatform::CudaUseBlockingSync()] = blocking ? "true" : "false";
propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty; propertyValues[CudaPlatform::CudaPrecision()] = precisionProperty;
propertyValues[CudaPlatform::CudaUseCpuPme()] = useCpuPme ? "true" : "false";
propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty; propertyValues[CudaPlatform::CudaCompiler()] = compilerProperty;
propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty; propertyValues[CudaPlatform::CudaTempDirectory()] = tempProperty;
contextEnergy.resize(contexts.size()); contextEnergy.resize(contexts.size());
......
...@@ -52,10 +52,10 @@ extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4* ...@@ -52,10 +52,10 @@ extern "C" __global__ void applyPositionDeltas(real4* __restrict__ posq, real4*
} }
} }
extern "C" __global__ void generateRandomNumbers(float4* __restrict__ random, uint4* __restrict__ seed) { extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restrict__ random, uint4* __restrict__ seed) {
uint4 state = seed[blockIdx.x*blockDim.x+threadIdx.x]; uint4 state = seed[blockIdx.x*blockDim.x+threadIdx.x];
unsigned int carry = 0; unsigned int carry = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numValues; index += blockDim.x*gridDim.x) {
// Generate three uniform random numbers. // Generate three uniform random numbers.
state.x = state.x * 69069 + 1; state.x = state.x * 69069 + 1;
......
...@@ -34,11 +34,10 @@ inline __device__ mixed4 convertFromDouble4(double4 a) { ...@@ -34,11 +34,10 @@ inline __device__ mixed4 convertFromDouble4(double4 a) {
extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta, extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta,
mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed2* __restrict__ dt, const mixed* __restrict__ globals, mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed2* __restrict__ dt, const mixed* __restrict__ globals,
const mixed* __restrict__ params, mixed* __restrict__ sum, const float4* __restrict__ gaussianValues, const mixed* __restrict__ params, mixed* __restrict__ sum, const float4* __restrict__ gaussianValues,
unsigned int randomIndex, const float4* __restrict__ uniformValues, const real* __restrict__ energy unsigned int gaussianBaseIndex, const float4* __restrict__ uniformValues, const real* __restrict__ energy
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
mixed stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
randomIndex += index;
const double forceScale = 1.0/0xFFFFFFFF; const double forceScale = 1.0/0xFFFFFFFF;
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
#ifdef LOAD_POS_AS_DELTA #ifdef LOAD_POS_AS_DELTA
...@@ -50,11 +49,10 @@ extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __rest ...@@ -50,11 +49,10 @@ extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __rest
double4 f = make_double4(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2], 0.0); double4 f = make_double4(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2], 0.0);
double mass = 1.0/velocity.w; double mass = 1.0/velocity.w;
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
float4 gaussian = gaussianValues[randomIndex]; int gaussianIndex = gaussianBaseIndex;
float4 uniform = uniformValues[index]; int uniformIndex = 0;
COMPUTE_STEP COMPUTE_STEP
} }
randomIndex += blockDim.x*gridDim.x;
index += blockDim.x*gridDim.x; index += blockDim.x*gridDim.x;
} }
} }
...@@ -364,9 +364,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -364,9 +364,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd; mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd; mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
mixed axlng = SQRT(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd); mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
mixed aylng = SQRT(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd); mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
mixed azlng = SQRT(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd); mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
mixed trns11 = xaksXd / axlng; mixed trns11 = xaksXd / axlng;
mixed trns21 = yaksXd / axlng; mixed trns21 = yaksXd / axlng;
mixed trns31 = zaksXd / axlng; mixed trns31 = zaksXd / axlng;
...@@ -392,13 +392,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -392,13 +392,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
// --- Step2 A2' --- // --- Step2 A2' ---
float rc = 0.5f*params.y; float rc = 0.5f*params.y;
mixed rb = SQRT(params.x*params.x-rc*rc); mixed rb = sqrt(params.x*params.x-rc*rc);
mixed ra = rb*(m1+m2)*invTotalMass; mixed ra = rb*(m1+m2)*invTotalMass;
rb -= ra; rb -= ra;
mixed sinphi = za1d/ra; mixed sinphi = za1d/ra;
mixed cosphi = SQRT(1-sinphi*sinphi); mixed cosphi = sqrt(1-sinphi*sinphi);
mixed sinpsi = (zb1d-zc1d) / (2*rc*cosphi); mixed sinpsi = (zb1d-zc1d) / (2*rc*cosphi);
mixed cospsi = SQRT(1-sinpsi*sinpsi); mixed cospsi = sqrt(1-sinpsi*sinpsi);
mixed ya2d = ra*cosphi; mixed ya2d = ra*cosphi;
mixed xb2d = - rc*cospsi; mixed xb2d = - rc*cospsi;
...@@ -406,7 +406,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -406,7 +406,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi; mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
mixed xb2d2 = xb2d*xb2d; mixed xb2d2 = xb2d*xb2d;
mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d); mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
mixed deltx = 2.0f*xb2d + SQRT(4.0f*xb2d2 - hh2 + params.y*params.y); mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
xb2d -= deltx*0.5f; xb2d -= deltx*0.5f;
// --- Step3 al,be,ga --- // --- Step3 al,be,ga ---
...@@ -416,11 +416,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -416,11 +416,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d; mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
mixed al2be2 = alpha*alpha + beta*beta; mixed al2be2 = alpha*alpha + beta*beta;
mixed sintheta = (alpha*gamma - beta*SQRT(al2be2 - gamma*gamma)) / al2be2; mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
// --- Step4 A3' --- // --- Step4 A3' ---
mixed costheta = SQRT(1-sintheta*sintheta); mixed costheta = sqrt(1-sintheta*sintheta);
mixed xa3d = - ya2d*sintheta; mixed xa3d = - ya2d*sintheta;
mixed ya3d = ya2d*costheta; mixed ya3d = ya2d*costheta;
mixed za3d = za1d; mixed za3d = za1d;
......
...@@ -266,8 +266,18 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __ ...@@ -266,8 +266,18 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
} }
} }
real q = pos.w*EPSILON_FACTOR; real q = pos.w*EPSILON_FACTOR;
forceBuffers[atom] += static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0x100000000)); forceBuffers[atom] += static_cast<unsigned long long>((long long) (-q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x*0x100000000));
forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0x100000000)); forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y*0x100000000));
forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000)); forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (-q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z*0x100000000));
}
}
extern "C" __global__
void addForces(const real4* __restrict__ forces, unsigned long long* __restrict__ forceBuffers) {
for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
real4 f = forces[atom];
forceBuffers[atom] += static_cast<unsigned long long>((long long) (f.x*0x100000000));
forceBuffers[atom+PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.y*0x100000000));
forceBuffers[atom+2*PADDED_NUM_ATOMS] += static_cast<unsigned long long>((long long) (f.z*0x100000000));
} }
} }
...@@ -40,7 +40,7 @@ ...@@ -40,7 +40,7 @@
#include "openmm/NonbondedForce.h" #include "openmm/NonbondedForce.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "openmm/VerletIntegrator.h" #include "openmm/VerletIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
...@@ -43,7 +43,7 @@ ...@@ -43,7 +43,7 @@
#include "openmm/NonbondedForce.h" #include "openmm/NonbondedForce.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "openmm/BrownianIntegrator.h" #include "openmm/BrownianIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
...@@ -40,7 +40,7 @@ ...@@ -40,7 +40,7 @@
#include "openmm/PeriodicTorsionForce.h" #include "openmm/PeriodicTorsionForce.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "openmm/VerletIntegrator.h" #include "openmm/VerletIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
...@@ -42,7 +42,7 @@ ...@@ -42,7 +42,7 @@
#include "openmm/System.h" #include "openmm/System.h"
#include "openmm/LangevinIntegrator.h" #include "openmm/LangevinIntegrator.h"
#include "openmm/VerletIntegrator.h" #include "openmm/VerletIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
#include "openmm/CustomBondForce.h" #include "openmm/CustomBondForce.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "openmm/VerletIntegrator.h" #include "openmm/VerletIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
#include "openmm/CustomExternalForce.h" #include "openmm/CustomExternalForce.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "openmm/VerletIntegrator.h" #include "openmm/VerletIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
...@@ -41,7 +41,7 @@ ...@@ -41,7 +41,7 @@
#include "openmm/NonbondedForce.h" #include "openmm/NonbondedForce.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "openmm/CustomIntegrator.h" #include "openmm/CustomIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
...@@ -651,6 +651,72 @@ void testRespa() { ...@@ -651,6 +651,72 @@ void testRespa() {
} }
} }
/**
* Make sure random numbers are computed correctly when steps get merged.
*/
void testMergedRandoms() {
const int numParticles = 10;
const int numSteps = 10;
System system;
for (int i = 0; i < numParticles; i++)
system.addParticle(1.0);
CustomIntegrator integrator(0.1);
integrator.addPerDofVariable("dofUniform1", 0);
integrator.addPerDofVariable("dofUniform2", 0);
integrator.addPerDofVariable("dofGaussian1", 0);
integrator.addPerDofVariable("dofGaussian2", 0);
integrator.addGlobalVariable("globalUniform1", 0);
integrator.addGlobalVariable("globalUniform2", 0);
integrator.addGlobalVariable("globalGaussian1", 0);
integrator.addGlobalVariable("globalGaussian2", 0);
integrator.addComputePerDof("dofUniform1", "uniform");
integrator.addComputePerDof("dofUniform2", "uniform");
integrator.addComputePerDof("dofGaussian1", "gaussian");
integrator.addComputePerDof("dofGaussian2", "gaussian");
integrator.addComputeGlobal("globalUniform1", "uniform");
integrator.addComputeGlobal("globalUniform2", "uniform");
integrator.addComputeGlobal("globalGaussian1", "gaussian");
integrator.addComputeGlobal("globalGaussian2", "gaussian");
Context context(system, integrator, platform);
// See if the random numbers are computed correctly.
vector<Vec3> values1, values2;
for (int i = 0; i < numSteps; i++) {
integrator.step(1);
integrator.getPerDofVariable(0, values1);
integrator.getPerDofVariable(1, values2);
for (int i = 0; i < numParticles; i++)
for (int j = 0; j < 3; j++) {
double v1 = values1[i][j];
double v2 = values2[i][j];
ASSERT(v1 >= 0 && v1 < 1);
ASSERT(v2 >= 0 && v2 < 1);
ASSERT(v1 != v2);
}
integrator.getPerDofVariable(2, values1);
integrator.getPerDofVariable(3, values2);
for (int i = 0; i < numParticles; i++)
for (int j = 0; j < 3; j++) {
double v1 = values1[i][j];
double v2 = values2[i][j];
ASSERT(v1 >= -10 && v1 < 10);
ASSERT(v2 >= -10 && v2 < 10);
ASSERT(v1 != v2);
}
double v1 = integrator.getGlobalVariable(0);
double v2 = integrator.getGlobalVariable(1);
ASSERT(v1 >= 0 && v1 < 1);
ASSERT(v2 >= 0 && v2 < 1);
ASSERT(v1 != v2);
v1 = integrator.getGlobalVariable(2);
v2 = integrator.getGlobalVariable(3);
ASSERT(v1 >= -10 && v1 < 10);
ASSERT(v2 >= -10 && v2 < 10);
ASSERT(v1 != v2);
}
}
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
try { try {
if (argc > 1) if (argc > 1)
...@@ -666,6 +732,7 @@ int main(int argc, char* argv[]) { ...@@ -666,6 +732,7 @@ int main(int argc, char* argv[]) {
testPerDofVariables(); testPerDofVariables();
testForceGroups(); testForceGroups();
testRespa(); testRespa();
testMergedRandoms();
} }
catch(const exception& e) { catch(const exception& e) {
cout << "exception: " << e.what() << endl; cout << "exception: " << e.what() << endl;
......
...@@ -42,7 +42,7 @@ ...@@ -42,7 +42,7 @@
#include "openmm/LangevinIntegrator.h" #include "openmm/LangevinIntegrator.h"
#include "openmm/VerletIntegrator.h" #include "openmm/VerletIntegrator.h"
#include "openmm/internal/ContextImpl.h" #include "openmm/internal/ContextImpl.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
...@@ -40,7 +40,7 @@ ...@@ -40,7 +40,7 @@
#include "openmm/GBSAOBCForce.h" #include "openmm/GBSAOBCForce.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "openmm/LangevinIntegrator.h" #include "openmm/LangevinIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include "openmm/NonbondedForce.h" #include "openmm/NonbondedForce.h"
#include <iostream> #include <iostream>
......
...@@ -40,7 +40,7 @@ ...@@ -40,7 +40,7 @@
#include "openmm/HarmonicAngleForce.h" #include "openmm/HarmonicAngleForce.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "openmm/VerletIntegrator.h" #include "openmm/VerletIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
...@@ -40,7 +40,7 @@ ...@@ -40,7 +40,7 @@
#include "openmm/NonbondedForce.h" #include "openmm/NonbondedForce.h"
#include "openmm/System.h" #include "openmm/System.h"
#include "openmm/LangevinIntegrator.h" #include "openmm/LangevinIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
...@@ -42,7 +42,7 @@ ...@@ -42,7 +42,7 @@
#include "openmm/LangevinIntegrator.h" #include "openmm/LangevinIntegrator.h"
#include "openmm/VerletIntegrator.h" #include "openmm/VerletIntegrator.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include <iostream> #include <iostream>
#include <vector> #include <vector>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment