Commit 17ae3aae authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing to implement new CUDA platform: GBSAOBCForce

parent e2fc86ab
......@@ -179,6 +179,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines["RECIP"] = useDoublePrecision ? "1.0/" : "1.0f/";
compilationDefines["EXP"] = useDoublePrecision ? "exp" : "expf";
compilationDefines["LOG"] = useDoublePrecision ? "log" : "logf";
compilationDefines["POW"] = useDoublePrecision ? "pow" : "powf";
compilationDefines["COS"] = useDoublePrecision ? "cos" : "cosf";
compilationDefines["SIN"] = useDoublePrecision ? "sin" : "sinf";
compilationDefines["TAN"] = useDoublePrecision ? "tan" : "tanf";
......
......@@ -96,8 +96,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
return new CudaCalcNonbondedForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomNonbondedForceKernel::Name())
return new CudaCalcCustomNonbondedForceKernel(name, platform, cu, context.getSystem());
// if (name == CalcGBSAOBCForceKernel::Name())
// return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
if (name == CalcGBSAOBCForceKernel::Name())
return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
// if (name == CalcCustomGBForceKernel::Name())
// return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
if (name == CalcCustomExternalForceKernel::Name())
......
......@@ -1421,6 +1421,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines["GRID_SIZE_Y"] = cu.intToString(gridSizeY);
pmeDefines["GRID_SIZE_Z"] = cu.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cu.doubleToString(sqrt(ONE_4PI_EPS0));
if (cu.getUseDoublePrecision())
pmeDefines["USE_DOUBLE_PRECISION"] = "1";
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
pmeUpdateBsplinesKernel = cu.getKernel(module, "updateBsplines");
pmeAtomRangeKernel = cu.getKernel(module, "findAtomRangeForGrid");
......@@ -1573,11 +1575,17 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
void* convolutionArgs[] = {&pmeGrid->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &pmeBsplineModuliX->getDevicePointer(),
&pmeBsplineModuliY->getDevicePointer(), &pmeBsplineModuliZ->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeConvolutionKernel, convolutionArgs, cu.getNumAtoms());
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &pmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
interpolateForceThreads = 64;
......@@ -1859,231 +1867,191 @@ void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& co
cu.invalidateMolecules();
}
//class CudaGBSAOBCForceInfo : public CudaForceInfo {
//public:
// CudaGBSAOBCForceInfo(int requiredBuffers, const GBSAOBCForce& force) : CudaForceInfo(requiredBuffers), force(force) {
// }
// bool areParticlesIdentical(int particle1, int particle2) {
// double charge1, charge2, radius1, radius2, scale1, scale2;
// force.getParticleParameters(particle1, charge1, radius1, scale1);
// force.getParticleParameters(particle2, charge2, radius2, scale2);
// return (charge1 == charge2 && radius1 == radius2 && scale1 == scale2);
// }
//private:
// const GBSAOBCForce& force;
//};
//
//CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel() {
// cu.setAsCurrent();
// if (params != NULL)
// delete params;
// if (bornSum != NULL)
// delete bornSum;
// if (longBornSum != NULL)
// delete longBornSum;
// if (bornRadii != NULL)
// delete bornRadii;
// if (bornForce != NULL)
// delete bornForce;
// if (longBornForce != NULL)
// delete longBornForce;
// if (obcChain != NULL)
// delete obcChain;
//}
//
//void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
// cu.setAsCurrent();
// if (cu.getPlatformData().contexts.size() > 1)
// throw OpenMMException("GBSAOBCForce does not support using multiple CUDA devices");
// CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
// params = new CudaArray<mm_float2>(cu, cu.getPaddedNumAtoms(), "gbsaObcParams");
// bornRadii = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "bornRadii");
// obcChain = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "obcChain");
// if (cu.getSupports64BitGlobalAtomics()) {
// longBornSum = new CudaArray<cl_long>(cu, cu.getPaddedNumAtoms(), "longBornSum");
// longBornForce = new CudaArray<cl_long>(cu, cu.getPaddedNumAtoms(), "longBornForce");
// bornForce = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "bornForce");
// cu.addAutoclearBuffer(longBornSum->getDevicePointer(), 2*longBornSum->getSize());
// cu.addAutoclearBuffer(longBornForce->getDevicePointer(), 2*longBornForce->getSize());
// }
// else {
// bornSum = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornSum");
// bornForce = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornForce");
// cu.addAutoclearBuffer(bornSum->getDevicePointer(), bornSum->getSize());
// cu.addAutoclearBuffer(bornForce->getDevicePointer(), bornForce->getSize());
// }
// CudaArray<mm_float4>& posq = cu.getPosq();
// int numParticles = force.getNumParticles();
// vector<mm_float2> paramsVector(numParticles);
// const double dielectricOffset = 0.009;
// for (int i = 0; i < numParticles; i++) {
// double charge, radius, scalingFactor;
// force.getParticleParameters(i, charge, radius, scalingFactor);
// radius -= dielectricOffset;
// paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
// posq[i].w = (float) charge;
// }
// posq.upload();
// params->upload(paramsVector);
// prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
// bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
// bool usePeriodic = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff && force.getNonbondedMethod() != GBSAOBCForce::CutoffNonPeriodic);
// string source = CudaKernelSources::gbsaObc2;
// nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source, force.getForceGroup());
// nb.addParameter(CudaNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDevicePointer()));;
// nb.addParameter(CudaNonbondedUtilities::ParameterInfo("bornForce", "float", 1, sizeof(cl_float), bornForce->getDevicePointer()));;
// cu.addForce(new CudaGBSAOBCForceInfo(nb.getNumForceBuffers(), force));
//}
//
//double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
// CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
// bool deviceIsCpu = (cu.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
// if (!hasCreatedKernels) {
// // These Kernels cannot be created in initialize(), because the CudaNonbondedUtilities has not been initialized yet then.
//
// hasCreatedKernels = true;
// maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
// map<string, string> defines;
// if (nb.getForceBufferPerAtomBlock())
// defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
// if (nb.getUseCutoff())
// defines["USE_CUTOFF"] = "1";
// if (nb.getUsePeriodic())
// defines["USE_PERIODIC"] = "1";
// defines["CUTOFF_SQUARED"] = cu.doubleToString(nb.getCutoffDistance()*nb.getCutoffDistance());
// defines["PREFACTOR"] = cu.doubleToString(prefactor);
// defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
// defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
// defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
// defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize());
// string platformVendor = cu::Platform(cu.getDevice().getInfo<CL_DEVICE_PLATFORM>()).getInfo<CL_PLATFORM_VENDOR>();
// if (platformVendor == "Apple")
// defines["USE_APPLE_WORKAROUND"] = "1";
// string file;
// if (deviceIsCpu)
// file = CudaKernelSources::gbsaObc_cpu;
// else if (cu.getSIMDWidth() == 32)
// file = CudaKernelSources::gbsaObc_nvidia;
// else
// file = CudaKernelSources::gbsaObc_default;
// CUmodule module = cu.createModule(file, defines);
// bool useLong = (cu.getSupports64BitGlobalAtomics() && !deviceIsCpu);
// int index = 0;
// computeBornSumKernel = cu.getKernel(module, "computeBornSum");
// computeBornSumKernel.setArg<cu::Buffer>(index++, (useLong ? longBornSum->getDevicePointer() : bornSum->getDevicePointer()));
// computeBornSumKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
// computeBornSumKernel.setArg<cu::Buffer>(index++, params->getDevicePointer());
// if (nb.getUseCutoff()) {
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getInteractingTiles().getDevicePointer());
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getInteractionCount().getDevicePointer());
// index += 2; // The periodic box size arguments are set when the kernel is executed.
// computeBornSumKernel.setArg<cl_uint>(index++, maxTiles);
// if (cu.getSIMDWidth() == 32 || deviceIsCpu)
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getInteractionFlags().getDevicePointer());
// }
// else
// computeBornSumKernel.setArg<cl_uint>(index++, cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
// if (cu.getSIMDWidth() == 32) {
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getExclusionIndices().getDevicePointer());
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getExclusionRowIndices().getDevicePointer());
// }
// force1Kernel = cu.getKernel(module, "computeGBSAForce1");
// index = 0;
// force1Kernel.setArg<cu::Buffer>(index++, (useLong ? cu.getLongForceBuffer().getDevicePointer() : cu.getForceBuffers().getDevicePointer()));
// force1Kernel.setArg<cu::Buffer>(index++, (useLong ? longBornForce->getDevicePointer() : bornForce->getDevicePointer()));
// force1Kernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
// force1Kernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
// force1Kernel.setArg<cu::Buffer>(index++, bornRadii->getDevicePointer());
// if (nb.getUseCutoff()) {
// force1Kernel.setArg<cu::Buffer>(index++, nb.getInteractingTiles().getDevicePointer());
// force1Kernel.setArg<cu::Buffer>(index++, nb.getInteractionCount().getDevicePointer());
// index += 2; // The periodic box size arguments are set when the kernel is executed.
// force1Kernel.setArg<cl_uint>(index++, maxTiles);
// if (cu.getSIMDWidth() == 32 || deviceIsCpu)
// force1Kernel.setArg<cu::Buffer>(index++, nb.getInteractionFlags().getDevicePointer());
// }
// else
// force1Kernel.setArg<cl_uint>(index++, cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
// if (cu.getSIMDWidth() == 32) {
// force1Kernel.setArg<cu::Buffer>(index++, nb.getExclusionIndices().getDevicePointer());
// force1Kernel.setArg<cu::Buffer>(index++, nb.getExclusionRowIndices().getDevicePointer());
// }
// module = cu.createModule(CudaKernelSources::gbsaObcReductions, defines);
// reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
// reduceBornSumKernel.setArg<cl_int>(0, cu.getPaddedNumAtoms());
// reduceBornSumKernel.setArg<cl_int>(1, nb.getNumForceBuffers());
// reduceBornSumKernel.setArg<cl_float>(2, 1.0f);
// reduceBornSumKernel.setArg<cl_float>(3, 0.8f);
// reduceBornSumKernel.setArg<cl_float>(4, 4.85f);
// reduceBornSumKernel.setArg<cu::Buffer>(5, (useLong ? longBornSum->getDevicePointer() : bornSum->getDevicePointer()));
// reduceBornSumKernel.setArg<cu::Buffer>(6, params->getDevicePointer());
// reduceBornSumKernel.setArg<cu::Buffer>(7, bornRadii->getDevicePointer());
// reduceBornSumKernel.setArg<cu::Buffer>(8, obcChain->getDevicePointer());
// reduceBornForceKernel = cu.getKernel(module, "reduceBornForce");
// index = 0;
// reduceBornForceKernel.setArg<cl_int>(index++, cu.getPaddedNumAtoms());
// reduceBornForceKernel.setArg<cl_int>(index++, nb.getNumForceBuffers());
// reduceBornForceKernel.setArg<cu::Buffer>(index++, bornForce->getDevicePointer());
// if (useLong)
// reduceBornForceKernel.setArg<cu::Buffer>(index++, longBornForce->getDevicePointer());
// reduceBornForceKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
// reduceBornForceKernel.setArg<cu::Buffer>(index++, params->getDevicePointer());
// reduceBornForceKernel.setArg<cu::Buffer>(index++, bornRadii->getDevicePointer());
// reduceBornForceKernel.setArg<cu::Buffer>(index++, obcChain->getDevicePointer());
// }
// if (nb.getUseCutoff()) {
// computeBornSumKernel.setArg<mm_float4>(5, cu.getPeriodicBoxSize());
// computeBornSumKernel.setArg<mm_float4>(6, cu.getInvPeriodicBoxSize());
// force1Kernel.setArg<mm_float4>(7, cu.getPeriodicBoxSize());
// force1Kernel.setArg<mm_float4>(8, cu.getInvPeriodicBoxSize());
// if (maxTiles < nb.getInteractingTiles().getSize()) {
// maxTiles = nb.getInteractingTiles().getSize();
// computeBornSumKernel.setArg<cu::Buffer>(3, nb.getInteractingTiles().getDevicePointer());
// computeBornSumKernel.setArg<cl_uint>(7, maxTiles);
// force1Kernel.setArg<cu::Buffer>(5, nb.getInteractingTiles().getDevicePointer());
// force1Kernel.setArg<cl_uint>(9, maxTiles);
// if (cu.getSIMDWidth() == 32 || deviceIsCpu) {
// computeBornSumKernel.setArg<cu::Buffer>(8, nb.getInteractionFlags().getDevicePointer());
// force1Kernel.setArg<cu::Buffer>(10, nb.getInteractionFlags().getDevicePointer());
// }
// }
// }
// cu.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
// cu.executeKernel(reduceBornSumKernel, cu.getPaddedNumAtoms());
// cu.executeKernel(force1Kernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
// cu.executeKernel(reduceBornForceKernel, cu.getPaddedNumAtoms());
// return 0.0;
//}
//
//void CudaCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force) {
// // Make sure the new parameters are acceptable.
//
// cu.setAsCurrent();
// int numParticles = force.getNumParticles();
// if (numParticles != cu.getNumAtoms())
// throw OpenMMException("updateParametersInContext: The number of particles has changed");
//
// // Record the per-particle parameters.
//
// CudaArray<mm_float4>& posq = cu.getPosq();
// posq.download();
// vector<mm_float2> paramsVector(numParticles);
// const double dielectricOffset = 0.009;
// for (int i = 0; i < numParticles; i++) {
// double charge, radius, scalingFactor;
// force.getParticleParameters(i, charge, radius, scalingFactor);
// radius -= dielectricOffset;
// paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
// posq[i].w = (float) charge;
// }
// posq.upload();
// params->upload(paramsVector);
//
// // Mark that the current reordering may be invalid.
//
// cu.invalidateMolecules();
//}
//
class CudaGBSAOBCForceInfo : public CudaForceInfo {
public:
CudaGBSAOBCForceInfo(const GBSAOBCForce& force) : force(force) {
}
bool areParticlesIdentical(int particle1, int particle2) {
double charge1, charge2, radius1, radius2, scale1, scale2;
force.getParticleParameters(particle1, charge1, radius1, scale1);
force.getParticleParameters(particle2, charge2, radius2, scale2);
return (charge1 == charge2 && radius1 == radius2 && scale1 == scale2);
}
private:
const GBSAOBCForce& force;
};
CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel() {
cu.setAsCurrent();
if (params != NULL)
delete params;
if (bornSum != NULL)
delete bornSum;
if (bornRadii != NULL)
delete bornRadii;
if (bornForce != NULL)
delete bornForce;
if (obcChain != NULL)
delete obcChain;
}
void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
cu.setAsCurrent();
if (cu.getPlatformData().contexts.size() > 1)
throw OpenMMException("GBSAOBCForce does not support using multiple CUDA devices");
CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
params = CudaArray::create<float2>(cu, cu.getPaddedNumAtoms(), "gbsaObcParams");
if (cu.getUseDoublePrecision()) {
bornRadii = CudaArray::create<double>(cu, cu.getPaddedNumAtoms(), "bornRadii");
obcChain = CudaArray::create<double>(cu, cu.getPaddedNumAtoms(), "obcChain");
}
else {
bornRadii = CudaArray::create<float>(cu, cu.getPaddedNumAtoms(), "bornRadii");
obcChain = CudaArray::create<float>(cu, cu.getPaddedNumAtoms(), "obcChain");
}
bornSum = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "bornSum");
bornForce = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "bornForce");
cu.addAutoclearBuffer(bornSum->getDevicePointer(), bornSum->getSize()*sizeof(long long));
cu.addAutoclearBuffer(bornForce->getDevicePointer(), bornForce->getSize()*sizeof(long long));
CudaArray& posq = cu.getPosq();
float4* posqf = (float4*) cu.getPinnedBuffer();
double4* posqd = (double4*) cu.getPinnedBuffer();
vector<float2> paramsVector(cu.getPaddedNumAtoms());
const double dielectricOffset = 0.009;
for (int i = 0; i < force.getNumParticles(); i++) {
double charge, radius, scalingFactor;
force.getParticleParameters(i, charge, radius, scalingFactor);
radius -= dielectricOffset;
paramsVector[i] = make_float2((float) radius, (float) (scalingFactor*radius));
if (cu.getUseDoublePrecision())
posqd[i] = make_double4(0, 0, 0, charge);
else
posqf[i] = make_float4(0, 0, 0, (float) charge);
}
posq.upload(cu.getPinnedBuffer());
params->upload(paramsVector);
prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
bool usePeriodic = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff && force.getNonbondedMethod() != GBSAOBCForce::CutoffNonPeriodic);
string source = CudaKernelSources::gbsaObc2;
nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source, force.getForceGroup());
nb.addParameter(CudaNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(float2), params->getDevicePointer()));;
nb.addParameter(CudaNonbondedUtilities::ParameterInfo("bornForce", "long long", 1, sizeof(long long), bornForce->getDevicePointer()));;
cu.addForce(new CudaGBSAOBCForceInfo(force));
}
double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
if (!hasCreatedKernels) {
// These Kernels cannot be created in initialize(), because the CudaNonbondedUtilities has not been initialized yet then.
hasCreatedKernels = true;
maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
map<string, string> defines;
if (nb.getUseCutoff())
defines["USE_CUTOFF"] = "1";
if (nb.getUsePeriodic())
defines["USE_PERIODIC"] = "1";
defines["CUTOFF_SQUARED"] = cu.doubleToString(nb.getCutoffDistance()*nb.getCutoffDistance());
defines["PREFACTOR"] = cu.doubleToString(prefactor);
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize());
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::gbsaObc1, defines);
computeBornSumKernel = cu.getKernel(module, "computeBornSum");
computeSumArgs.push_back(&bornSum->getDevicePointer());
computeSumArgs.push_back(&cu.getPosq().getDevicePointer());
computeSumArgs.push_back(&params->getDevicePointer());
if (nb.getUseCutoff()) {
computeSumArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
computeSumArgs.push_back(&nb.getInteractionCount().getDevicePointer());
computeSumArgs.push_back(cu.getPeriodicBoxSizePointer());
computeSumArgs.push_back(cu.getInvPeriodicBoxSizePointer());
computeSumArgs.push_back(&maxTiles);
computeSumArgs.push_back(&nb.getInteractionFlags().getDevicePointer());
}
else
computeSumArgs.push_back(&maxTiles);
computeSumArgs.push_back(&nb.getExclusionIndices().getDevicePointer());
computeSumArgs.push_back(&nb.getExclusionRowIndices().getDevicePointer());
force1Kernel = cu.getKernel(module, "computeGBSAForce1");
force1Args.push_back(&cu.getForce().getDevicePointer());
force1Args.push_back(&bornForce->getDevicePointer());
force1Args.push_back(&cu.getEnergyBuffer().getDevicePointer());
force1Args.push_back(&cu.getPosq().getDevicePointer());
force1Args.push_back(&bornRadii->getDevicePointer());
if (nb.getUseCutoff()) {
force1Args.push_back(&nb.getInteractingTiles().getDevicePointer());
force1Args.push_back(&nb.getInteractionCount().getDevicePointer());
force1Args.push_back(cu.getPeriodicBoxSizePointer());
force1Args.push_back(cu.getInvPeriodicBoxSizePointer());
force1Args.push_back(&maxTiles);
force1Args.push_back(&nb.getInteractionFlags().getDevicePointer());
}
else
force1Args.push_back(&maxTiles);
force1Args.push_back(&nb.getExclusionIndices().getDevicePointer());
force1Args.push_back(&nb.getExclusionRowIndices().getDevicePointer());
module = cu.createModule(CudaKernelSources::gbsaObcReductions, defines);
reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
reduceBornForceKernel = cu.getKernel(module, "reduceBornForce");
}
if (nb.getUseCutoff()) {
if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize();
computeSumArgs[3] = &nb.getInteractingTiles().getDevicePointer();
force1Args[5] = &nb.getInteractingTiles().getDevicePointer();
computeSumArgs[8] = &nb.getInteractionFlags().getDevicePointer();
force1Args[10] = &nb.getInteractionFlags().getDevicePointer();
}
}
cu.executeKernel(computeBornSumKernel, &computeSumArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
float alpha = 1.0f, beta = 0.8f, gamma = 4.85f;
void* reduceSumArgs[] = {&alpha, &beta, &gamma, &bornSum->getDevicePointer(), &params->getDevicePointer(),
&bornRadii->getDevicePointer(), &obcChain->getDevicePointer()};
cu.executeKernel(reduceBornSumKernel, reduceSumArgs, cu.getPaddedNumAtoms());
cu.executeKernel(force1Kernel, &force1Args[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
void* reduceForceArgs[] = {&bornForce->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &params->getDevicePointer(),
&bornRadii->getDevicePointer(), &obcChain->getDevicePointer()};
cu.executeKernel(reduceBornForceKernel, &reduceForceArgs[0], cu.getPaddedNumAtoms());
return 0.0;
}
void CudaCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force) {
// Make sure the new parameters are acceptable.
cu.setAsCurrent();
int numParticles = force.getNumParticles();
if (numParticles != cu.getNumAtoms())
throw OpenMMException("updateParametersInContext: The number of particles has changed");
// Record the per-particle parameters.
CudaArray& posq = cu.getPosq();
float4* posqf = (float4*) cu.getPinnedBuffer();
double4* posqd = (double4*) cu.getPinnedBuffer();
posq.download(cu.getPinnedBuffer());
vector<float2> paramsVector(cu.getPaddedNumAtoms());
const double dielectricOffset = 0.009;
for (int i = 0; i < numParticles; i++) {
double charge, radius, scalingFactor;
force.getParticleParameters(i, charge, radius, scalingFactor);
radius -= dielectricOffset;
paramsVector[i] = make_float2((float) radius, (float) (scalingFactor*radius));
if (cu.getUseDoublePrecision())
posqd[i].w = charge;
else
posqf[i].w = (float) charge;
}
posq.upload(cu.getPinnedBuffer());
params->upload(paramsVector);
// Mark that the current reordering may be invalid.
cu.invalidateMolecules();
}
//class CudaCustomGBForceInfo : public CudaForceInfo {
//public:
// CudaCustomGBForceInfo(int requiredBuffers, const CustomGBForce& force) : CudaForceInfo(requiredBuffers), force(force) {
......@@ -2870,7 +2838,7 @@ void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& co
// perParticleValueKernel.setArg<cu::Buffer>(index++, tabulatedFunctionParams->getDevicePointer());
// }
// index = 0;
// pairEnergyKernel.setArg<cu::Buffer>(index++, useLong ? cu.getLongForceBuffer().getDevicePointer() : cu.getForceBuffers().getDevicePointer());
// pairEnergyKernel.setArg<cu::Buffer>(index++, useLong ? cu.getLongForceBuffer().getDevicePointer() : cu.getForce().getDevicePointer());
// pairEnergyKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
// pairEnergyKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
// pairEnergyKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
......@@ -2927,7 +2895,7 @@ void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& co
// index = 0;
// perParticleEnergyKernel.setArg<cl_int>(index++, cu.getPaddedNumAtoms());
// perParticleEnergyKernel.setArg<cl_int>(index++, nb.getNumForceBuffers());
// perParticleEnergyKernel.setArg<cu::Buffer>(index++, cu.getForceBuffers().getDevicePointer());
// perParticleEnergyKernel.setArg<cu::Buffer>(index++, cu.getForce().getDevicePointer());
// perParticleEnergyKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
// perParticleEnergyKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
// if (globals != NULL)
......@@ -2947,7 +2915,7 @@ void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& co
// }
// if (needParameterGradient) {
// index = 0;
// gradientChainRuleKernel.setArg<cu::Buffer>(index++, cu.getForceBuffers().getDevicePointer());
// gradientChainRuleKernel.setArg<cu::Buffer>(index++, cu.getForce().getDevicePointer());
// gradientChainRuleKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
// if (globals != NULL)
// gradientChainRuleKernel.setArg<cu::Buffer>(index++, globals->getDevicePointer());
......
......@@ -666,57 +666,55 @@ private:
System& system;
};
///**
// * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
// */
//class CudaCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
//public:
// CudaCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcGBSAOBCForceKernel(name, platform), cu(cu),
// hasCreatedKernels(false), params(NULL), bornSum(NULL), longBornSum(NULL), bornRadii(NULL), bornForce(NULL),
// longBornForce(NULL), obcChain(NULL) {
// }
// ~CudaCalcGBSAOBCForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the GBSAOBCForce this kernel will be used for
// */
// void initialize(const System& system, const GBSAOBCForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
// /**
// * Copy changed parameters over to a context.
// *
// * @param context the context to copy parameters to
// * @param force the GBSAOBCForce to copy the parameters from
// */
// void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
//private:
// double prefactor;
// bool hasCreatedKernels;
// int maxTiles;
// CudaContext& cu;
// CudaArray<mm_float2>* params;
// CudaArray<cl_float>* bornSum;
// CudaArray<cl_long>* longBornSum;
// CudaArray<cl_float>* bornRadii;
// CudaArray<cl_float>* bornForce;
// CudaArray<cl_long>* longBornForce;
// CudaArray<cl_float>* obcChain;
// CUfunction computeBornSumKernel;
// CUfunction reduceBornSumKernel;
// CUfunction force1Kernel;
// CUfunction reduceBornForceKernel;
//};
//
/**
* This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
*/
class CudaCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
public:
CudaCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcGBSAOBCForceKernel(name, platform), cu(cu),
hasCreatedKernels(false), params(NULL), bornSum(NULL), bornRadii(NULL), bornForce(NULL), obcChain(NULL) {
}
~CudaCalcGBSAOBCForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the GBSAOBCForce this kernel will be used for
*/
void initialize(const System& system, const GBSAOBCForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the GBSAOBCForce to copy the parameters from
*/
void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
private:
double prefactor;
bool hasCreatedKernels;
int maxTiles;
CudaContext& cu;
CudaArray* params;
CudaArray* bornSum;
CudaArray* bornRadii;
CudaArray* bornForce;
CudaArray* obcChain;
CUfunction computeBornSumKernel;
CUfunction reduceBornSumKernel;
CUfunction force1Kernel;
CUfunction reduceBornForceKernel;
std::vector<void*> computeSumArgs, force1Args;
};
///**
// * This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
// */
......
#define TILE_SIZE 32
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct {
real x, y, z;
real q;
float radius, scaledRadius;
real bornSum;
} AtomData1;
/**
* Compute the Born sum.
*/
extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ global_bornSum, const real4* __restrict__ posq, const float2* __restrict__ global_params,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
#else
unsigned int numTiles,
#endif
unsigned int* exclusionIndices, unsigned int* exclusionRowIndices) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData1 localData[FORCE_WORK_GROUP_SIZE];
__shared__ real tempBuffer[FORCE_WORK_GROUP_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
real bornSum = 0;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].x = posq1.x;
localData[threadIdx.x].y = posq1.y;
localData[threadIdx.x].z = posq1.z;
localData[threadIdx.x].q = posq1.w;
localData[threadIdx.x].radius = params1.x;
localData[threadIdx.x].scaledRadius = params1.y;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
}
}
}
else {
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[threadIdx.x].radius = tempParams.x;
localData[threadIdx.x].scaledRadius = tempParams.y;
}
localData[threadIdx.x].bornSum = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
bool computeSubset = false;
if (flags != 0xFFFFFFFF) {
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
computeSubset = (exclusionIndex[localGroupIndex] == -1);
}
if (computeSubset) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
tempBuffer[threadIdx.x] = 0.0f;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
if (params2.x < params1.y-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
tempBuffer[threadIdx.x] = term;
}
}
// Sum the forces on atom j.
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3];
if (tgx == 0)
localData[tbx+j].bornSum += tempBuffer[threadIdx.x]+tempBuffer[threadIdx.x+4]+tempBuffer[threadIdx.x+8]+tempBuffer[threadIdx.x+12]+tempBuffer[threadIdx.x+16]+tempBuffer[threadIdx.x+20]+tempBuffer[threadIdx.x+24]+tempBuffer[threadIdx.x+28];
}
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
if (params2.x < params1.y-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
localData[tbx+tj].bornSum += term;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (bornSum*0xFFFFFFFF)));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0xFFFFFFFF)));
}
lasty = y;
pos++;
} while (pos < end);
}
typedef struct {
real x, y, z;
real q;
real fx, fy, fz, fw;
real bornRadius;
} AtomData2;
/**
* First part of computing the GBSA interaction.
*/
extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce,
real* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
#else
unsigned int numTiles,
#endif
unsigned int* exclusionIndices, unsigned int* exclusionRowIndices) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE];
__shared__ real4 tempBuffer[FORCE_WORK_GROUP_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
real4 force = make_real4(0);
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].x = posq1.x;
localData[threadIdx.x].y = posq1.y;
localData[threadIdx.x].z = posq1.z;
localData[threadIdx.x].q = posq1.w;
localData[threadIdx.x].bornRadius = bornRadius1;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
real4 posq2 = make_real4(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+j].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += 0.5f*tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF
}
#endif
}
}
}
else {
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = tempPosq.w;
localData[threadIdx.x].bornRadius = global_bornRadii[j];
}
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
localData[threadIdx.x].fw = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
bool computeSubset = false;
if (flags != 0xFFFFFFFF) {
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
computeSubset = (exclusionIndex[localGroupIndex] == -1);
}
if (computeSubset) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
real4 posq2 = make_real4(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+j].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
#else
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
#endif
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
tempEnergy = 0.0f;
}
energy += tempEnergy;
force.w += dGpol_dalpha2_ij*bornRadius2;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
tempBuffer[threadIdx.x] = make_real4(delta.x, delta.y, delta.z, dGpol_dalpha2_ij*bornRadius1);
#ifdef USE_CUTOFF
}
else
tempBuffer[threadIdx.x] = make_real4(0);
#endif
// Sum the forces on atom j.
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3];
if (tgx == 0) {
real4 sum = tempBuffer[threadIdx.x]+tempBuffer[threadIdx.x+4]+tempBuffer[threadIdx.x+8]+tempBuffer[threadIdx.x+12]+tempBuffer[threadIdx.x+16]+tempBuffer[threadIdx.x+20]+tempBuffer[threadIdx.x+24]+tempBuffer[threadIdx.x+28];
localData[tbx+j].fx += sum.x;
localData[tbx+j].fy += sum.y;
localData[tbx+j].fz += sum.z;
localData[tbx+j].fw += sum.w;
}
}
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+tj].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
}
#endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0xFFFFFFFF)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (force.w*0xFFFFFFFF)));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0xFFFFFFFF)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0xFFFFFFFF)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0xFFFFFFFF)));
}
lasty = y;
pos++;
} while (pos < end);
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
{
real invRSquaredOver4 = 0.25f*invR*invR;
real rScaledRadiusJ = r+obcParams2.y;
real rScaledRadiusI = r+obcParams1.y;
real l_ijJ = RECIP(max(obcParams1.x, fabs(r-obcParams2.y)));
real l_ijI = RECIP(max(obcParams2.x, fabs(r-obcParams1.y)));
real u_ijJ = RECIP(rScaledRadiusJ);
real u_ijI = RECIP(rScaledRadiusI);
real l_ij2J = l_ijJ*l_ijJ;
real l_ij2I = l_ijI*l_ijI;
real u_ij2J = u_ijJ*u_ijJ;
real u_ij2I = u_ijI*u_ijI;
real t1J = LOG(u_ijJ*RECIP(l_ijJ));
real t1I = LOG(u_ijI*RECIP(l_ijI));
real t2J = (l_ij2J-u_ij2J);
real t2I = (l_ij2I-u_ij2I);
real term1 = (0.5f*(0.25f+obcParams2.y*obcParams2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
real term2 = (0.5f*(0.25f+obcParams1.y*obcParams1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
real tempdEdR = (obcParams1.x < rScaledRadiusJ ? bornForce1*term1/0xFFFFFFFF : 0);
tempdEdR += (obcParams2.x < rScaledRadiusI ? bornForce2*term2/0xFFFFFFFF : 0);
#ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
#else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
#endif
dEdR += (includeInteraction ? tempdEdR : 0);
}
#define DIELECTRIC_OFFSET 0.009f
#define PROBE_RADIUS 0.14f
#define SURFACE_AREA_FACTOR -170.351730667551f //-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
/**
* Reduce the Born sums to compute the Born radii.
*/
extern "C" __global__ void reduceBornSum(float alpha, float beta, float gamma, const long long* __restrict__ bornSum,
const float2* __restrict__ params, real* __restrict__ bornRadii, real* __restrict__ obcChain) {
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Get summed Born data
real sum = RECIP(0xFFFFFFFF)*bornSum[index];
// Now calculate Born radius and OBC term.
float offsetRadius = params[index].x;
sum *= 0.5f*offsetRadius;
real sum2 = sum*sum;
real sum3 = sum*sum2;
real tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
real nonOffsetRadius = offsetRadius + DIELECTRIC_OFFSET;
real radius = RECIP(RECIP(offsetRadius) - tanhSum/nonOffsetRadius);
real chain = offsetRadius*(alpha - 2.0f*beta*sum + 3.0f*gamma*sum2);
chain = (1-tanhSum*tanhSum)*chain / nonOffsetRadius;
bornRadii[index] = radius;
obcChain[index] = chain;
}
}
/**
* Reduce the Born force.
*/
extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, real* __restrict__ energyBuffer,
const float2* __restrict__ params, const real* __restrict__ bornRadii, const real* __restrict__ obcChain) {
real energy = 0;
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Get summed Born force
real force = RECIP(0xFFFFFFFF)*bornForce[index];
// Now calculate the actual force
float offsetRadius = params[index].x;
real bornRadius = bornRadii[index];
real r = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS;
real ratio6 = POW((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, 6);
real saTerm = SURFACE_AREA_FACTOR*r*r*ratio6;
force += saTerm/bornRadius;
energy += saTerm;
force *= bornRadius*bornRadius*obcChain[index];
bornForce[index] = (long long) (force*0xFFFFFFFF);
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy/-6;
}
\ No newline at end of file
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of GBSAOBCForce.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/Context.h"
#include "CudaPlatform.h"
#include "ReferencePlatform.h"
#include "openmm/GBSAOBCForce.h"
#include "openmm/System.h"
#include "openmm/LangevinIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h"
#include "openmm/NonbondedForce.h"
#include <iostream>
#include <vector>
using namespace OpenMM;
using namespace std;
const double TOL = 1e-5;
void testSingleParticle() {
CudaPlatform platform;
System system;
system.addParticle(2.0);
LangevinIntegrator integrator(0, 0.1, 0.01);
GBSAOBCForce* gbsa = new GBSAOBCForce();
NonbondedForce* nonbonded = new NonbondedForce();
gbsa->addParticle(0.5, 0.15, 1);
nonbonded->addParticle(0.5, 1, 0);
system.addForce(gbsa);
system.addForce(nonbonded);
Context context(system, integrator, platform);
vector<Vec3> positions(1);
positions[0] = Vec3(0, 0, 0);
context.setPositions(positions);
State state = context.getState(State::Energy);
double bornRadius = 0.15-0.009; // dielectric offset
double eps0 = EPSILON0;
double bornEnergy = (-0.5*0.5/(8*PI_M*eps0))*(1.0/gbsa->getSoluteDielectric()-1.0/gbsa->getSolventDielectric())/bornRadius;
double extendedRadius = bornRadius+0.14; // probe radius
double nonpolarEnergy = CAL2JOULE*PI_M*0.0216*(10*extendedRadius)*(10*extendedRadius)*std::pow(0.15/bornRadius, 6.0); // Where did this formula come from? Just copied it from CpuImplicitSolvent.cpp
ASSERT_EQUAL_TOL((bornEnergy+nonpolarEnergy), state.getPotentialEnergy(), 0.01);
// Change the parameters and see if it is still correct.
gbsa->setParticleParameters(0, 0.4, 0.25, 1);
gbsa->updateParametersInContext(context);
state = context.getState(State::Energy);
bornRadius = 0.25-0.009; // dielectric offset
bornEnergy = (-0.4*0.4/(8*PI_M*eps0))*(1.0/gbsa->getSoluteDielectric()-1.0/gbsa->getSolventDielectric())/bornRadius;
extendedRadius = bornRadius+0.14;
nonpolarEnergy = CAL2JOULE*PI_M*0.0216*(10*extendedRadius)*(10*extendedRadius)*std::pow(0.25/bornRadius, 6.0);
ASSERT_EQUAL_TOL((bornEnergy+nonpolarEnergy), state.getPotentialEnergy(), 0.01);
}
void testCutoffAndPeriodic() {
CudaPlatform cl;
System system;
system.addParticle(1.0);
system.addParticle(1.0);
LangevinIntegrator integrator(0, 0.1, 0.01);
GBSAOBCForce* gbsa = new GBSAOBCForce();
NonbondedForce* nonbonded = new NonbondedForce();
gbsa->addParticle(-1, 0.15, 1);
nonbonded->addParticle(-1, 1, 0);
gbsa->addParticle(1, 0.15, 1);
nonbonded->addParticle(1, 1, 0);
const double cutoffDistance = 3.0;
const double boxSize = 10.0;
nonbonded->setCutoffDistance(cutoffDistance);
gbsa->setCutoffDistance(cutoffDistance);
system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
system.addForce(gbsa);
system.addForce(nonbonded);
vector<Vec3> positions(2);
positions[0] = Vec3(0, 0, 0);
positions[1] = Vec3(2, 0, 0);
// Calculate the forces for both cutoff and periodic with two different atom positions.
nonbonded->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
gbsa->setNonbondedMethod(GBSAOBCForce::CutoffNonPeriodic);
Context context(system, integrator, cl);
context.setPositions(positions);
State state1 = context.getState(State::Forces);
nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
gbsa->setNonbondedMethod(GBSAOBCForce::CutoffPeriodic);
context.reinitialize();
context.setPositions(positions);
State state2 = context.getState(State::Forces);
positions[1][0]+= boxSize;
nonbonded->setNonbondedMethod(NonbondedForce::CutoffNonPeriodic);
gbsa->setNonbondedMethod(GBSAOBCForce::CutoffNonPeriodic);
context.reinitialize();
context.setPositions(positions);
State state3 = context.getState(State::Forces);
nonbonded->setNonbondedMethod(NonbondedForce::CutoffPeriodic);
gbsa->setNonbondedMethod(GBSAOBCForce::CutoffPeriodic);
context.reinitialize();
context.setPositions(positions);
State state4 = context.getState(State::Forces);
// All forces should be identical, exception state3 which should be zero.
ASSERT_EQUAL_VEC(state1.getForces()[0], state2.getForces()[0], 0.01);
ASSERT_EQUAL_VEC(state1.getForces()[1], state2.getForces()[1], 0.01);
ASSERT_EQUAL_VEC(state1.getForces()[0], state4.getForces()[0], 0.01);
ASSERT_EQUAL_VEC(state1.getForces()[1], state4.getForces()[1], 0.01);
ASSERT_EQUAL_VEC(state3.getForces()[0], Vec3(0, 0, 0), 0.01);
ASSERT_EQUAL_VEC(state3.getForces()[1], Vec3(0, 0, 0), 0.01);
}
void testForce(int numParticles, NonbondedForce::NonbondedMethod method, GBSAOBCForce::NonbondedMethod method2) {
CudaPlatform cl;
ReferencePlatform reference;
System system;
GBSAOBCForce* gbsa = new GBSAOBCForce();
NonbondedForce* nonbonded = new NonbondedForce();
for (int i = 0; i < numParticles; ++i) {
system.addParticle(1.0);
double charge = i%2 == 0 ? -1 : 1;
gbsa->addParticle(charge, 0.15, 1);
nonbonded->addParticle(charge, 1, 0);
}
nonbonded->setNonbondedMethod(method);
gbsa->setNonbondedMethod(method2);
nonbonded->setCutoffDistance(3.0);
gbsa->setCutoffDistance(3.0);
int grid = (int) floor(0.5+pow(numParticles, 1.0/3.0));
if (method == NonbondedForce::CutoffPeriodic) {
double boxSize = (grid+1)*1.1;
system.setDefaultPeriodicBoxVectors(Vec3(boxSize, 0, 0), Vec3(0, boxSize, 0), Vec3(0, 0, boxSize));
}
system.addForce(gbsa);
system.addForce(nonbonded);
LangevinIntegrator integrator1(0, 0.1, 0.01);
LangevinIntegrator integrator2(0, 0.1, 0.01);
Context context(system, integrator1, cl);
Context refContext(system, integrator2, reference);
// Set random (but uniformly distributed) positions for all the particles.
vector<Vec3> positions(numParticles);
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
for (int i = 0; i < grid; i++)
for (int j = 0; j < grid; j++)
for (int k = 0; k < grid; k++)
positions[i*grid*grid+j*grid+k] = Vec3(i*1.1, j*1.1, k*1.1);
for (int i = 0; i < numParticles; ++i)
positions[i] = positions[i] + Vec3(0.5*genrand_real2(sfmt), 0.5*genrand_real2(sfmt), 0.5*genrand_real2(sfmt));
context.setPositions(positions);
refContext.setPositions(positions);
State state = context.getState(State::Forces | State::Energy);
State refState = refContext.getState(State::Forces | State::Energy);
// Make sure the CUDA and Reference platforms agree.
double norm = 0.0;
double diff = 0.0;
for (int i = 0; i < numParticles; ++i) {
Vec3 f = state.getForces()[i];
norm += f[0]*f[0] + f[1]*f[1] + f[2]*f[2];
Vec3 delta = f-refState.getForces()[i];
diff += delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2];
}
norm = std::sqrt(norm);
diff = std::sqrt(diff);
ASSERT_EQUAL_TOL(0.0, diff, 0.001*norm);
ASSERT_EQUAL_TOL(state.getPotentialEnergy(), refState.getPotentialEnergy(), 1e-3);
// Take a small step in the direction of the energy gradient. (This doesn't work with cutoffs, since the energy
// changes discontinuously at the cutoff distance.)
if (method == NonbondedForce::NoCutoff)
{
const double delta = 1e-2;
double step = delta/norm;
for (int i = 0; i < numParticles; ++i) {
Vec3 p = positions[i];
Vec3 f = state.getForces()[i];
positions[i] = Vec3(p[0]-f[0]*step, p[1]-f[1]*step, p[2]-f[2]*step);
}
context.setPositions(positions);
// See whether the potential energy changed by the expected amount.
State state2 = context.getState(State::Energy);
ASSERT_EQUAL_TOL(norm, (state2.getPotentialEnergy()-state.getPotentialEnergy())/delta, 1e-3*abs(state.getPotentialEnergy()));
}
}
int main() {
try {
testSingleParticle();
testCutoffAndPeriodic();
for (int i = 5; i < 11; i++) {
testForce(i*i*i, NonbondedForce::NoCutoff, GBSAOBCForce::NoCutoff);
testForce(i*i*i, NonbondedForce::CutoffNonPeriodic, GBSAOBCForce::CutoffNonPeriodic);
testForce(i*i*i, NonbondedForce::CutoffPeriodic, GBSAOBCForce::CutoffPeriodic);
}
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
return 1;
}
cout << "Done" << endl;
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment