Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
17ae3aae
Commit
17ae3aae
authored
Jun 21, 2012
by
Peter Eastman
Browse files
Continuing to implement new CUDA platform: GBSAOBCForce
parent
e2fc86ab
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
1120 additions
and
283 deletions
+1120
-283
platforms/cuda2/src/CudaContext.cpp
platforms/cuda2/src/CudaContext.cpp
+1
-0
platforms/cuda2/src/CudaKernelFactory.cpp
platforms/cuda2/src/CudaKernelFactory.cpp
+2
-2
platforms/cuda2/src/CudaKernels.cpp
platforms/cuda2/src/CudaKernels.cpp
+198
-230
platforms/cuda2/src/CudaKernels.h
platforms/cuda2/src/CudaKernels.h
+49
-51
platforms/cuda2/src/kernels/gbsaObc1.cu
platforms/cuda2/src/kernels/gbsaObc1.cu
+542
-0
platforms/cuda2/src/kernels/gbsaObc2.cu
platforms/cuda2/src/kernels/gbsaObc2.cu
+27
-0
platforms/cuda2/src/kernels/gbsaObcReductions.cu
platforms/cuda2/src/kernels/gbsaObcReductions.cu
+57
-0
platforms/cuda2/tests/TestCudaGBSAOBCForce.cpp
platforms/cuda2/tests/TestCudaGBSAOBCForce.cpp
+244
-0
No files found.
platforms/cuda2/src/CudaContext.cpp
View file @
17ae3aae
...
...
@@ -179,6 +179,7 @@ CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlocking
compilationDefines
[
"RECIP"
]
=
useDoublePrecision
?
"1.0/"
:
"1.0f/"
;
compilationDefines
[
"EXP"
]
=
useDoublePrecision
?
"exp"
:
"expf"
;
compilationDefines
[
"LOG"
]
=
useDoublePrecision
?
"log"
:
"logf"
;
compilationDefines
[
"POW"
]
=
useDoublePrecision
?
"pow"
:
"powf"
;
compilationDefines
[
"COS"
]
=
useDoublePrecision
?
"cos"
:
"cosf"
;
compilationDefines
[
"SIN"
]
=
useDoublePrecision
?
"sin"
:
"sinf"
;
compilationDefines
[
"TAN"
]
=
useDoublePrecision
?
"tan"
:
"tanf"
;
...
...
platforms/cuda2/src/CudaKernelFactory.cpp
View file @
17ae3aae
...
...
@@ -96,8 +96,8 @@ KernelImpl* CudaKernelFactory::createKernelImpl(std::string name, const Platform
return
new
CudaCalcNonbondedForceKernel
(
name
,
platform
,
cu
,
context
.
getSystem
());
if
(
name
==
CalcCustomNonbondedForceKernel
::
Name
())
return
new
CudaCalcCustomNonbondedForceKernel
(
name
,
platform
,
cu
,
context
.
getSystem
());
//
if (name == CalcGBSAOBCForceKernel::Name())
//
return new CudaCalcGBSAOBCForceKernel(name, platform, cu);
if
(
name
==
CalcGBSAOBCForceKernel
::
Name
())
return
new
CudaCalcGBSAOBCForceKernel
(
name
,
platform
,
cu
);
// if (name == CalcCustomGBForceKernel::Name())
// return new CudaCalcCustomGBForceKernel(name, platform, cu, context.getSystem());
if
(
name
==
CalcCustomExternalForceKernel
::
Name
())
...
...
platforms/cuda2/src/CudaKernels.cpp
View file @
17ae3aae
...
...
@@ -1421,6 +1421,8 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeDefines
[
"GRID_SIZE_Y"
]
=
cu
.
intToString
(
gridSizeY
);
pmeDefines
[
"GRID_SIZE_Z"
]
=
cu
.
intToString
(
gridSizeZ
);
pmeDefines
[
"EPSILON_FACTOR"
]
=
cu
.
doubleToString
(
sqrt
(
ONE_4PI_EPS0
));
if
(
cu
.
getUseDoublePrecision
())
pmeDefines
[
"USE_DOUBLE_PRECISION"
]
=
"1"
;
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
pme
,
pmeDefines
);
pmeUpdateBsplinesKernel
=
cu
.
getKernel
(
module
,
"updateBsplines"
);
pmeAtomRangeKernel
=
cu
.
getKernel
(
module
,
"findAtomRangeForGrid"
);
...
...
@@ -1573,11 +1575,17 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu
.
executeKernel
(
pmeSpreadChargeKernel
,
spreadArgs
,
cu
.
getNumAtoms
(),
PmeOrder
*
PmeOrder
*
PmeOrder
);
void
*
finishSpreadArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
()};
cu
.
executeKernel
(
pmeFinishSpreadChargeKernel
,
finishSpreadArgs
,
pmeGrid
->
getSize
());
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_FORWARD
);
void
*
convolutionArgs
[]
=
{
&
pmeGrid
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
pmeBsplineModuliX
->
getDevicePointer
(),
&
pmeBsplineModuliY
->
getDevicePointer
(),
&
pmeBsplineModuliZ
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
cu
.
executeKernel
(
pmeConvolutionKernel
,
convolutionArgs
,
cu
.
getNumAtoms
());
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
if
(
cu
.
getUseDoublePrecision
())
cufftExecZ2Z
(
fft
,
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
(
double2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
else
cufftExecC2C
(
fft
,
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
(
float2
*
)
pmeGrid
->
getDevicePointer
(),
CUFFT_INVERSE
);
void
*
interpolateArgs
[]
=
{
&
cu
.
getPosq
().
getDevicePointer
(),
&
cu
.
getForce
().
getDevicePointer
(),
&
pmeGrid
->
getDevicePointer
(),
cu
.
getPeriodicBoxSizePointer
(),
cu
.
getInvPeriodicBoxSizePointer
()};
interpolateForceThreads
=
64
;
...
...
@@ -1859,231 +1867,191 @@ void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& co
cu
.
invalidateMolecules
();
}
//class CudaGBSAOBCForceInfo : public CudaForceInfo {
//public:
// CudaGBSAOBCForceInfo(int requiredBuffers, const GBSAOBCForce& force) : CudaForceInfo(requiredBuffers), force(force) {
// }
// bool areParticlesIdentical(int particle1, int particle2) {
// double charge1, charge2, radius1, radius2, scale1, scale2;
// force.getParticleParameters(particle1, charge1, radius1, scale1);
// force.getParticleParameters(particle2, charge2, radius2, scale2);
// return (charge1 == charge2 && radius1 == radius2 && scale1 == scale2);
// }
//private:
// const GBSAOBCForce& force;
//};
//
//CudaCalcGBSAOBCForceKernel::~CudaCalcGBSAOBCForceKernel() {
// cu.setAsCurrent();
// if (params != NULL)
// delete params;
// if (bornSum != NULL)
// delete bornSum;
// if (longBornSum != NULL)
// delete longBornSum;
// if (bornRadii != NULL)
// delete bornRadii;
// if (bornForce != NULL)
// delete bornForce;
// if (longBornForce != NULL)
// delete longBornForce;
// if (obcChain != NULL)
// delete obcChain;
//}
//
//void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCForce& force) {
// cu.setAsCurrent();
// if (cu.getPlatformData().contexts.size() > 1)
// throw OpenMMException("GBSAOBCForce does not support using multiple CUDA devices");
// CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
// params = new CudaArray<mm_float2>(cu, cu.getPaddedNumAtoms(), "gbsaObcParams");
// bornRadii = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "bornRadii");
// obcChain = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "obcChain");
// if (cu.getSupports64BitGlobalAtomics()) {
// longBornSum = new CudaArray<cl_long>(cu, cu.getPaddedNumAtoms(), "longBornSum");
// longBornForce = new CudaArray<cl_long>(cu, cu.getPaddedNumAtoms(), "longBornForce");
// bornForce = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms(), "bornForce");
// cu.addAutoclearBuffer(longBornSum->getDevicePointer(), 2*longBornSum->getSize());
// cu.addAutoclearBuffer(longBornForce->getDevicePointer(), 2*longBornForce->getSize());
// }
// else {
// bornSum = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornSum");
// bornForce = new CudaArray<cl_float>(cu, cu.getPaddedNumAtoms()*nb.getNumForceBuffers(), "bornForce");
// cu.addAutoclearBuffer(bornSum->getDevicePointer(), bornSum->getSize());
// cu.addAutoclearBuffer(bornForce->getDevicePointer(), bornForce->getSize());
// }
// CudaArray<mm_float4>& posq = cu.getPosq();
// int numParticles = force.getNumParticles();
// vector<mm_float2> paramsVector(numParticles);
// const double dielectricOffset = 0.009;
// for (int i = 0; i < numParticles; i++) {
// double charge, radius, scalingFactor;
// force.getParticleParameters(i, charge, radius, scalingFactor);
// radius -= dielectricOffset;
// paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
// posq[i].w = (float) charge;
// }
// posq.upload();
// params->upload(paramsVector);
// prefactor = -ONE_4PI_EPS0*((1.0/force.getSoluteDielectric())-(1.0/force.getSolventDielectric()));
// bool useCutoff = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff);
// bool usePeriodic = (force.getNonbondedMethod() != GBSAOBCForce::NoCutoff && force.getNonbondedMethod() != GBSAOBCForce::CutoffNonPeriodic);
// string source = CudaKernelSources::gbsaObc2;
// nb.addInteraction(useCutoff, usePeriodic, false, force.getCutoffDistance(), vector<vector<int> >(), source, force.getForceGroup());
// nb.addParameter(CudaNonbondedUtilities::ParameterInfo("obcParams", "float", 2, sizeof(cl_float2), params->getDevicePointer()));;
// nb.addParameter(CudaNonbondedUtilities::ParameterInfo("bornForce", "float", 1, sizeof(cl_float), bornForce->getDevicePointer()));;
// cu.addForce(new CudaGBSAOBCForceInfo(nb.getNumForceBuffers(), force));
//}
//
//double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
// CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
// bool deviceIsCpu = (cu.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
// if (!hasCreatedKernels) {
// // These Kernels cannot be created in initialize(), because the CudaNonbondedUtilities has not been initialized yet then.
//
// hasCreatedKernels = true;
// maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
// map<string, string> defines;
// if (nb.getForceBufferPerAtomBlock())
// defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
// if (nb.getUseCutoff())
// defines["USE_CUTOFF"] = "1";
// if (nb.getUsePeriodic())
// defines["USE_PERIODIC"] = "1";
// defines["CUTOFF_SQUARED"] = cu.doubleToString(nb.getCutoffDistance()*nb.getCutoffDistance());
// defines["PREFACTOR"] = cu.doubleToString(prefactor);
// defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
// defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
// defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
// defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize());
// string platformVendor = cu::Platform(cu.getDevice().getInfo<CL_DEVICE_PLATFORM>()).getInfo<CL_PLATFORM_VENDOR>();
// if (platformVendor == "Apple")
// defines["USE_APPLE_WORKAROUND"] = "1";
// string file;
// if (deviceIsCpu)
// file = CudaKernelSources::gbsaObc_cpu;
// else if (cu.getSIMDWidth() == 32)
// file = CudaKernelSources::gbsaObc_nvidia;
// else
// file = CudaKernelSources::gbsaObc_default;
// CUmodule module = cu.createModule(file, defines);
// bool useLong = (cu.getSupports64BitGlobalAtomics() && !deviceIsCpu);
// int index = 0;
// computeBornSumKernel = cu.getKernel(module, "computeBornSum");
// computeBornSumKernel.setArg<cu::Buffer>(index++, (useLong ? longBornSum->getDevicePointer() : bornSum->getDevicePointer()));
// computeBornSumKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
// computeBornSumKernel.setArg<cu::Buffer>(index++, params->getDevicePointer());
// if (nb.getUseCutoff()) {
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getInteractingTiles().getDevicePointer());
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getInteractionCount().getDevicePointer());
// index += 2; // The periodic box size arguments are set when the kernel is executed.
// computeBornSumKernel.setArg<cl_uint>(index++, maxTiles);
// if (cu.getSIMDWidth() == 32 || deviceIsCpu)
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getInteractionFlags().getDevicePointer());
// }
// else
// computeBornSumKernel.setArg<cl_uint>(index++, cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
// if (cu.getSIMDWidth() == 32) {
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getExclusionIndices().getDevicePointer());
// computeBornSumKernel.setArg<cu::Buffer>(index++, nb.getExclusionRowIndices().getDevicePointer());
// }
// force1Kernel = cu.getKernel(module, "computeGBSAForce1");
// index = 0;
// force1Kernel.setArg<cu::Buffer>(index++, (useLong ? cu.getLongForceBuffer().getDevicePointer() : cu.getForceBuffers().getDevicePointer()));
// force1Kernel.setArg<cu::Buffer>(index++, (useLong ? longBornForce->getDevicePointer() : bornForce->getDevicePointer()));
// force1Kernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
// force1Kernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
// force1Kernel.setArg<cu::Buffer>(index++, bornRadii->getDevicePointer());
// if (nb.getUseCutoff()) {
// force1Kernel.setArg<cu::Buffer>(index++, nb.getInteractingTiles().getDevicePointer());
// force1Kernel.setArg<cu::Buffer>(index++, nb.getInteractionCount().getDevicePointer());
// index += 2; // The periodic box size arguments are set when the kernel is executed.
// force1Kernel.setArg<cl_uint>(index++, maxTiles);
// if (cu.getSIMDWidth() == 32 || deviceIsCpu)
// force1Kernel.setArg<cu::Buffer>(index++, nb.getInteractionFlags().getDevicePointer());
// }
// else
// force1Kernel.setArg<cl_uint>(index++, cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
// if (cu.getSIMDWidth() == 32) {
// force1Kernel.setArg<cu::Buffer>(index++, nb.getExclusionIndices().getDevicePointer());
// force1Kernel.setArg<cu::Buffer>(index++, nb.getExclusionRowIndices().getDevicePointer());
// }
// module = cu.createModule(CudaKernelSources::gbsaObcReductions, defines);
// reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
// reduceBornSumKernel.setArg<cl_int>(0, cu.getPaddedNumAtoms());
// reduceBornSumKernel.setArg<cl_int>(1, nb.getNumForceBuffers());
// reduceBornSumKernel.setArg<cl_float>(2, 1.0f);
// reduceBornSumKernel.setArg<cl_float>(3, 0.8f);
// reduceBornSumKernel.setArg<cl_float>(4, 4.85f);
// reduceBornSumKernel.setArg<cu::Buffer>(5, (useLong ? longBornSum->getDevicePointer() : bornSum->getDevicePointer()));
// reduceBornSumKernel.setArg<cu::Buffer>(6, params->getDevicePointer());
// reduceBornSumKernel.setArg<cu::Buffer>(7, bornRadii->getDevicePointer());
// reduceBornSumKernel.setArg<cu::Buffer>(8, obcChain->getDevicePointer());
// reduceBornForceKernel = cu.getKernel(module, "reduceBornForce");
// index = 0;
// reduceBornForceKernel.setArg<cl_int>(index++, cu.getPaddedNumAtoms());
// reduceBornForceKernel.setArg<cl_int>(index++, nb.getNumForceBuffers());
// reduceBornForceKernel.setArg<cu::Buffer>(index++, bornForce->getDevicePointer());
// if (useLong)
// reduceBornForceKernel.setArg<cu::Buffer>(index++, longBornForce->getDevicePointer());
// reduceBornForceKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
// reduceBornForceKernel.setArg<cu::Buffer>(index++, params->getDevicePointer());
// reduceBornForceKernel.setArg<cu::Buffer>(index++, bornRadii->getDevicePointer());
// reduceBornForceKernel.setArg<cu::Buffer>(index++, obcChain->getDevicePointer());
// }
// if (nb.getUseCutoff()) {
// computeBornSumKernel.setArg<mm_float4>(5, cu.getPeriodicBoxSize());
// computeBornSumKernel.setArg<mm_float4>(6, cu.getInvPeriodicBoxSize());
// force1Kernel.setArg<mm_float4>(7, cu.getPeriodicBoxSize());
// force1Kernel.setArg<mm_float4>(8, cu.getInvPeriodicBoxSize());
// if (maxTiles < nb.getInteractingTiles().getSize()) {
// maxTiles = nb.getInteractingTiles().getSize();
// computeBornSumKernel.setArg<cu::Buffer>(3, nb.getInteractingTiles().getDevicePointer());
// computeBornSumKernel.setArg<cl_uint>(7, maxTiles);
// force1Kernel.setArg<cu::Buffer>(5, nb.getInteractingTiles().getDevicePointer());
// force1Kernel.setArg<cl_uint>(9, maxTiles);
// if (cu.getSIMDWidth() == 32 || deviceIsCpu) {
// computeBornSumKernel.setArg<cu::Buffer>(8, nb.getInteractionFlags().getDevicePointer());
// force1Kernel.setArg<cu::Buffer>(10, nb.getInteractionFlags().getDevicePointer());
// }
// }
// }
// cu.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
// cu.executeKernel(reduceBornSumKernel, cu.getPaddedNumAtoms());
// cu.executeKernel(force1Kernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
// cu.executeKernel(reduceBornForceKernel, cu.getPaddedNumAtoms());
// return 0.0;
//}
//
//void CudaCalcGBSAOBCForceKernel::copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force) {
// // Make sure the new parameters are acceptable.
//
// cu.setAsCurrent();
// int numParticles = force.getNumParticles();
// if (numParticles != cu.getNumAtoms())
// throw OpenMMException("updateParametersInContext: The number of particles has changed");
//
// // Record the per-particle parameters.
//
// CudaArray<mm_float4>& posq = cu.getPosq();
// posq.download();
// vector<mm_float2> paramsVector(numParticles);
// const double dielectricOffset = 0.009;
// for (int i = 0; i < numParticles; i++) {
// double charge, radius, scalingFactor;
// force.getParticleParameters(i, charge, radius, scalingFactor);
// radius -= dielectricOffset;
// paramsVector[i] = mm_float2((float) radius, (float) (scalingFactor*radius));
// posq[i].w = (float) charge;
// }
// posq.upload();
// params->upload(paramsVector);
//
// // Mark that the current reordering may be invalid.
//
// cu.invalidateMolecules();
//}
//
class
CudaGBSAOBCForceInfo
:
public
CudaForceInfo
{
public:
CudaGBSAOBCForceInfo
(
const
GBSAOBCForce
&
force
)
:
force
(
force
)
{
}
bool
areParticlesIdentical
(
int
particle1
,
int
particle2
)
{
double
charge1
,
charge2
,
radius1
,
radius2
,
scale1
,
scale2
;
force
.
getParticleParameters
(
particle1
,
charge1
,
radius1
,
scale1
);
force
.
getParticleParameters
(
particle2
,
charge2
,
radius2
,
scale2
);
return
(
charge1
==
charge2
&&
radius1
==
radius2
&&
scale1
==
scale2
);
}
private:
const
GBSAOBCForce
&
force
;
};
CudaCalcGBSAOBCForceKernel
::~
CudaCalcGBSAOBCForceKernel
()
{
cu
.
setAsCurrent
();
if
(
params
!=
NULL
)
delete
params
;
if
(
bornSum
!=
NULL
)
delete
bornSum
;
if
(
bornRadii
!=
NULL
)
delete
bornRadii
;
if
(
bornForce
!=
NULL
)
delete
bornForce
;
if
(
obcChain
!=
NULL
)
delete
obcChain
;
}
void
CudaCalcGBSAOBCForceKernel
::
initialize
(
const
System
&
system
,
const
GBSAOBCForce
&
force
)
{
cu
.
setAsCurrent
();
if
(
cu
.
getPlatformData
().
contexts
.
size
()
>
1
)
throw
OpenMMException
(
"GBSAOBCForce does not support using multiple CUDA devices"
);
CudaNonbondedUtilities
&
nb
=
cu
.
getNonbondedUtilities
();
params
=
CudaArray
::
create
<
float2
>
(
cu
,
cu
.
getPaddedNumAtoms
(),
"gbsaObcParams"
);
if
(
cu
.
getUseDoublePrecision
())
{
bornRadii
=
CudaArray
::
create
<
double
>
(
cu
,
cu
.
getPaddedNumAtoms
(),
"bornRadii"
);
obcChain
=
CudaArray
::
create
<
double
>
(
cu
,
cu
.
getPaddedNumAtoms
(),
"obcChain"
);
}
else
{
bornRadii
=
CudaArray
::
create
<
float
>
(
cu
,
cu
.
getPaddedNumAtoms
(),
"bornRadii"
);
obcChain
=
CudaArray
::
create
<
float
>
(
cu
,
cu
.
getPaddedNumAtoms
(),
"obcChain"
);
}
bornSum
=
CudaArray
::
create
<
long
long
>
(
cu
,
cu
.
getPaddedNumAtoms
(),
"bornSum"
);
bornForce
=
CudaArray
::
create
<
long
long
>
(
cu
,
cu
.
getPaddedNumAtoms
(),
"bornForce"
);
cu
.
addAutoclearBuffer
(
bornSum
->
getDevicePointer
(),
bornSum
->
getSize
()
*
sizeof
(
long
long
));
cu
.
addAutoclearBuffer
(
bornForce
->
getDevicePointer
(),
bornForce
->
getSize
()
*
sizeof
(
long
long
));
CudaArray
&
posq
=
cu
.
getPosq
();
float4
*
posqf
=
(
float4
*
)
cu
.
getPinnedBuffer
();
double4
*
posqd
=
(
double4
*
)
cu
.
getPinnedBuffer
();
vector
<
float2
>
paramsVector
(
cu
.
getPaddedNumAtoms
());
const
double
dielectricOffset
=
0.009
;
for
(
int
i
=
0
;
i
<
force
.
getNumParticles
();
i
++
)
{
double
charge
,
radius
,
scalingFactor
;
force
.
getParticleParameters
(
i
,
charge
,
radius
,
scalingFactor
);
radius
-=
dielectricOffset
;
paramsVector
[
i
]
=
make_float2
((
float
)
radius
,
(
float
)
(
scalingFactor
*
radius
));
if
(
cu
.
getUseDoublePrecision
())
posqd
[
i
]
=
make_double4
(
0
,
0
,
0
,
charge
);
else
posqf
[
i
]
=
make_float4
(
0
,
0
,
0
,
(
float
)
charge
);
}
posq
.
upload
(
cu
.
getPinnedBuffer
());
params
->
upload
(
paramsVector
);
prefactor
=
-
ONE_4PI_EPS0
*
((
1.0
/
force
.
getSoluteDielectric
())
-
(
1.0
/
force
.
getSolventDielectric
()));
bool
useCutoff
=
(
force
.
getNonbondedMethod
()
!=
GBSAOBCForce
::
NoCutoff
);
bool
usePeriodic
=
(
force
.
getNonbondedMethod
()
!=
GBSAOBCForce
::
NoCutoff
&&
force
.
getNonbondedMethod
()
!=
GBSAOBCForce
::
CutoffNonPeriodic
);
string
source
=
CudaKernelSources
::
gbsaObc2
;
nb
.
addInteraction
(
useCutoff
,
usePeriodic
,
false
,
force
.
getCutoffDistance
(),
vector
<
vector
<
int
>
>
(),
source
,
force
.
getForceGroup
());
nb
.
addParameter
(
CudaNonbondedUtilities
::
ParameterInfo
(
"obcParams"
,
"float"
,
2
,
sizeof
(
float2
),
params
->
getDevicePointer
()));;
nb
.
addParameter
(
CudaNonbondedUtilities
::
ParameterInfo
(
"bornForce"
,
"long long"
,
1
,
sizeof
(
long
long
),
bornForce
->
getDevicePointer
()));;
cu
.
addForce
(
new
CudaGBSAOBCForceInfo
(
force
));
}
double
CudaCalcGBSAOBCForceKernel
::
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
)
{
CudaNonbondedUtilities
&
nb
=
cu
.
getNonbondedUtilities
();
if
(
!
hasCreatedKernels
)
{
// These Kernels cannot be created in initialize(), because the CudaNonbondedUtilities has not been initialized yet then.
hasCreatedKernels
=
true
;
maxTiles
=
(
nb
.
getUseCutoff
()
?
nb
.
getInteractingTiles
().
getSize
()
:
cu
.
getNumAtomBlocks
()
*
(
cu
.
getNumAtomBlocks
()
+
1
)
/
2
);
map
<
string
,
string
>
defines
;
if
(
nb
.
getUseCutoff
())
defines
[
"USE_CUTOFF"
]
=
"1"
;
if
(
nb
.
getUsePeriodic
())
defines
[
"USE_PERIODIC"
]
=
"1"
;
defines
[
"CUTOFF_SQUARED"
]
=
cu
.
doubleToString
(
nb
.
getCutoffDistance
()
*
nb
.
getCutoffDistance
());
defines
[
"PREFACTOR"
]
=
cu
.
doubleToString
(
prefactor
);
defines
[
"NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getNumAtoms
());
defines
[
"PADDED_NUM_ATOMS"
]
=
cu
.
intToString
(
cu
.
getPaddedNumAtoms
());
defines
[
"NUM_BLOCKS"
]
=
cu
.
intToString
(
cu
.
getNumAtomBlocks
());
defines
[
"FORCE_WORK_GROUP_SIZE"
]
=
cu
.
intToString
(
nb
.
getForceThreadBlockSize
());
CUmodule
module
=
cu
.
createModule
(
CudaKernelSources
::
vectorOps
+
CudaKernelSources
::
gbsaObc1
,
defines
);
computeBornSumKernel
=
cu
.
getKernel
(
module
,
"computeBornSum"
);
computeSumArgs
.
push_back
(
&
bornSum
->
getDevicePointer
());
computeSumArgs
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
params
->
getDevicePointer
());
if
(
nb
.
getUseCutoff
())
{
computeSumArgs
.
push_back
(
&
nb
.
getInteractingTiles
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
nb
.
getInteractionCount
().
getDevicePointer
());
computeSumArgs
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
computeSumArgs
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
computeSumArgs
.
push_back
(
&
maxTiles
);
computeSumArgs
.
push_back
(
&
nb
.
getInteractionFlags
().
getDevicePointer
());
}
else
computeSumArgs
.
push_back
(
&
maxTiles
);
computeSumArgs
.
push_back
(
&
nb
.
getExclusionIndices
().
getDevicePointer
());
computeSumArgs
.
push_back
(
&
nb
.
getExclusionRowIndices
().
getDevicePointer
());
force1Kernel
=
cu
.
getKernel
(
module
,
"computeGBSAForce1"
);
force1Args
.
push_back
(
&
cu
.
getForce
().
getDevicePointer
());
force1Args
.
push_back
(
&
bornForce
->
getDevicePointer
());
force1Args
.
push_back
(
&
cu
.
getEnergyBuffer
().
getDevicePointer
());
force1Args
.
push_back
(
&
cu
.
getPosq
().
getDevicePointer
());
force1Args
.
push_back
(
&
bornRadii
->
getDevicePointer
());
if
(
nb
.
getUseCutoff
())
{
force1Args
.
push_back
(
&
nb
.
getInteractingTiles
().
getDevicePointer
());
force1Args
.
push_back
(
&
nb
.
getInteractionCount
().
getDevicePointer
());
force1Args
.
push_back
(
cu
.
getPeriodicBoxSizePointer
());
force1Args
.
push_back
(
cu
.
getInvPeriodicBoxSizePointer
());
force1Args
.
push_back
(
&
maxTiles
);
force1Args
.
push_back
(
&
nb
.
getInteractionFlags
().
getDevicePointer
());
}
else
force1Args
.
push_back
(
&
maxTiles
);
force1Args
.
push_back
(
&
nb
.
getExclusionIndices
().
getDevicePointer
());
force1Args
.
push_back
(
&
nb
.
getExclusionRowIndices
().
getDevicePointer
());
module
=
cu
.
createModule
(
CudaKernelSources
::
gbsaObcReductions
,
defines
);
reduceBornSumKernel
=
cu
.
getKernel
(
module
,
"reduceBornSum"
);
reduceBornForceKernel
=
cu
.
getKernel
(
module
,
"reduceBornForce"
);
}
if
(
nb
.
getUseCutoff
())
{
if
(
maxTiles
<
nb
.
getInteractingTiles
().
getSize
())
{
maxTiles
=
nb
.
getInteractingTiles
().
getSize
();
computeSumArgs
[
3
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
force1Args
[
5
]
=
&
nb
.
getInteractingTiles
().
getDevicePointer
();
computeSumArgs
[
8
]
=
&
nb
.
getInteractionFlags
().
getDevicePointer
();
force1Args
[
10
]
=
&
nb
.
getInteractionFlags
().
getDevicePointer
();
}
}
cu
.
executeKernel
(
computeBornSumKernel
,
&
computeSumArgs
[
0
],
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
float
alpha
=
1.0
f
,
beta
=
0.8
f
,
gamma
=
4.85
f
;
void
*
reduceSumArgs
[]
=
{
&
alpha
,
&
beta
,
&
gamma
,
&
bornSum
->
getDevicePointer
(),
&
params
->
getDevicePointer
(),
&
bornRadii
->
getDevicePointer
(),
&
obcChain
->
getDevicePointer
()};
cu
.
executeKernel
(
reduceBornSumKernel
,
reduceSumArgs
,
cu
.
getPaddedNumAtoms
());
cu
.
executeKernel
(
force1Kernel
,
&
force1Args
[
0
],
nb
.
getNumForceThreadBlocks
()
*
nb
.
getForceThreadBlockSize
(),
nb
.
getForceThreadBlockSize
());
void
*
reduceForceArgs
[]
=
{
&
bornForce
->
getDevicePointer
(),
&
cu
.
getEnergyBuffer
().
getDevicePointer
(),
&
params
->
getDevicePointer
(),
&
bornRadii
->
getDevicePointer
(),
&
obcChain
->
getDevicePointer
()};
cu
.
executeKernel
(
reduceBornForceKernel
,
&
reduceForceArgs
[
0
],
cu
.
getPaddedNumAtoms
());
return
0.0
;
}
void
CudaCalcGBSAOBCForceKernel
::
copyParametersToContext
(
ContextImpl
&
context
,
const
GBSAOBCForce
&
force
)
{
// Make sure the new parameters are acceptable.
cu
.
setAsCurrent
();
int
numParticles
=
force
.
getNumParticles
();
if
(
numParticles
!=
cu
.
getNumAtoms
())
throw
OpenMMException
(
"updateParametersInContext: The number of particles has changed"
);
// Record the per-particle parameters.
CudaArray
&
posq
=
cu
.
getPosq
();
float4
*
posqf
=
(
float4
*
)
cu
.
getPinnedBuffer
();
double4
*
posqd
=
(
double4
*
)
cu
.
getPinnedBuffer
();
posq
.
download
(
cu
.
getPinnedBuffer
());
vector
<
float2
>
paramsVector
(
cu
.
getPaddedNumAtoms
());
const
double
dielectricOffset
=
0.009
;
for
(
int
i
=
0
;
i
<
numParticles
;
i
++
)
{
double
charge
,
radius
,
scalingFactor
;
force
.
getParticleParameters
(
i
,
charge
,
radius
,
scalingFactor
);
radius
-=
dielectricOffset
;
paramsVector
[
i
]
=
make_float2
((
float
)
radius
,
(
float
)
(
scalingFactor
*
radius
));
if
(
cu
.
getUseDoublePrecision
())
posqd
[
i
].
w
=
charge
;
else
posqf
[
i
].
w
=
(
float
)
charge
;
}
posq
.
upload
(
cu
.
getPinnedBuffer
());
params
->
upload
(
paramsVector
);
// Mark that the current reordering may be invalid.
cu
.
invalidateMolecules
();
}
//class CudaCustomGBForceInfo : public CudaForceInfo {
//public:
// CudaCustomGBForceInfo(int requiredBuffers, const CustomGBForce& force) : CudaForceInfo(requiredBuffers), force(force) {
...
...
@@ -2870,7 +2838,7 @@ void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& co
// perParticleValueKernel.setArg<cu::Buffer>(index++, tabulatedFunctionParams->getDevicePointer());
// }
// index = 0;
// pairEnergyKernel.setArg<cu::Buffer>(index++, useLong ? cu.getLongForceBuffer().getDevicePointer() : cu.getForce
Buffers
().getDevicePointer());
// pairEnergyKernel.setArg<cu::Buffer>(index++, useLong ? cu.getLongForceBuffer().getDevicePointer() : cu.getForce().getDevicePointer());
// pairEnergyKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
// pairEnergyKernel.setArg(index++, (deviceIsCpu ? CudaContext::TileSize : nb.getForceThreadBlockSize())*sizeof(cl_float4), NULL);
// pairEnergyKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
...
...
@@ -2927,7 +2895,7 @@ void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& co
// index = 0;
// perParticleEnergyKernel.setArg<cl_int>(index++, cu.getPaddedNumAtoms());
// perParticleEnergyKernel.setArg<cl_int>(index++, nb.getNumForceBuffers());
// perParticleEnergyKernel.setArg<cu::Buffer>(index++, cu.getForce
Buffers
().getDevicePointer());
// perParticleEnergyKernel.setArg<cu::Buffer>(index++, cu.getForce().getDevicePointer());
// perParticleEnergyKernel.setArg<cu::Buffer>(index++, cu.getEnergyBuffer().getDevicePointer());
// perParticleEnergyKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
// if (globals != NULL)
...
...
@@ -2947,7 +2915,7 @@ void CudaCalcCustomNonbondedForceKernel::copyParametersToContext(ContextImpl& co
// }
// if (needParameterGradient) {
// index = 0;
// gradientChainRuleKernel.setArg<cu::Buffer>(index++, cu.getForce
Buffers
().getDevicePointer());
// gradientChainRuleKernel.setArg<cu::Buffer>(index++, cu.getForce().getDevicePointer());
// gradientChainRuleKernel.setArg<cu::Buffer>(index++, cu.getPosq().getDevicePointer());
// if (globals != NULL)
// gradientChainRuleKernel.setArg<cu::Buffer>(index++, globals->getDevicePointer());
...
...
platforms/cuda2/src/CudaKernels.h
View file @
17ae3aae
...
...
@@ -666,57 +666,55 @@ private:
System
&
system
;
};
///**
// * This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
// */
//class CudaCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
//public:
// CudaCalcGBSAOBCForceKernel(std::string name, const Platform& platform, CudaContext& cu) : CalcGBSAOBCForceKernel(name, platform), cu(cu),
// hasCreatedKernels(false), params(NULL), bornSum(NULL), longBornSum(NULL), bornRadii(NULL), bornForce(NULL),
// longBornForce(NULL), obcChain(NULL) {
// }
// ~CudaCalcGBSAOBCForceKernel();
// /**
// * Initialize the kernel.
// *
// * @param system the System this kernel will be applied to
// * @param force the GBSAOBCForce this kernel will be used for
// */
// void initialize(const System& system, const GBSAOBCForce& force);
// /**
// * Execute the kernel to calculate the forces and/or energy.
// *
// * @param context the context in which to execute this kernel
// * @param includeForces true if forces should be calculated
// * @param includeEnergy true if the energy should be calculated
// * @return the potential energy due to the force
// */
// double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
// /**
// * Copy changed parameters over to a context.
// *
// * @param context the context to copy parameters to
// * @param force the GBSAOBCForce to copy the parameters from
// */
// void copyParametersToContext(ContextImpl& context, const GBSAOBCForce& force);
//private:
// double prefactor;
// bool hasCreatedKernels;
// int maxTiles;
// CudaContext& cu;
// CudaArray<mm_float2>* params;
// CudaArray<cl_float>* bornSum;
// CudaArray<cl_long>* longBornSum;
// CudaArray<cl_float>* bornRadii;
// CudaArray<cl_float>* bornForce;
// CudaArray<cl_long>* longBornForce;
// CudaArray<cl_float>* obcChain;
// CUfunction computeBornSumKernel;
// CUfunction reduceBornSumKernel;
// CUfunction force1Kernel;
// CUfunction reduceBornForceKernel;
//};
//
/**
* This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
*/
class
CudaCalcGBSAOBCForceKernel
:
public
CalcGBSAOBCForceKernel
{
public:
CudaCalcGBSAOBCForceKernel
(
std
::
string
name
,
const
Platform
&
platform
,
CudaContext
&
cu
)
:
CalcGBSAOBCForceKernel
(
name
,
platform
),
cu
(
cu
),
hasCreatedKernels
(
false
),
params
(
NULL
),
bornSum
(
NULL
),
bornRadii
(
NULL
),
bornForce
(
NULL
),
obcChain
(
NULL
)
{
}
~
CudaCalcGBSAOBCForceKernel
();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the GBSAOBCForce this kernel will be used for
*/
void
initialize
(
const
System
&
system
,
const
GBSAOBCForce
&
force
);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double
execute
(
ContextImpl
&
context
,
bool
includeForces
,
bool
includeEnergy
);
/**
* Copy changed parameters over to a context.
*
* @param context the context to copy parameters to
* @param force the GBSAOBCForce to copy the parameters from
*/
void
copyParametersToContext
(
ContextImpl
&
context
,
const
GBSAOBCForce
&
force
);
private:
double
prefactor
;
bool
hasCreatedKernels
;
int
maxTiles
;
CudaContext
&
cu
;
CudaArray
*
params
;
CudaArray
*
bornSum
;
CudaArray
*
bornRadii
;
CudaArray
*
bornForce
;
CudaArray
*
obcChain
;
CUfunction
computeBornSumKernel
;
CUfunction
reduceBornSumKernel
;
CUfunction
force1Kernel
;
CUfunction
reduceBornForceKernel
;
std
::
vector
<
void
*>
computeSumArgs
,
force1Args
;
};
///**
// * This kernel is invoked by CustomGBForce to calculate the forces acting on the system.
// */
...
...
platforms/cuda2/src/kernels/gbsaObc1.cu
0 → 100644
View file @
17ae3aae
#define TILE_SIZE 32
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef
struct
{
real
x
,
y
,
z
;
real
q
;
float
radius
,
scaledRadius
;
real
bornSum
;
}
AtomData1
;
/**
* Compute the Born sum.
*/
extern
"C"
__global__
void
computeBornSum
(
unsigned
long
long
*
__restrict__
global_bornSum
,
const
real4
*
__restrict__
posq
,
const
float2
*
__restrict__
global_params
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interactionFlags
,
#else
unsigned
int
numTiles
,
#endif
unsigned
int
*
exclusionIndices
,
unsigned
int
*
exclusionRowIndices
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData1
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
real
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
real
bornSum
=
0
;
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
sqrt
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
float2
params1
=
global_params
[
atom1
];
if
(
pos
>=
end
)
;
// This warp is done.
else
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
threadIdx
.
x
].
y
=
posq1
.
y
;
localData
[
threadIdx
.
x
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
q
=
posq1
.
w
;
localData
[
threadIdx
.
x
].
radius
=
params1
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
params1
.
y
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
j
].
x
-
posq1
.
x
,
localData
[
tbx
+
j
].
y
-
posq1
.
y
,
localData
[
tbx
+
j
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
float2
params2
=
make_float2
(
localData
[
tbx
+
j
].
radius
,
localData
[
tbx
+
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
((
j
!=
tgx
)
&&
(
params1
.
x
<
rScaledRadiusJ
))
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params1
.
x
<
params2
.
y
-
r
)
bornSum
+=
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
);
}
}
}
}
else
{
// This is an off-diagonal tile.
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
float2
tempParams
=
global_params
[
j
];
localData
[
threadIdx
.
x
].
radius
=
tempParams
.
x
;
localData
[
threadIdx
.
x
].
scaledRadius
=
tempParams
.
y
;
}
localData
[
threadIdx
.
x
].
bornSum
=
0.0
f
;
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
bool
computeSubset
=
false
;
if
(
flags
!=
0xFFFFFFFF
)
{
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
computeSubset
=
(
exclusionIndex
[
localGroupIndex
]
==
-
1
);
}
if
(
computeSubset
)
{
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
j
].
x
-
posq1
.
x
,
localData
[
tbx
+
j
].
y
-
posq1
.
y
,
localData
[
tbx
+
j
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
tempBuffer
[
threadIdx
.
x
]
=
0.0
f
;
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
float2
params2
=
make_float2
(
localData
[
tbx
+
j
].
radius
,
localData
[
tbx
+
j
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params1
.
x
<
params2
.
y
-
r
)
bornSum
+=
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
);
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
real
l_ij
=
RECIP
(
max
(
params2
.
x
,
fabs
(
r
-
params1
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusI
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params2
.
x
<
params1
.
y
-
r
)
term
+=
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
);
tempBuffer
[
threadIdx
.
x
]
=
term
;
}
}
// Sum the forces on atom j.
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
if
(
tgx
==
0
)
localData
[
tbx
+
j
].
bornSum
+=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
4
]
+
tempBuffer
[
threadIdx
.
x
+
8
]
+
tempBuffer
[
threadIdx
.
x
+
12
]
+
tempBuffer
[
threadIdx
.
x
+
16
]
+
tempBuffer
[
threadIdx
.
x
+
20
]
+
tempBuffer
[
threadIdx
.
x
+
24
]
+
tempBuffer
[
threadIdx
.
x
+
28
];
}
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
real3
delta
=
make_real3
(
localData
[
tbx
+
tj
].
x
-
posq1
.
x
,
localData
[
tbx
+
tj
].
y
-
posq1
.
y
,
localData
[
tbx
+
tj
].
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
&&
r2
<
CUTOFF_SQUARED
)
{
#else
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
float2
params2
=
make_float2
(
localData
[
tbx
+
tj
].
radius
,
localData
[
tbx
+
tj
].
scaledRadius
);
real
rScaledRadiusJ
=
r
+
params2
.
y
;
if
(
params1
.
x
<
rScaledRadiusJ
)
{
real
l_ij
=
RECIP
(
max
(
params1
.
x
,
fabs
(
r
-
params2
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusJ
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
bornSum
+=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params2
.
y
*
params2
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params1
.
x
<
params2
.
y
-
r
)
bornSum
+=
2.0
f
*
(
RECIP
(
params1
.
x
)
-
l_ij
);
}
real
rScaledRadiusI
=
r
+
params1
.
y
;
if
(
params2
.
x
<
rScaledRadiusI
)
{
real
l_ij
=
RECIP
(
max
(
params2
.
x
,
fabs
(
r
-
params1
.
y
)));
real
u_ij
=
RECIP
(
rScaledRadiusI
);
real
l_ij2
=
l_ij
*
l_ij
;
real
u_ij2
=
u_ij
*
u_ij
;
real
ratio
=
LOG
(
u_ij
*
RECIP
(
l_ij
));
real
term
=
l_ij
-
u_ij
+
(
0.50
f
*
invR
*
ratio
)
+
0.25
f
*
(
r
*
(
u_ij2
-
l_ij2
)
+
(
params1
.
y
*
params1
.
y
*
invR
)
*
(
l_ij2
-
u_ij2
));
if
(
params2
.
x
<
params1
.
y
-
r
)
term
+=
2.0
f
*
(
RECIP
(
params2
.
x
)
-
l_ij
);
localData
[
tbx
+
tj
].
bornSum
+=
term
;
}
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
// Write results.
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
bornSum
*
0xFFFFFFFF
)));
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
global_bornSum
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
bornSum
*
0xFFFFFFFF
)));
}
lasty
=
y
;
pos
++
;
}
while
(
pos
<
end
);
}
typedef
struct
{
real
x
,
y
,
z
;
real
q
;
real
fx
,
fy
,
fz
,
fw
;
real
bornRadius
;
}
AtomData2
;
/**
* First part of computing the GBSA interaction.
*/
extern
"C"
__global__
void
computeGBSAForce1
(
unsigned
long
long
*
__restrict__
forceBuffers
,
unsigned
long
long
*
__restrict__
global_bornForce
,
real
*
__restrict__
energyBuffer
,
const
real4
*
__restrict__
posq
,
const
real
*
__restrict__
global_bornRadii
,
#ifdef USE_CUTOFF
const
ushort2
*
__restrict__
tiles
,
const
unsigned
int
*
__restrict__
interactionCount
,
real4
periodicBoxSize
,
real4
invPeriodicBoxSize
,
unsigned
int
maxTiles
,
const
unsigned
int
*
__restrict__
interactionFlags
,
#else
unsigned
int
numTiles
,
#endif
unsigned
int
*
exclusionIndices
,
unsigned
int
*
exclusionRowIndices
)
{
unsigned
int
totalWarps
=
(
blockDim
.
x
*
gridDim
.
x
)
/
TILE_SIZE
;
unsigned
int
warp
=
(
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
)
/
TILE_SIZE
;
#ifdef USE_CUTOFF
unsigned
int
numTiles
=
interactionCount
[
0
];
unsigned
int
pos
=
warp
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
(
numTiles
>
maxTiles
?
NUM_BLOCKS
*
(
NUM_BLOCKS
+
1
)
/
2
:
numTiles
)
/
totalWarps
;
#else
unsigned
int
pos
=
warp
*
numTiles
/
totalWarps
;
unsigned
int
end
=
(
warp
+
1
)
*
numTiles
/
totalWarps
;
#endif
real
energy
=
0
;
unsigned
int
lasty
=
0xFFFFFFFF
;
__shared__
AtomData2
localData
[
FORCE_WORK_GROUP_SIZE
];
__shared__
real4
tempBuffer
[
FORCE_WORK_GROUP_SIZE
];
__shared__
unsigned
int
exclusionRange
[
2
*
WARPS_PER_GROUP
];
__shared__
int
exclusionIndex
[
WARPS_PER_GROUP
];
do
{
// Extract the coordinates of this tile
const
unsigned
int
tgx
=
threadIdx
.
x
&
(
TILE_SIZE
-
1
);
const
unsigned
int
tbx
=
threadIdx
.
x
-
tgx
;
const
unsigned
int
localGroupIndex
=
threadIdx
.
x
/
TILE_SIZE
;
unsigned
int
x
,
y
;
real4
force
=
make_real4
(
0
);
if
(
pos
<
end
)
{
#ifdef USE_CUTOFF
if
(
numTiles
<=
maxTiles
)
{
ushort2
tileIndices
=
tiles
[
pos
];
x
=
tileIndices
.
x
;
y
=
tileIndices
.
y
;
}
else
#endif
{
y
=
(
unsigned
int
)
floor
(
NUM_BLOCKS
+
0.5
f
-
sqrt
((
NUM_BLOCKS
+
0.5
f
)
*
(
NUM_BLOCKS
+
0.5
f
)
-
2
*
pos
));
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
if
(
x
<
y
||
x
>=
NUM_BLOCKS
)
{
// Occasionally happens due to roundoff error.
y
+=
(
x
<
y
?
-
1
:
1
);
x
=
(
pos
-
y
*
NUM_BLOCKS
+
y
*
(
y
+
1
)
/
2
);
}
}
unsigned
int
atom1
=
x
*
TILE_SIZE
+
tgx
;
real4
posq1
=
posq
[
atom1
];
real
bornRadius1
=
global_bornRadii
[
atom1
];
if
(
x
==
y
)
{
// This tile is on the diagonal.
localData
[
threadIdx
.
x
].
x
=
posq1
.
x
;
localData
[
threadIdx
.
x
].
y
=
posq1
.
y
;
localData
[
threadIdx
.
x
].
z
=
posq1
.
z
;
localData
[
threadIdx
.
x
].
q
=
posq1
.
w
;
localData
[
threadIdx
.
x
].
bornRadius
=
bornRadius1
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
j
<
NUM_ATOMS
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
,
localData
[
tbx
+
j
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
bornRadius2
=
localData
[
tbx
+
j
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
real
denominator
=
SQRT
(
denominator2
);
real
tempEnergy
=
(
PREFACTOR
*
posq1
.
w
*
posq2
.
w
)
*
RECIP
(
denominator
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
energy
+=
0.5
f
*
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
#ifdef USE_CUTOFF
}
#endif
}
}
}
else
{
// This is an off-diagonal tile.
if
(
lasty
!=
y
)
{
unsigned
int
j
=
y
*
TILE_SIZE
+
tgx
;
real4
tempPosq
=
posq
[
j
];
localData
[
threadIdx
.
x
].
x
=
tempPosq
.
x
;
localData
[
threadIdx
.
x
].
y
=
tempPosq
.
y
;
localData
[
threadIdx
.
x
].
z
=
tempPosq
.
z
;
localData
[
threadIdx
.
x
].
q
=
tempPosq
.
w
;
localData
[
threadIdx
.
x
].
bornRadius
=
global_bornRadii
[
j
];
}
localData
[
threadIdx
.
x
].
fx
=
0.0
f
;
localData
[
threadIdx
.
x
].
fy
=
0.0
f
;
localData
[
threadIdx
.
x
].
fz
=
0.0
f
;
localData
[
threadIdx
.
x
].
fw
=
0.0
f
;
#ifdef USE_CUTOFF
unsigned
int
flags
=
(
numTiles
<=
maxTiles
?
interactionFlags
[
pos
]
:
0xFFFFFFFF
);
bool
computeSubset
=
false
;
if
(
flags
!=
0xFFFFFFFF
)
{
if
(
tgx
<
2
)
exclusionRange
[
2
*
localGroupIndex
+
tgx
]
=
exclusionRowIndices
[
x
+
tgx
];
if
(
tgx
==
0
)
exclusionIndex
[
localGroupIndex
]
=
-
1
;
for
(
unsigned
int
i
=
exclusionRange
[
2
*
localGroupIndex
]
+
tgx
;
i
<
exclusionRange
[
2
*
localGroupIndex
+
1
];
i
+=
TILE_SIZE
)
if
(
exclusionIndices
[
i
]
==
y
)
exclusionIndex
[
localGroupIndex
]
=
i
*
TILE_SIZE
;
computeSubset
=
(
exclusionIndex
[
localGroupIndex
]
==
-
1
);
}
if
(
computeSubset
)
{
if
(
flags
==
0
)
{
// No interactions in this tile.
}
else
{
// Compute only a subset of the interactions in this tile.
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
((
flags
&
(
1
<<
j
))
!=
0
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
j
].
x
,
localData
[
tbx
+
j
].
y
,
localData
[
tbx
+
j
].
z
,
localData
[
tbx
+
j
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
bornRadius2
=
localData
[
tbx
+
j
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
real
denominator
=
SQRT
(
denominator2
);
real
tempEnergy
=
(
PREFACTOR
*
posq1
.
w
*
posq2
.
w
)
*
RECIP
(
denominator
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
#ifdef USE_CUTOFF
if
(
atom1
>=
NUM_ATOMS
||
y
*
TILE_SIZE
+
j
>=
NUM_ATOMS
||
r2
>
CUTOFF_SQUARED
)
{
#else
if
(
atom1
>=
NUM_ATOMS
||
y
*
TILE_SIZE
+
j
>=
NUM_ATOMS
)
{
#endif
dEdR
=
0.0
f
;
dGpol_dalpha2_ij
=
0.0
f
;
tempEnergy
=
0.0
f
;
}
energy
+=
tempEnergy
;
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
tempBuffer
[
threadIdx
.
x
]
=
make_real4
(
delta
.
x
,
delta
.
y
,
delta
.
z
,
dGpol_dalpha2_ij
*
bornRadius1
);
#ifdef USE_CUTOFF
}
else
tempBuffer
[
threadIdx
.
x
]
=
make_real4
(
0
);
#endif
// Sum the forces on atom j.
if
(
tgx
%
4
==
0
)
tempBuffer
[
threadIdx
.
x
]
+=
tempBuffer
[
threadIdx
.
x
+
1
]
+
tempBuffer
[
threadIdx
.
x
+
2
]
+
tempBuffer
[
threadIdx
.
x
+
3
];
if
(
tgx
==
0
)
{
real4
sum
=
tempBuffer
[
threadIdx
.
x
]
+
tempBuffer
[
threadIdx
.
x
+
4
]
+
tempBuffer
[
threadIdx
.
x
+
8
]
+
tempBuffer
[
threadIdx
.
x
+
12
]
+
tempBuffer
[
threadIdx
.
x
+
16
]
+
tempBuffer
[
threadIdx
.
x
+
20
]
+
tempBuffer
[
threadIdx
.
x
+
24
]
+
tempBuffer
[
threadIdx
.
x
+
28
];
localData
[
tbx
+
j
].
fx
+=
sum
.
x
;
localData
[
tbx
+
j
].
fy
+=
sum
.
y
;
localData
[
tbx
+
j
].
fz
+=
sum
.
z
;
localData
[
tbx
+
j
].
fw
+=
sum
.
w
;
}
}
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
unsigned
int
tj
=
tgx
;
for
(
unsigned
int
j
=
0
;
j
<
TILE_SIZE
;
j
++
)
{
if
(
atom1
<
NUM_ATOMS
&&
y
*
TILE_SIZE
+
tj
<
NUM_ATOMS
)
{
real4
posq2
=
make_real4
(
localData
[
tbx
+
tj
].
x
,
localData
[
tbx
+
tj
].
y
,
localData
[
tbx
+
tj
].
z
,
localData
[
tbx
+
tj
].
q
);
real3
delta
=
make_real3
(
posq2
.
x
-
posq1
.
x
,
posq2
.
y
-
posq1
.
y
,
posq2
.
z
-
posq1
.
z
);
#ifdef USE_PERIODIC
delta
.
x
-=
floor
(
delta
.
x
*
invPeriodicBoxSize
.
x
+
0.5
f
)
*
periodicBoxSize
.
x
;
delta
.
y
-=
floor
(
delta
.
y
*
invPeriodicBoxSize
.
y
+
0.5
f
)
*
periodicBoxSize
.
y
;
delta
.
z
-=
floor
(
delta
.
z
*
invPeriodicBoxSize
.
z
+
0.5
f
)
*
periodicBoxSize
.
z
;
#endif
real
r2
=
delta
.
x
*
delta
.
x
+
delta
.
y
*
delta
.
y
+
delta
.
z
*
delta
.
z
;
#ifdef USE_CUTOFF
if
(
r2
<
CUTOFF_SQUARED
)
{
#endif
real
invR
=
RSQRT
(
r2
);
real
r
=
RECIP
(
invR
);
real
bornRadius2
=
localData
[
tbx
+
tj
].
bornRadius
;
real
alpha2_ij
=
bornRadius1
*
bornRadius2
;
real
D_ij
=
r2
*
RECIP
(
4.0
f
*
alpha2_ij
);
real
expTerm
=
EXP
(
-
D_ij
);
real
denominator2
=
r2
+
alpha2_ij
*
expTerm
;
real
denominator
=
SQRT
(
denominator2
);
real
tempEnergy
=
(
PREFACTOR
*
posq1
.
w
*
posq2
.
w
)
*
RECIP
(
denominator
);
real
Gpol
=
tempEnergy
*
RECIP
(
denominator2
);
real
dGpol_dalpha2_ij
=
-
0.5
f
*
Gpol
*
expTerm
*
(
1.0
f
+
D_ij
);
real
dEdR
=
Gpol
*
(
1.0
f
-
0.25
f
*
expTerm
);
force
.
w
+=
dGpol_dalpha2_ij
*
bornRadius2
;
energy
+=
tempEnergy
;
delta
*=
dEdR
;
force
.
x
-=
delta
.
x
;
force
.
y
-=
delta
.
y
;
force
.
z
-=
delta
.
z
;
localData
[
tbx
+
tj
].
fx
+=
delta
.
x
;
localData
[
tbx
+
tj
].
fy
+=
delta
.
y
;
localData
[
tbx
+
tj
].
fz
+=
delta
.
z
;
localData
[
tbx
+
tj
].
fw
+=
dGpol_dalpha2_ij
*
bornRadius1
;
#ifdef USE_CUTOFF
}
#endif
}
tj
=
(
tj
+
1
)
&
(
TILE_SIZE
-
1
);
}
}
}
}
// Write results.
if
(
pos
<
end
)
{
const
unsigned
int
offset
=
x
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
x
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
y
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
z
*
0xFFFFFFFF
)));
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
force
.
w
*
0xFFFFFFFF
)));
}
if
(
pos
<
end
&&
x
!=
y
)
{
const
unsigned
int
offset
=
y
*
TILE_SIZE
+
tgx
;
atomicAdd
(
&
forceBuffers
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fx
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fy
*
0xFFFFFFFF
)));
atomicAdd
(
&
forceBuffers
[
offset
+
2
*
PADDED_NUM_ATOMS
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fz
*
0xFFFFFFFF
)));
atomicAdd
(
&
global_bornForce
[
offset
],
static_cast
<
unsigned
long
long
>
((
long
long
)
(
localData
[
threadIdx
.
x
].
fw
*
0xFFFFFFFF
)));
}
lasty
=
y
;
pos
++
;
}
while
(
pos
<
end
);
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
;
}
platforms/cuda2/src/kernels/gbsaObc2.cu
0 → 100644
View file @
17ae3aae
{
real
invRSquaredOver4
=
0.25
f
*
invR
*
invR
;
real
rScaledRadiusJ
=
r
+
obcParams2
.
y
;
real
rScaledRadiusI
=
r
+
obcParams1
.
y
;
real
l_ijJ
=
RECIP
(
max
(
obcParams1
.
x
,
fabs
(
r
-
obcParams2
.
y
)));
real
l_ijI
=
RECIP
(
max
(
obcParams2
.
x
,
fabs
(
r
-
obcParams1
.
y
)));
real
u_ijJ
=
RECIP
(
rScaledRadiusJ
);
real
u_ijI
=
RECIP
(
rScaledRadiusI
);
real
l_ij2J
=
l_ijJ
*
l_ijJ
;
real
l_ij2I
=
l_ijI
*
l_ijI
;
real
u_ij2J
=
u_ijJ
*
u_ijJ
;
real
u_ij2I
=
u_ijI
*
u_ijI
;
real
t1J
=
LOG
(
u_ijJ
*
RECIP
(
l_ijJ
));
real
t1I
=
LOG
(
u_ijI
*
RECIP
(
l_ijI
));
real
t2J
=
(
l_ij2J
-
u_ij2J
);
real
t2I
=
(
l_ij2I
-
u_ij2I
);
real
term1
=
(
0.5
f
*
(
0.25
f
+
obcParams2
.
y
*
obcParams2
.
y
*
invRSquaredOver4
)
*
t2J
+
t1J
*
invRSquaredOver4
)
*
invR
;
real
term2
=
(
0.5
f
*
(
0.25
f
+
obcParams1
.
y
*
obcParams1
.
y
*
invRSquaredOver4
)
*
t2I
+
t1I
*
invRSquaredOver4
)
*
invR
;
real
tempdEdR
=
(
obcParams1
.
x
<
rScaledRadiusJ
?
bornForce1
*
term1
/
0xFFFFFFFF
:
0
);
tempdEdR
+=
(
obcParams2
.
x
<
rScaledRadiusI
?
bornForce2
*
term2
/
0xFFFFFFFF
:
0
);
#ifdef USE_CUTOFF
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
&&
r2
<
CUTOFF_SQUARED
);
#else
unsigned
int
includeInteraction
=
(
atom1
<
NUM_ATOMS
&&
atom2
<
NUM_ATOMS
&&
atom1
!=
atom2
);
#endif
dEdR
+=
(
includeInteraction
?
tempdEdR
:
0
);
}
platforms/cuda2/src/kernels/gbsaObcReductions.cu
0 → 100644
View file @
17ae3aae
#define DIELECTRIC_OFFSET 0.009f
#define PROBE_RADIUS 0.14f
#define SURFACE_AREA_FACTOR -170.351730667551f //-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
/**
* Reduce the Born sums to compute the Born radii.
*/
extern
"C"
__global__
void
reduceBornSum
(
float
alpha
,
float
beta
,
float
gamma
,
const
long
long
*
__restrict__
bornSum
,
const
float2
*
__restrict__
params
,
real
*
__restrict__
bornRadii
,
real
*
__restrict__
obcChain
)
{
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Get summed Born data
real
sum
=
RECIP
(
0xFFFFFFFF
)
*
bornSum
[
index
];
// Now calculate Born radius and OBC term.
float
offsetRadius
=
params
[
index
].
x
;
sum
*=
0.5
f
*
offsetRadius
;
real
sum2
=
sum
*
sum
;
real
sum3
=
sum
*
sum2
;
real
tanhSum
=
tanh
(
alpha
*
sum
-
beta
*
sum2
+
gamma
*
sum3
);
real
nonOffsetRadius
=
offsetRadius
+
DIELECTRIC_OFFSET
;
real
radius
=
RECIP
(
RECIP
(
offsetRadius
)
-
tanhSum
/
nonOffsetRadius
);
real
chain
=
offsetRadius
*
(
alpha
-
2.0
f
*
beta
*
sum
+
3.0
f
*
gamma
*
sum2
);
chain
=
(
1
-
tanhSum
*
tanhSum
)
*
chain
/
nonOffsetRadius
;
bornRadii
[
index
]
=
radius
;
obcChain
[
index
]
=
chain
;
}
}
/**
* Reduce the Born force.
*/
extern
"C"
__global__
void
reduceBornForce
(
long
long
*
__restrict__
bornForce
,
real
*
__restrict__
energyBuffer
,
const
float2
*
__restrict__
params
,
const
real
*
__restrict__
bornRadii
,
const
real
*
__restrict__
obcChain
)
{
real
energy
=
0
;
for
(
unsigned
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
NUM_ATOMS
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
// Get summed Born force
real
force
=
RECIP
(
0xFFFFFFFF
)
*
bornForce
[
index
];
// Now calculate the actual force
float
offsetRadius
=
params
[
index
].
x
;
real
bornRadius
=
bornRadii
[
index
];
real
r
=
offsetRadius
+
DIELECTRIC_OFFSET
+
PROBE_RADIUS
;
real
ratio6
=
POW
((
offsetRadius
+
DIELECTRIC_OFFSET
)
/
bornRadius
,
6
);
real
saTerm
=
SURFACE_AREA_FACTOR
*
r
*
r
*
ratio6
;
force
+=
saTerm
/
bornRadius
;
energy
+=
saTerm
;
force
*=
bornRadius
*
bornRadius
*
obcChain
[
index
];
bornForce
[
index
]
=
(
long
long
)
(
force
*
0xFFFFFFFF
);
}
energyBuffer
[
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
]
+=
energy
/-
6
;
}
\ No newline at end of file
platforms/cuda2/tests/TestCudaGBSAOBCForce.cpp
0 → 100644
View file @
17ae3aae
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of GBSAOBCForce.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/Context.h"
#include "CudaPlatform.h"
#include "ReferencePlatform.h"
#include "openmm/GBSAOBCForce.h"
#include "openmm/System.h"
#include "openmm/LangevinIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "sfmt/SFMT.h"
#include "openmm/NonbondedForce.h"
#include <iostream>
#include <vector>
using
namespace
OpenMM
;
using
namespace
std
;
const
double
TOL
=
1e-5
;
void
testSingleParticle
()
{
CudaPlatform
platform
;
System
system
;
system
.
addParticle
(
2.0
);
LangevinIntegrator
integrator
(
0
,
0.1
,
0.01
);
GBSAOBCForce
*
gbsa
=
new
GBSAOBCForce
();
NonbondedForce
*
nonbonded
=
new
NonbondedForce
();
gbsa
->
addParticle
(
0.5
,
0.15
,
1
);
nonbonded
->
addParticle
(
0.5
,
1
,
0
);
system
.
addForce
(
gbsa
);
system
.
addForce
(
nonbonded
);
Context
context
(
system
,
integrator
,
platform
);
vector
<
Vec3
>
positions
(
1
);
positions
[
0
]
=
Vec3
(
0
,
0
,
0
);
context
.
setPositions
(
positions
);
State
state
=
context
.
getState
(
State
::
Energy
);
double
bornRadius
=
0.15
-
0.009
;
// dielectric offset
double
eps0
=
EPSILON0
;
double
bornEnergy
=
(
-
0.5
*
0.5
/
(
8
*
PI_M
*
eps0
))
*
(
1.0
/
gbsa
->
getSoluteDielectric
()
-
1.0
/
gbsa
->
getSolventDielectric
())
/
bornRadius
;
double
extendedRadius
=
bornRadius
+
0.14
;
// probe radius
double
nonpolarEnergy
=
CAL2JOULE
*
PI_M
*
0.0216
*
(
10
*
extendedRadius
)
*
(
10
*
extendedRadius
)
*
std
::
pow
(
0.15
/
bornRadius
,
6.0
);
// Where did this formula come from? Just copied it from CpuImplicitSolvent.cpp
ASSERT_EQUAL_TOL
((
bornEnergy
+
nonpolarEnergy
),
state
.
getPotentialEnergy
(),
0.01
);
// Change the parameters and see if it is still correct.
gbsa
->
setParticleParameters
(
0
,
0.4
,
0.25
,
1
);
gbsa
->
updateParametersInContext
(
context
);
state
=
context
.
getState
(
State
::
Energy
);
bornRadius
=
0.25
-
0.009
;
// dielectric offset
bornEnergy
=
(
-
0.4
*
0.4
/
(
8
*
PI_M
*
eps0
))
*
(
1.0
/
gbsa
->
getSoluteDielectric
()
-
1.0
/
gbsa
->
getSolventDielectric
())
/
bornRadius
;
extendedRadius
=
bornRadius
+
0.14
;
nonpolarEnergy
=
CAL2JOULE
*
PI_M
*
0.0216
*
(
10
*
extendedRadius
)
*
(
10
*
extendedRadius
)
*
std
::
pow
(
0.25
/
bornRadius
,
6.0
);
ASSERT_EQUAL_TOL
((
bornEnergy
+
nonpolarEnergy
),
state
.
getPotentialEnergy
(),
0.01
);
}
void
testCutoffAndPeriodic
()
{
CudaPlatform
cl
;
System
system
;
system
.
addParticle
(
1.0
);
system
.
addParticle
(
1.0
);
LangevinIntegrator
integrator
(
0
,
0.1
,
0.01
);
GBSAOBCForce
*
gbsa
=
new
GBSAOBCForce
();
NonbondedForce
*
nonbonded
=
new
NonbondedForce
();
gbsa
->
addParticle
(
-
1
,
0.15
,
1
);
nonbonded
->
addParticle
(
-
1
,
1
,
0
);
gbsa
->
addParticle
(
1
,
0.15
,
1
);
nonbonded
->
addParticle
(
1
,
1
,
0
);
const
double
cutoffDistance
=
3.0
;
const
double
boxSize
=
10.0
;
nonbonded
->
setCutoffDistance
(
cutoffDistance
);
gbsa
->
setCutoffDistance
(
cutoffDistance
);
system
.
setDefaultPeriodicBoxVectors
(
Vec3
(
boxSize
,
0
,
0
),
Vec3
(
0
,
boxSize
,
0
),
Vec3
(
0
,
0
,
boxSize
));
system
.
addForce
(
gbsa
);
system
.
addForce
(
nonbonded
);
vector
<
Vec3
>
positions
(
2
);
positions
[
0
]
=
Vec3
(
0
,
0
,
0
);
positions
[
1
]
=
Vec3
(
2
,
0
,
0
);
// Calculate the forces for both cutoff and periodic with two different atom positions.
nonbonded
->
setNonbondedMethod
(
NonbondedForce
::
CutoffNonPeriodic
);
gbsa
->
setNonbondedMethod
(
GBSAOBCForce
::
CutoffNonPeriodic
);
Context
context
(
system
,
integrator
,
cl
);
context
.
setPositions
(
positions
);
State
state1
=
context
.
getState
(
State
::
Forces
);
nonbonded
->
setNonbondedMethod
(
NonbondedForce
::
CutoffPeriodic
);
gbsa
->
setNonbondedMethod
(
GBSAOBCForce
::
CutoffPeriodic
);
context
.
reinitialize
();
context
.
setPositions
(
positions
);
State
state2
=
context
.
getState
(
State
::
Forces
);
positions
[
1
][
0
]
+=
boxSize
;
nonbonded
->
setNonbondedMethod
(
NonbondedForce
::
CutoffNonPeriodic
);
gbsa
->
setNonbondedMethod
(
GBSAOBCForce
::
CutoffNonPeriodic
);
context
.
reinitialize
();
context
.
setPositions
(
positions
);
State
state3
=
context
.
getState
(
State
::
Forces
);
nonbonded
->
setNonbondedMethod
(
NonbondedForce
::
CutoffPeriodic
);
gbsa
->
setNonbondedMethod
(
GBSAOBCForce
::
CutoffPeriodic
);
context
.
reinitialize
();
context
.
setPositions
(
positions
);
State
state4
=
context
.
getState
(
State
::
Forces
);
// All forces should be identical, exception state3 which should be zero.
ASSERT_EQUAL_VEC
(
state1
.
getForces
()[
0
],
state2
.
getForces
()[
0
],
0.01
);
ASSERT_EQUAL_VEC
(
state1
.
getForces
()[
1
],
state2
.
getForces
()[
1
],
0.01
);
ASSERT_EQUAL_VEC
(
state1
.
getForces
()[
0
],
state4
.
getForces
()[
0
],
0.01
);
ASSERT_EQUAL_VEC
(
state1
.
getForces
()[
1
],
state4
.
getForces
()[
1
],
0.01
);
ASSERT_EQUAL_VEC
(
state3
.
getForces
()[
0
],
Vec3
(
0
,
0
,
0
),
0.01
);
ASSERT_EQUAL_VEC
(
state3
.
getForces
()[
1
],
Vec3
(
0
,
0
,
0
),
0.01
);
}
void
testForce
(
int
numParticles
,
NonbondedForce
::
NonbondedMethod
method
,
GBSAOBCForce
::
NonbondedMethod
method2
)
{
CudaPlatform
cl
;
ReferencePlatform
reference
;
System
system
;
GBSAOBCForce
*
gbsa
=
new
GBSAOBCForce
();
NonbondedForce
*
nonbonded
=
new
NonbondedForce
();
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
system
.
addParticle
(
1.0
);
double
charge
=
i
%
2
==
0
?
-
1
:
1
;
gbsa
->
addParticle
(
charge
,
0.15
,
1
);
nonbonded
->
addParticle
(
charge
,
1
,
0
);
}
nonbonded
->
setNonbondedMethod
(
method
);
gbsa
->
setNonbondedMethod
(
method2
);
nonbonded
->
setCutoffDistance
(
3.0
);
gbsa
->
setCutoffDistance
(
3.0
);
int
grid
=
(
int
)
floor
(
0.5
+
pow
(
numParticles
,
1.0
/
3.0
));
if
(
method
==
NonbondedForce
::
CutoffPeriodic
)
{
double
boxSize
=
(
grid
+
1
)
*
1.1
;
system
.
setDefaultPeriodicBoxVectors
(
Vec3
(
boxSize
,
0
,
0
),
Vec3
(
0
,
boxSize
,
0
),
Vec3
(
0
,
0
,
boxSize
));
}
system
.
addForce
(
gbsa
);
system
.
addForce
(
nonbonded
);
LangevinIntegrator
integrator1
(
0
,
0.1
,
0.01
);
LangevinIntegrator
integrator2
(
0
,
0.1
,
0.01
);
Context
context
(
system
,
integrator1
,
cl
);
Context
refContext
(
system
,
integrator2
,
reference
);
// Set random (but uniformly distributed) positions for all the particles.
vector
<
Vec3
>
positions
(
numParticles
);
OpenMM_SFMT
::
SFMT
sfmt
;
init_gen_rand
(
0
,
sfmt
);
for
(
int
i
=
0
;
i
<
grid
;
i
++
)
for
(
int
j
=
0
;
j
<
grid
;
j
++
)
for
(
int
k
=
0
;
k
<
grid
;
k
++
)
positions
[
i
*
grid
*
grid
+
j
*
grid
+
k
]
=
Vec3
(
i
*
1.1
,
j
*
1.1
,
k
*
1.1
);
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
positions
[
i
]
=
positions
[
i
]
+
Vec3
(
0.5
*
genrand_real2
(
sfmt
),
0.5
*
genrand_real2
(
sfmt
),
0.5
*
genrand_real2
(
sfmt
));
context
.
setPositions
(
positions
);
refContext
.
setPositions
(
positions
);
State
state
=
context
.
getState
(
State
::
Forces
|
State
::
Energy
);
State
refState
=
refContext
.
getState
(
State
::
Forces
|
State
::
Energy
);
// Make sure the CUDA and Reference platforms agree.
double
norm
=
0.0
;
double
diff
=
0.0
;
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
Vec3
f
=
state
.
getForces
()[
i
];
norm
+=
f
[
0
]
*
f
[
0
]
+
f
[
1
]
*
f
[
1
]
+
f
[
2
]
*
f
[
2
];
Vec3
delta
=
f
-
refState
.
getForces
()[
i
];
diff
+=
delta
[
0
]
*
delta
[
0
]
+
delta
[
1
]
*
delta
[
1
]
+
delta
[
2
]
*
delta
[
2
];
}
norm
=
std
::
sqrt
(
norm
);
diff
=
std
::
sqrt
(
diff
);
ASSERT_EQUAL_TOL
(
0.0
,
diff
,
0.001
*
norm
);
ASSERT_EQUAL_TOL
(
state
.
getPotentialEnergy
(),
refState
.
getPotentialEnergy
(),
1e-3
);
// Take a small step in the direction of the energy gradient. (This doesn't work with cutoffs, since the energy
// changes discontinuously at the cutoff distance.)
if
(
method
==
NonbondedForce
::
NoCutoff
)
{
const
double
delta
=
1e-2
;
double
step
=
delta
/
norm
;
for
(
int
i
=
0
;
i
<
numParticles
;
++
i
)
{
Vec3
p
=
positions
[
i
];
Vec3
f
=
state
.
getForces
()[
i
];
positions
[
i
]
=
Vec3
(
p
[
0
]
-
f
[
0
]
*
step
,
p
[
1
]
-
f
[
1
]
*
step
,
p
[
2
]
-
f
[
2
]
*
step
);
}
context
.
setPositions
(
positions
);
// See whether the potential energy changed by the expected amount.
State
state2
=
context
.
getState
(
State
::
Energy
);
ASSERT_EQUAL_TOL
(
norm
,
(
state2
.
getPotentialEnergy
()
-
state
.
getPotentialEnergy
())
/
delta
,
1e-3
*
abs
(
state
.
getPotentialEnergy
()));
}
}
int
main
()
{
try
{
testSingleParticle
();
testCutoffAndPeriodic
();
for
(
int
i
=
5
;
i
<
11
;
i
++
)
{
testForce
(
i
*
i
*
i
,
NonbondedForce
::
NoCutoff
,
GBSAOBCForce
::
NoCutoff
);
testForce
(
i
*
i
*
i
,
NonbondedForce
::
CutoffNonPeriodic
,
GBSAOBCForce
::
CutoffNonPeriodic
);
testForce
(
i
*
i
*
i
,
NonbondedForce
::
CutoffPeriodic
,
GBSAOBCForce
::
CutoffPeriodic
);
}
}
catch
(
const
exception
&
e
)
{
cout
<<
"exception: "
<<
e
.
what
()
<<
endl
;
return
1
;
}
cout
<<
"Done"
<<
endl
;
return
0
;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment