Commit 873552ba authored by Peter Eastman's avatar Peter Eastman
Browse files

Deleted free energy plugin

parent c775bd19
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaFreeEnergyKernelFactory.h"
#include "CudaFreeEnergyKernels.h"
#include "openmm/freeEnergyKernels.h"
#include "FreeEnergyCudaData.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/OpenMMException.h"
#include "kernels/GpuFreeEnergyCudaKernels.h"
using namespace OpenMM;
extern "C" void registerPlatforms() {
}
extern "C" void registerKernelFactories() {
// (void) fprintf( stderr, "initOpenMMCudaFreeEnergyPlugin called\n");
if ( gpuIsAvailableSoftcore() ){
for( int ii = 0; ii < Platform::getNumPlatforms(); ii++ ){
Platform& platform = Platform::getPlatform(ii);
if( platform.getName().compare( "Cuda" ) == 0 ){
CudaFreeEnergyKernelFactory* factory = new CudaFreeEnergyKernelFactory();
platform.registerKernelFactory(CalcNonbondedSoftcoreForceKernel::Name(), factory);
platform.registerKernelFactory(CalcGBSAOBCSoftcoreForceKernel::Name(), factory);
platform.registerKernelFactory(CalcGBVISoftcoreForceKernel::Name(), factory);
}
}
}
}
extern "C" OPENMMCUDA_EXPORT void registerFreeEnergyCudaKernelFactories( void ) {
int hasCudaPlatform = 0;
for( int ii = 0; ii < Platform::getNumPlatforms() && hasCudaPlatform == 0; ii++ ){
Platform& platform = Platform::getPlatform(ii);
if( platform.getName() == "Cuda" ){
hasCudaPlatform = 1;
}
}
if( hasCudaPlatform == 0 ){
if (gpuIsAvailable() ){
Platform::registerPlatform(new CudaPlatform());
}
}
registerKernelFactories();
}
static std::map<ContextImpl*, FreeEnergyCudaData*> contextToFreeEnergyDataMap;
// look up FreeEnergyCudaData for input contextImpl in contextToFreeEnergyDataMap
extern "C" void* getFreeEnergyCudaData( ContextImpl& context ) {
std::map<ContextImpl*, FreeEnergyCudaData*>::const_iterator mapIterator = contextToFreeEnergyDataMap.find(&context);
if( mapIterator == contextToFreeEnergyDataMap.end() ){
return NULL;
} else {
return static_cast<void*>(mapIterator->second);
}
}
// remove FreeEnergyCudaData from contextToFreeEnergyDataMap
extern "C" void removeFreeEnergyCudaDataFromContextMap( void* inputContext ) {
ContextImpl* context = static_cast<ContextImpl*>(inputContext);
contextToFreeEnergyDataMap.erase( context );
return;
}
KernelImpl* CudaFreeEnergyKernelFactory::createKernelImpl(std::string name, const Platform& platform, ContextImpl& context) const {
// create FreeEnergyCudaData object if contextToFreeEnergyDataMap does not contain
// key equal to current context
FreeEnergyCudaData* freeEnergyCudaData;
std::map<ContextImpl*, FreeEnergyCudaData*>::const_iterator mapIterator = contextToFreeEnergyDataMap.find(&context);
if( mapIterator == contextToFreeEnergyDataMap.end() ){
CudaPlatform::PlatformData& cudaPlatformData = *static_cast<CudaPlatform::PlatformData*>(context.getPlatformData());
freeEnergyCudaData = new FreeEnergyCudaData( cudaPlatformData );
contextToFreeEnergyDataMap[&context] = freeEnergyCudaData;
//freeEnergyCudaData->setLog( stderr );
freeEnergyCudaData->setContextImpl( static_cast<void*>(&context) );
} else {
freeEnergyCudaData = mapIterator->second;
}
if (name == CalcNonbondedSoftcoreForceKernel::Name())
return new CudaFreeEnergyCalcNonbondedSoftcoreForceKernel(name, platform, *freeEnergyCudaData, context.getSystem());
if (name == CalcGBSAOBCSoftcoreForceKernel::Name())
return new CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel(name, platform, *freeEnergyCudaData);
if (name == CalcGBVISoftcoreForceKernel::Name())
return new CudaFreeEnergyCalcGBVISoftcoreForceKernel(name, platform, *freeEnergyCudaData);
throw OpenMMException( (std::string("Tried to create kernel with illegal kernel name '") + name + "'").c_str() );
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaFreeEnergyKernels.h"
#include "CudaForceInfo.h"
#include "openmm/Context.h"
#include "openmm/OpenMMException.h"
#include "openmm/internal/ContextImpl.h"
#include "kernels/gputypes.h"
#include "kernels/cudaKernels.h"
#include "kernels/GpuFreeEnergyCudaKernels.h"
#include <cmath>
#include <map>
#include <cstring>
#include <cstdlib>
#include <typeinfo>
using namespace OpenMM;
typedef std::map< std::string, int > MapStringInt;
typedef MapStringInt::iterator MapStringIntI;
typedef MapStringInt::const_iterator MapStringIntCI;
// force names
const std::string HARMONIC_BOND_FORCE = "HarmonicBond";
const std::string HARMONIC_ANGLE_FORCE = "HarmonicBond";
const std::string PERIODIC_TORSION_FORCE = "PeriodicTorsion";
const std::string RB_TORSION_FORCE = "RbTorsion";
const std::string NB_FORCE = "Nb";
const std::string NB_SOFTCORE_FORCE = "NbSoftcore";
const std::string NB_EXCEPTION_FORCE = "NbException";
const std::string NB_EXCEPTION_SOFTCORE_FORCE = "NbSoftcoreException";
const std::string GBSA_OBC_FORCE = "Obc";
const std::string GBSA_OBC_SOFTCORE_FORCE = "ObcSoftcore";
const std::string GBVI_FORCE = "GBVI";
const std::string GBVI_SOFTCORE_FORCE = "GBVISoftcore";
static void getForceMap(const System& system, MapStringInt& forceMap, FILE* log) {
// check forces and relevant parameters
for(int i = 0; i < system.getNumForces(); ++i) {
int hit = 0;
const Force& force = system.getForce(i);
std::string forceName = "NA";
// bond
if( !hit ){
try {
const HarmonicBondForce& harmonicBondForce = dynamic_cast<const HarmonicBondForce&>(force);
forceMap[HARMONIC_BOND_FORCE] = 1;
forceName = HARMONIC_BOND_FORCE;
hit++;
} catch( std::bad_cast ){
}
}
// angle
if( !hit ){
try {
const HarmonicAngleForce& harmonicAngleForce = dynamic_cast<const HarmonicAngleForce&>(force);
forceMap[HARMONIC_ANGLE_FORCE] = 1;
forceName = HARMONIC_ANGLE_FORCE;
hit++;
} catch( std::bad_cast ){
}
}
// PeriodicTorsionForce
if( !hit ){
try {
const PeriodicTorsionForce & periodicTorsionForce = dynamic_cast<const PeriodicTorsionForce&>(force);
forceMap[PERIODIC_TORSION_FORCE] = 1;
forceName = PERIODIC_TORSION_FORCE;
hit++;
} catch( std::bad_cast ){
}
}
// RBTorsionForce
if( !hit ){
try {
const RBTorsionForce& rBTorsionForce = dynamic_cast<const RBTorsionForce&>(force);
forceMap[RB_TORSION_FORCE] = 1;
forceName = RB_TORSION_FORCE;
hit++;
} catch( std::bad_cast ){
}
}
// nonbonded
if( !hit ){
try {
const NonbondedForce& nbForce = dynamic_cast<const NonbondedForce&>(force);
forceMap[NB_FORCE] = 1;
forceName = NB_FORCE;
} catch( std::bad_cast ){
}
}
// nonbonded softcore
if( !hit ){
try {
const NonbondedSoftcoreForce& nbForce = dynamic_cast<const NonbondedSoftcoreForce&>(force);
forceMap[NB_SOFTCORE_FORCE] = 1;
forceName = NB_SOFTCORE_FORCE;
} catch( std::bad_cast ){
}
}
// GBSA OBC
if( !hit ){
try {
const GBSAOBCForce& obcForce = dynamic_cast<const GBSAOBCForce&>(force);
forceMap[GBSA_OBC_FORCE] = 1;
forceName = GBSA_OBC_FORCE;
hit++;
} catch( std::bad_cast ){
}
}
// GBSA OBC softcore
if( !hit ){
try {
const GBSAOBCSoftcoreForce& obcForce = dynamic_cast<const GBSAOBCSoftcoreForce&>(force);
forceMap[GBSA_OBC_SOFTCORE_FORCE] = 1;
forceName = GBSA_OBC_SOFTCORE_FORCE;
hit++;
} catch( std::bad_cast ){
}
}
// GB/VI
if( !hit ){
try {
const GBVIForce& obcForce = dynamic_cast<const GBVIForce&>(force);
forceMap[GBVI_FORCE] = 1;
forceName = GBVI_FORCE;
hit++;
} catch( std::bad_cast ){
}
}
// GB/VI softcore
if( !hit ){
try {
const GBVISoftcoreForce& gbviForce = dynamic_cast<const GBVISoftcoreForce&>(force);
forceMap[GBVI_SOFTCORE_FORCE] = 1;
forceName = GBVI_SOFTCORE_FORCE;
hit++;
} catch( std::bad_cast ){
}
}
if( log ){
(void) fprintf( log, "Map: Force %d %s\n", i, forceName.c_str() );
}
}
}
class CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::ForceInfo : public CudaForceInfo {
public:
ForceInfo(const NonbondedSoftcoreForce& force) : force(force) {
}
bool areParticlesIdentical(int particle1, int particle2) {
double charge1, charge2, sigma1, sigma2, epsilon1, epsilon2, softcoreLJLambda1, softcoreLJLambda2;
force.getParticleParameters(particle1, charge1, sigma1, epsilon1, softcoreLJLambda1);
force.getParticleParameters(particle2, charge2, sigma2, epsilon2, softcoreLJLambda2);
return (charge1 == charge2 && sigma1 == sigma2 && epsilon1 == epsilon2 && softcoreLJLambda1 == softcoreLJLambda2);
}
int getNumParticleGroups() {
return force.getNumExceptions();
}
void getParticlesInGroup(int index, std::vector<int>& particles) {
int particle1, particle2;
double chargeProd, sigma, epsilon, softcoreLJLambda;
force.getExceptionParameters(index, particle1, particle2, chargeProd, sigma, epsilon, softcoreLJLambda);
particles.resize(2);
particles[0] = particle1;
particles[1] = particle2;
}
bool areGroupsIdentical(int group1, int group2) {
int particle1, particle2;
double chargeProd1, chargeProd2, sigma1, sigma2, epsilon1, epsilon2, softcoreLJLambda1, softcoreLJLambda2;
force.getExceptionParameters(group1, particle1, particle2, chargeProd1, sigma1, epsilon1, softcoreLJLambda1);
force.getExceptionParameters(group2, particle1, particle2, chargeProd2, sigma2, epsilon2, softcoreLJLambda2);
return (chargeProd1 == chargeProd2 && sigma1 == sigma2 && epsilon1 == epsilon2 && softcoreLJLambda1 == softcoreLJLambda2);
}
private:
const NonbondedSoftcoreForce& force;
};
CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::~CudaFreeEnergyCalcNonbondedSoftcoreForceKernel() {
if( 0 && data.getLog() ){
(void) fprintf( data.getLog(), "~CudaFreeEnergyCalcNonbondedSoftcoreForceKernel called.\n" );
(void) fflush( data.getLog() );
}
data.decrementKernelCount();
}
void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::initialize(const System& system, const NonbondedSoftcoreForce& force) {
// ---------------------------------------------------------------------------------------
static const std::string methodName = "CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::initialize";
// ---------------------------------------------------------------------------------------
if( data.getLog() ){
(void) fprintf( data.getLog(), "%s called.\n", methodName.c_str() );
(void) fflush( data.getLog() );
}
// check forces and relevant parameters
MapStringInt forceMap;
getForceMap( system, forceMap, data.getLog() );
int softcore = 0;
if( forceMap.find( GBSA_OBC_FORCE ) != forceMap.end() ){
setIncludeGBSA( true );
}
if( forceMap.find( GBSA_OBC_SOFTCORE_FORCE ) != forceMap.end() ){
setIncludeGBSA( true );
softcore++;
}
if( forceMap.find( GBVI_FORCE ) != forceMap.end() ){
setIncludeGBVI( true );
}
if( forceMap.find( GBVI_SOFTCORE_FORCE ) != forceMap.end() ){
setIncludeGBVI( true );
softcore++;
}
if( forceMap.find( NB_SOFTCORE_FORCE ) != forceMap.end() ){
softcore++;
}
setIncludeSoftcore( softcore );
numParticles = force.getNumParticles();
// Identify which exceptions are 1-4 interactions.
std::vector<pair<int, int> > exclusions;
std::vector<int> exceptions;
for (int i = 0; i < force.getNumExceptions(); i++) {
int particle1, particle2;
double chargeProd, sigma, epsilon, softcoreLJLambda;
force.getExceptionParameters(i, particle1, particle2, chargeProd, sigma, epsilon, softcoreLJLambda);
exclusions.push_back(pair<int, int>(particle1, particle2));
if (chargeProd != 0.0 || epsilon != 0.0)
exceptions.push_back(i);
}
// Initialize nonbonded interactions.
if( numParticles > 0 ){
std::vector<int> particle(numParticles);
std::vector<float> c6(numParticles);
std::vector<float> c12(numParticles);
std::vector<float> q(numParticles);
std::vector<float> softcoreLJLambdaArray(numParticles);
std::vector<char> symbol;
std::vector<std::vector<int> > exclusionList(numParticles);
for (int i = 0; i < numParticles; i++) {
double charge, radius, depth, softcoreLJLambda;
force.getParticleParameters(i, charge, radius, depth, softcoreLJLambda);
particle[i] = i;
q[i] = static_cast<float>( charge );
c6[i] = static_cast<float>( (4*depth*pow(radius, 6.0)) );
c12[i] = static_cast<float>( (4*depth*pow(radius, 12.0)) );
softcoreLJLambdaArray[i] = static_cast<float>( softcoreLJLambda );
exclusionList[i].push_back(i);
}
for (int i = 0; i < (int)exclusions.size(); i++) {
exclusionList[exclusions[i].first].push_back(exclusions[i].second);
exclusionList[exclusions[i].second].push_back(exclusions[i].first);
}
Vec3 boxVectors[3];
system.getDefaultPeriodicBoxVectors(boxVectors[0], boxVectors[1], boxVectors[2]);
freeEnergyGpuSetPeriodicBoxSize( data.getFreeEnergyGpu(), static_cast<float>(boxVectors[0][0] ), static_cast<float>(boxVectors[1][1] ), static_cast<float>(boxVectors[2][2] ));
CudaFreeEnergyNonbondedMethod method = FREE_ENERGY_NO_CUTOFF;
if (force.getNonbondedMethod() != NonbondedSoftcoreForce::NoCutoff) {
method = FREE_ENERGY_CUTOFF;
}
if (force.getNonbondedMethod() == NonbondedSoftcoreForce::CutoffPeriodic) {
method = FREE_ENERGY_PERIODIC;
}
// setup parameters
gpuSetNonbondedSoftcoreParameters( data.getFreeEnergyGpu(), 138.935485f, particle, c6, c12, q,
softcoreLJLambdaArray, symbol, exclusionList, method,
static_cast<float>(force.getCutoffDistance() ), static_cast<float>(force.getReactionFieldDielectric()));
}
// Initialize 1-4 nonbonded interactions.
numExceptions = exceptions.size();
if( numExceptions > 0 ){
std::vector<int> particle1(numExceptions);
std::vector<int> particle2(numExceptions);
std::vector<float> c6(numExceptions);
std::vector<float> c12(numExceptions);
std::vector<float> qProd(numExceptions);
std::vector<float> softcoreLJLambdaArray(numExceptions);
for (int i = 0; i < numExceptions; i++) {
double charge, sig, eps, softcoreLJLambda;
force.getExceptionParameters(exceptions[i], particle1[i], particle2[i], charge, sig, eps, softcoreLJLambda);
c6[i] = static_cast<float>( (4.0*eps*pow(sig, 6.0)) );
c12[i] = static_cast<float>( (4.0*eps*pow(sig, 12.0)) );
qProd[i] = static_cast<float>( charge );
softcoreLJLambdaArray[i] = static_cast<float>( softcoreLJLambda );
}
gpuSetLJ14SoftcoreParameters( data.getFreeEnergyGpu(), 138.935485f, particle1, particle2, c6, c12, qProd, softcoreLJLambdaArray);
} else if( data.getLog() ){
(void) fprintf( data.getLog(), "Mo nonbonded softcore exceptions.\n" );
(void) fflush( data.getLog() );
}
data.getFreeEnergyGpu()->gpuContext->forces.push_back(new ForceInfo(force));
}
double CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::execute( ContextImpl& context, bool includeForces, bool includeEnergy ){
// ---------------------------------------------------------------------------------------
static const std::string methodName = "CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::executeForces";
// ---------------------------------------------------------------------------------------
freeEnergyGpuContext gpu = data.getFreeEnergyGpu();
data.initializeGpu( );
// calculate nonbonded ixns here, only if implicit solvent is inactive
if ( !getIncludeGBSA() && !getIncludeGBVI() ) {
kCalculateCDLJSoftcoreForces(gpu);
}
// local LJ-14 forces
if( getNumExceptions() > 0 ){
kCalculateLocalSoftcoreForces(gpu);
}
return 0.0;
}
bool CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getIncludeGBSA( void ) const {
return bIncludeGBSA;
}
void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::setIncludeGBSA( bool inputIncludeGBSA ){
bIncludeGBSA = inputIncludeGBSA;
}
bool CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getIncludeGBVI( void ) const {
return bIncludeGBVI;
}
void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::setIncludeGBVI( bool inputIncludeGBVI ){
bIncludeGBVI = inputIncludeGBVI;
}
int CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getIncludeSoftcore( void ) const {
return includeSoftcore;
}
int CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::getNumExceptions( void ) const {
return numExceptions;
}
void CudaFreeEnergyCalcNonbondedSoftcoreForceKernel::setIncludeSoftcore( int inputIncludeSoftcore ){
includeSoftcore = inputIncludeSoftcore;
}
class CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::ForceInfo : public CudaForceInfo {
public:
ForceInfo(const GBSAOBCSoftcoreForce& force) : force(force) {
}
bool areParticlesIdentical(int particle1, int particle2) {
double charge1, charge2, radius1, radius2, scale1, scale2, particleNonPolarScalingFactor1, particleNonPolarScalingFactor2;
force.getParticleParameters(particle1, charge1, radius1, scale1, particleNonPolarScalingFactor1);
force.getParticleParameters(particle2, charge2, radius2, scale2, particleNonPolarScalingFactor2);
return (charge1 == charge2 && radius1 == radius2 && scale1 == scale2 && particleNonPolarScalingFactor1 == particleNonPolarScalingFactor2);
}
private:
const GBSAOBCSoftcoreForce& force;
};
CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::~CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel() {
if( 0 && data.getLog() ){
(void) fprintf( data.getLog(), "~CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel called.\n" );
(void) fflush( data.getLog() );
}
data.decrementKernelCount();
}
void CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::initialize(const System& system, const GBSAOBCSoftcoreForce& force) {
// ---------------------------------------------------------------------------------------
freeEnergyGpuContext gpu = data.getFreeEnergyGpu();
MapStringInt forceMap;
getForceMap( system, forceMap, log);
// check that nonbonded (non-softcore is not active)
if( forceMap.find( NB_FORCE ) != forceMap.end() ){
throw OpenMMException( "Mixing NonbondedForce and GBSAOBCSoftoreForce is not allowed -- use NonbondedSoftcoreForce " );
}
if( forceMap.find( NB_SOFTCORE_FORCE ) == forceMap.end() ){
throw OpenMMException( "NonbondedSoftcore force must be included w/ GBSAOBCSoftcore force." );
}
int numParticles = system.getNumParticles();
std::vector<float> radius(numParticles);
std::vector<float> scale(numParticles);
std::vector<float> charge(numParticles);
std::vector<float> nonPolarScalingFactors(numParticles);
for( int ii = 0; ii < numParticles; ii++ ){
double particleCharge, particleRadius, scalingFactor, particleNonPolarScalingFactor;
force.getParticleParameters( ii, particleCharge, particleRadius, scalingFactor, particleNonPolarScalingFactor);
radius[ii] = static_cast<float>( particleRadius);
scale[ii] = static_cast<float>( scalingFactor);
charge[ii] = static_cast<float>( particleCharge);
nonPolarScalingFactors[ii] = static_cast<float>( particleNonPolarScalingFactor);
}
gpuSetObcSoftcoreParameters( gpu, static_cast<float>( force.getSoluteDielectric()),
static_cast<float>( force.getSolventDielectric()),
static_cast<float>( force.getNonPolarPrefactor()),
radius, scale, charge, nonPolarScalingFactors );
data.getFreeEnergyGpu()->gpuContext->forces.push_back(new ForceInfo(force));
return;
}
double CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
// ---------------------------------------------------------------------------------------
freeEnergyGpuContext freeEnergyGpu = data.getFreeEnergyGpu();
gpuContext gpu = freeEnergyGpu->gpuContext;
int call = 0;
// send address's of arrays, ... to device on first call
// required since force/energy buffers not set when CudaFreeEnergyCalcGBVISoftcoreForceKernel::initialize() was called
data.initializeGpu( );
// (1) clear Born force array
// (2) calculate Born radii and sum
// (3) loop 1
// (4) sum/calculate Born forces
// (5) loop 2
kClearSoftcoreBornForces(gpu);
kCalculateObcGbsaSoftcoreBornSum( freeEnergyGpu );
kReduceObcGbsaSoftcoreBornSum(gpu);
kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpu );
// sum Born forces and execute second OBC loop
kReduceObcGbsaSoftcoreBornForces(gpu);
kCalculateObcGbsaSoftcoreForces2( freeEnergyGpu );
if( data.getLog() ){
kPrintObcGbsaSoftcore( freeEnergyGpu, "Post kCalculateObcGbsaSoftcoreForces2", call, data.getLog() );
}
return 0.0;
}
class CudaFreeEnergyCalcGBVISoftcoreForceKernel::ForceInfo : public CudaForceInfo {
public:
ForceInfo(const GBVISoftcoreForce& force) : force(force) {
}
bool areParticlesIdentical(int particle1, int particle2) {
double charge1, charge2, radius1, radius2, gamma1, gamma2, bornRadiusScaleFactor1, bornRadiusScaleFactor2;
force.getParticleParameters(particle1, charge1, radius1, gamma1, bornRadiusScaleFactor1);
force.getParticleParameters(particle2, charge2, radius2, gamma2, bornRadiusScaleFactor2);
return (charge1 == charge2 && radius1 == radius2 && gamma1 == gamma2 && bornRadiusScaleFactor1 == bornRadiusScaleFactor2);
}
private:
const GBVISoftcoreForce& force;
};
CudaFreeEnergyCalcGBVISoftcoreForceKernel::~CudaFreeEnergyCalcGBVISoftcoreForceKernel() {
if( 0 && data.getLog() ){
(void) fprintf( data.getLog(), "~CudaFreeEnergyCalcGBVISoftcoreForceKernel called.\n" );
(void) fflush( data.getLog() );
}
data.decrementKernelCount();
}
void CudaFreeEnergyCalcGBVISoftcoreForceKernel::initialize(const System& system, const GBVISoftcoreForce& force, const std::vector<double> & inputScaledRadii) {
// ---------------------------------------------------------------------------------------
int numParticles = system.getNumParticles();
freeEnergyGpuContext gpu = data.getFreeEnergyGpu();
// check forces and relevant parameters
MapStringInt forceMap;
getForceMap( system, forceMap, log);
// check that nonbonded (non-softcore is not active)
if( forceMap.find( NB_FORCE ) != forceMap.end() ){
throw OpenMMException( "Mixing NonbondedForce and GBVISoftoreForce not allowed -- use NonbondedSoftcoreForce " );
}
std::vector<int> particle(numParticles);
std::vector<float> radius(numParticles);
std::vector<float> scaledRadii(numParticles);
std::vector<float> gammas(numParticles);
std::vector<float> bornRadiusScaleFactors(numParticles);
for( int i = 0; i < numParticles; i++ ){
double charge, particleRadius, gamma, bornRadiusScaleFactor;
force.getParticleParameters(i, charge, particleRadius, gamma, bornRadiusScaleFactor);
particle[i] = i;
radius[i] = static_cast<float>( particleRadius );
gammas[i] = static_cast<float>( gamma );
scaledRadii[i] = static_cast<float>( inputScaledRadii[i] );
bornRadiusScaleFactors[i] = static_cast<float>( bornRadiusScaleFactor );
}
std::vector<float> quinticSplineParameters;
if( force.getBornRadiusScalingMethod() == GBVISoftcoreForce::QuinticSpline ){
// quintic spline
quinticSplineParameters.resize(2);
quinticSplineParameters[0] = static_cast<float>(force.getQuinticLowerLimitFactor());
quinticSplineParameters[1] = static_cast<float>(force.getQuinticUpperBornRadiusLimit());
quinticSplineParameters[1] = powf( quinticSplineParameters[1], -3.0f );
quinticScaling = 1;
}
// load parameters onto board
// defined in kCalculateGBVISoftcore.cu
gpuSetGBVISoftcoreParameters( gpu, static_cast<float>( force.getSoluteDielectric() ), static_cast<float>( force.getSolventDielectric() ),
particle, radius, gammas, scaledRadii, bornRadiusScaleFactors, quinticSplineParameters);
data.getFreeEnergyGpu()->gpuContext->forces.push_back(new ForceInfo(force));
return;
}
double CudaFreeEnergyCalcGBVISoftcoreForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
freeEnergyGpuContext freeEnergyGpu = data.getFreeEnergyGpu();
gpuContext gpu = freeEnergyGpu->gpuContext;
// send address's of arrays, ... to device on first call
// required since force/energy buffers not set when CudaFreeEnergyCalcGBVISoftcoreForceKernel::initialize() was called
data.initializeGpu( );
// (1) clear Born force array
// (2) calculate Born radii and sum
// (3) loop 1
// (4) sum/calculate Born forces
// (5) loop 2
// calculate Born radii and first loop of GB/VI forces
kClearSoftcoreBornForces(gpu);
kCalculateGBVISoftcoreBornSum( freeEnergyGpu );
if( quinticScaling ){
kReduceGBVIBornSumQuinticScaling( freeEnergyGpu );
} else {
kReduceGBVISoftcoreBornSum( freeEnergyGpu );
}
kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpu );
if( quinticScaling ){
kReduceGBVIBornForcesQuinticScaling(freeEnergyGpu);
} else {
kReduceGBVISoftcoreBornForces( freeEnergyGpu );
}
// second loop of GB/VI forces
kCalculateGBVISoftcoreForces2( freeEnergyGpu );
if( data.getLog() ){
kPrintGBVISoftcore( freeEnergyGpu, "Post kCalculateGBVISoftcoreForces2", 0, data.getLog() );
}
return 0.0;
}
#ifndef OPENMM_FREE_ENERGY_CUDA_KERNELS_H_
#define OPENMM_FREE_ENERGY_CUDA_KERNELS_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaPlatform.h"
#include "openmm/kernels.h"
#include "kernels/gputypes.h"
#include "openmm/System.h"
#include "OpenMMFreeEnergy.h"
#include "openmm/freeEnergyKernels.h"
#include "FreeEnergyCudaData.h"
namespace OpenMM {
/**
* This kernel is invoked by NonbondedSoftcoreForce to calculate the forces acting on the system.
*/
class CudaFreeEnergyCalcNonbondedSoftcoreForceKernel : public CalcNonbondedSoftcoreForceKernel {
public:
CudaFreeEnergyCalcNonbondedSoftcoreForceKernel(std::string name, const Platform& platform, FreeEnergyCudaData& data, System& system) :
CalcNonbondedSoftcoreForceKernel(name, platform), data(data), system(system) {
numExceptions = 0;
numParticles = 0;
bIncludeGBSA = false;
bIncludeGBVI = false;
includeSoftcore = false;
log = NULL;
data.incrementKernelCount();
}
~CudaFreeEnergyCalcNonbondedSoftcoreForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the NonbondedForce this kernel will be used for
*/
void initialize(const System& system, const NonbondedSoftcoreForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
/**
* Get flag signalling whether GBSA/OBC force is included
*
* @return flag
*/
bool getIncludeGBSA( void ) const;
/**
* Set flag signalling whether GBSA/OBC force is included
*
* @param inputIncludeGBSA input flag value
*/
void setIncludeGBSA( bool inputIncludeGBSA );
/**
* Get flag signalling whether GB/VI force is included
*
* @return flag
*/
bool getIncludeGBVI( void ) const;
/**
* Set flag signalling whether GB/VI force is included
*
* @param inputIncludeGBVI input flag value
*/
void setIncludeGBVI( bool inputIncludeGBVI );
/**
* Get flag signalling whether softcore force is included
*
* @return flag
*/
int getIncludeSoftcore( void ) const;
/**
* Set flag signalling whether GB/VI force is included
*
* @param inputIncludeGBVI input flag value
*/
void setIncludeSoftcore( int inputSoftcore);
/**
* Get number of exceptions
*
* @return number of exceptions
*/
int getNumExceptions( void ) const;
private:
FreeEnergyCudaData& data;
class ForceInfo;
int numParticles;
System& system;
bool bIncludeGBSA;
bool bIncludeGBVI;
int includeSoftcore;
int numExceptions;
FILE* log;
};
/**
* This kernel is invoked by GBSAOBCForce to calculate the forces acting on the system.
*/
class CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel : public CalcGBSAOBCSoftcoreForceKernel {
public:
CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel(std::string name, const Platform& platform, FreeEnergyCudaData& data) :
CalcGBSAOBCSoftcoreForceKernel(name, platform), data(data) {
log = NULL;
data.incrementKernelCount();
}
~CudaFreeEnergyCalcGBSAOBCSoftcoreForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the GBSAOBCForce this kernel will be used for
*/
void initialize(const System& system, const GBSAOBCSoftcoreForce& force);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
private:
FreeEnergyCudaData& data;
class ForceInfo;
FILE* log;
};
/**
* This kernel is invoked by GBVIForce to calculate the forces acting on the system.
*/
class CudaFreeEnergyCalcGBVISoftcoreForceKernel : public CalcGBVISoftcoreForceKernel {
public:
CudaFreeEnergyCalcGBVISoftcoreForceKernel(std::string name, const Platform& platform, FreeEnergyCudaData& data) :
CalcGBVISoftcoreForceKernel(name, platform), data(data) {
log = NULL;
quinticScaling = 0;
data.incrementKernelCount();
}
~CudaFreeEnergyCalcGBVISoftcoreForceKernel();
/**
* Initialize the kernel.
*
* @param system the System this kernel will be applied to
* @param force the GBVIForce this kernel will be used for
* @param scaledRadii the scaled radii (Eq. 5 of Labute paper)
*/
void initialize(const System& system, const GBVISoftcoreForce& force, const std::vector<double> & scaledRadii);
/**
* Execute the kernel to calculate the forces and/or energy.
*
* @param context the context in which to execute this kernel
* @param includeForces true if forces should be calculated
* @param includeEnergy true if the energy should be calculated
* @return the potential energy due to the force
*/
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
private:
FreeEnergyCudaData& data;
class ForceInfo;
FILE* log;
int quinticScaling;
};
} // namespace OpenMM
#endif /*OPENMM_FREE_ENERGY_CUDA_KERNELS_H_*/
/* -------------------------------------------------------------------------- *
* OpenMMFreeEnergy *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2009 Stanford University and the Authors. *
* Authors: *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "FreeEnergyCudaData.h"
#include "openmm/OpenMMException.h"
#include <sstream>
extern "C" void removeFreeEnergyCudaDataFromContextMap( void* context );
namespace OpenMM {
FreeEnergyCudaData::FreeEnergyCudaData( CudaPlatform::PlatformData& data ) : cudaPlatformData(data) {
kernelCount = 0;
freeEnergyGpu = freeEnergyGpuInit( cudaPlatformData.gpu );
log = NULL;
contextImpl = NULL;
gpuInitialized = false;
boxDimensions[0] = 0.0;
boxDimensions[1] = 0.0;
boxDimensions[2] = 0.0;
}
FreeEnergyCudaData::~FreeEnergyCudaData() {
if( getLog() ){
(void) fprintf( getLog(), "~FreeEnergyCudaData called kernelCount=%d\n", kernelCount );
(void) fflush( getLog() );
}
freeEnergyGpuShutDown( freeEnergyGpu );
}
void FreeEnergyCudaData::decrementKernelCount( void ) {
kernelCount--;
if( getLog() ){
(void) fprintf( getLog(), "~reeEnergyCudaData decrementKernelCount called. %d\n", kernelCount );
(void) fflush( getLog() );
}
if( kernelCount == 0 && contextImpl != NULL ){
removeFreeEnergyCudaDataFromContextMap( contextImpl );
freeEnergyGpuShutDown( freeEnergyGpu );
}
}
void FreeEnergyCudaData::incrementKernelCount( void ) {
kernelCount++;
}
freeEnergyGpuContext FreeEnergyCudaData::getFreeEnergyGpu( void ) const {
return freeEnergyGpu;
}
void FreeEnergyCudaData::setLog( FILE* inputLog ) {
log = inputLog;
freeEnergyGpu->log = inputLog;
}
FILE* FreeEnergyCudaData::getLog( void ) const {
return log;
}
void FreeEnergyCudaData::setContextImpl( void* inputContextImpl ) {
contextImpl = inputContextImpl;
}
void FreeEnergyCudaData::initializeGpu( void ) {
if( !gpuInitialized ){
gpuContext gpu = freeEnergyGpu->gpuContext;
if( freeEnergyGpu->freeEnergySim.nonbondedCutoff != gpu->sim.nonbondedCutoff ){
std::stringstream msg;
msg << "The softcore non-bonded cutoff=" << freeEnergyGpu->freeEnergySim.nonbondedCutoff;
msg << "does not agree with the non-softcore cutoff= " << gpu->sim.nonbondedCutoff;
throw OpenMM::OpenMMException( msg.str() );
}
/*
freeEnergyGpuBuildOutputBuffers( freeEnergyGpu, getHasFreeEnergyGeneralizedKirkwood() );
freeEnergyGpuBuildThreadBlockWorkList( freeEnergyGpu );
boxDimensions[0] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeX;
boxDimensions[1] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeY;
boxDimensions[2] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeZ;
*/
gpuBuildExclusionList( gpu );
gpuSetConstants( gpu );
freeEnergyGpuSetConstants( freeEnergyGpu );
gpuInitialized = true;
if( log ){
//gpuPrintCudaFreeEnergyGmxSimulation( freeEnergyGpu, getLog() );
(void) fprintf( log, "FreeEnergyCudaGpu initialized kernelCount=%d\n", kernelCount );
(void) fflush( log );
}
} else {
/*
if( boxDimensions[0] != freeEnergyGpu->gpuContext->sim.periodicBoxSizeX ||
boxDimensions[1] != freeEnergyGpu->gpuContext->sim.periodicBoxSizeY ||
boxDimensions[2] != freeEnergyGpu->gpuContext->sim.periodicBoxSizeZ ){
freeEnergyGpuSetConstants( freeEnergyGpu, 1 );
boxDimensions[0] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeX;
boxDimensions[1] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeY;
boxDimensions[2] = freeEnergyGpu->gpuContext->sim.periodicBoxSizeZ;
}
*/
}
return;
}
}
#ifndef FREE_ENERGY_CUDA_DATA_H_
#define FREE_ENERGY_CUDA_DATA_H_
/* -------------------------------------------------------------------------- *
* OpenMMFreeEnergy *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008 Stanford University and the Authors. *
* Authors: *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "CudaPlatform.h"
#include "kernels/freeEnergyGpuTypes.h"
#include "kernels/cudaKernels.h"
#include "openmm/KernelImpl.h"
namespace OpenMM {
/**
* Free energy Cuda data
*/
class FreeEnergyCudaData {
public:
FreeEnergyCudaData( CudaPlatform::PlatformData& data );
~FreeEnergyCudaData();
/**
* Increment kernel count
*
*/
void incrementKernelCount( void );
/**
* Decrement kernel count
*
*/
void decrementKernelCount( void );
/**
* Return freeEnergyGpuContext context
*
* @return freeEnergyGpuContext
*/
freeEnergyGpuContext OPENMMCUDA_EXPORT getFreeEnergyGpu( void ) const;
/**
* Set log file reference
*
* @param log file reference; if not set, then no logging
*/
void setLog( FILE* inputLog );
/**
* Get log file reference
*
* @return log file reference
*/
FILE* getLog( void ) const;
/**
* if gpuInitialized is false, write data to board
*
* @param log file reference; if not set, then no logging
*/
void initializeGpu( void );
/**
* Set contextImpl
*
* @param contextImpl reference
*/
void setContextImpl( void* contextImpl );
CudaPlatform::PlatformData& cudaPlatformData;
private:
freeEnergyGpuContext freeEnergyGpu;
unsigned int kernelCount;
void* contextImpl;
FILE* log;
bool gpuInitialized;
double boxDimensions[3];
};
} // namespace OpenMM
#endif /*FREE_ENERGY_CUDA_DATA_H_*/
#ifndef __GPU_FREE_ENERGY_KERNELS_H__
#define __GPU_FREE_ENERGY_KERNELS_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
#include "freeEnergyGpuTypes.h"
#include "cudatypes.h"
#include <vector>
#include <cuda.h>
// Function prototypes
// CDLJ softcore
// setup methods called from CudaFreeEnergyKernels
// nonbonded and 1-4 ixns
extern "C" bool gpuIsAvailableSoftcore();
extern "C" void freeEnergyGpuSetPeriodicBoxSize( freeEnergyGpuContext gpu, float xsize, float ysize, float zsize);
extern "C" void gpuSetNonbondedSoftcoreParameters( freeEnergyGpuContext gpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6,
const std::vector<float>& c12, const std::vector<float>& q,
const std::vector<float>& softcoreLJLambdaArray, const std::vector<char>& symbol,
const std::vector<std::vector<int> >& exclusions, CudaFreeEnergyNonbondedMethod method,
float cutoffDistance, float reactionFieldDielectric);
extern "C" void gpuSetLJ14SoftcoreParameters( freeEnergyGpuContext gpu, float epsfac, const std::vector<int>& atom1,
const std::vector<int>& atom2, const std::vector<float>& c6, const std::vector<float>& c12,
const std::vector<float>& qProd, const std::vector<float>& softcoreLJLambdaArray);
// write address's to device
extern "C" void SetCalculateCDLJSoftcoreGpuSim( freeEnergyGpuContext gpu );
extern "C" void SetCalculateLocalSoftcoreGpuSim( freeEnergyGpuContext gpu );
// kernel calls to device
extern "C" void kCalculateCDLJSoftcoreForces( freeEnergyGpuContext gpu );
extern "C" void kCalculateLocalSoftcoreForces( freeEnergyGpuContext gpu );
// GB/VI softcore
// setup method called from CudaFreeEnergyKernels
extern "C" void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext gpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom, const std::vector<float>& radius,
const std::vector<float>& gamma, const std::vector<float>& scaledRadii,
const std::vector<float>& bornRadiusScaleFactors, const std::vector<float>& quinticSplineParameters);
// write address's to device
extern "C" void SetCalculateGBVISoftcoreForcesSim( gpuContext gpu, float* softCoreLJLambda);
extern "C" void SetCalculateGBVISoftcoreBornSumGpuSim( freeEnergyGpuContext gpu );
extern "C" void SetCalculateGBVISoftcoreForces2Sim( freeEnergyGpuContext gpu);
// kernel calls to device
extern void kReduceGBVIBornSumQuinticScaling( freeEnergyGpuContext gpu );
extern void kCalculateGBVISoftcoreBornSum( freeEnergyGpuContext gpu );
extern void kReduceGBVIBornForcesQuinticScaling( freeEnergyGpuContext gpu );
extern void kCalculateGBVISoftcoreForces2( freeEnergyGpuContext gpu );
extern void kReduceGBVISoftcoreBornForces( freeEnergyGpuContext gpu);
extern void kReduceGBVISoftcoreBornSum( freeEnergyGpuContext gpu);
extern void kPrintGBVISoftcore( freeEnergyGpuContext gpu, std::string callId, int call, FILE* log);
extern void kClearSoftcoreBornForces(gpuContext gpu);
// Obc softcore
// setup method called from CudaFreeEnergyKernels
/**
* Initialize parameters for Cuda Obc softcore
*
* @param gpu gpu context
* @param innerDielectric solute dielectric
* @param solventDielectric solvent dielectric
* @param radius intrinsic Born radii
* @param scale Obc scaling factors
* @param charge atomic charges (possibly overwritten by other methods?)
* @param nonPolarScalingFactors non-polar scaling factors
*
*/
extern "C" void gpuSetObcSoftcoreParameters( freeEnergyGpuContext gpu, float innerDielectric, float solventDielectric, float nonPolarPrefactor,
const std::vector<float>& radius, const std::vector<float>& scale,
const std::vector<float>& charge, const std::vector<float>& nonPolarScalingFactors);
// write address's to device
extern "C" void SetCalculateObcGbsaSoftcoreBornSumSim( freeEnergyGpuContext gpu );
// this method and kCalculateObcGbsaSoftcoreForces2() are being
// used until changes in OpenMM version are made
extern "C" void SetCalculateObcGbsaSoftcoreForces2Sim( freeEnergyGpuContext gpu );
// kernel calls to device
extern void kClearObcGbsaSoftcoreBornSum( gpuContext gpu );
extern void kReduceObcGbsaSoftcoreBornForces( gpuContext gpu );
extern void kCalculateObcGbsaSoftcoreBornSum( freeEnergyGpuContext gpu );
extern void kReduceObcGbsaSoftcoreBornSum( gpuContext gpu );
// this method is not needed; the OpenMM version can be used
extern void kCalculateObcGbsaSoftcoreForces2( freeEnergyGpuContext gpu );
extern void kPrintObcGbsaSoftcore( freeEnergyGpuContext gpu, std::string callId, int call, FILE* log);
// shared
extern "C" void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext gpu );
extern void kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpuContext gpu );
extern "C" void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactingWorkUnit );
#endif //__GPU_FREE_ENERGY_KERNELS_H__
/* -------------------------------------------------------------------------- *
* OpenMMFreeEnergy *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman, Mark Friedrichs *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#ifdef WIN32
#define _USE_MATH_DEFINES /* M_PI */
#endif
#define PARAMETER_PRINT 1
#define MAX_PARAMETER_PRINT 10
#include "openmm/OpenMMException.h"
#include "cudaKernels.h"
#include "GpuFreeEnergyCudaKernels.h"
#include "freeEnergyGpuTypes.h"
// for some reason, these are not being included w/ cudaKernels.h on Windows
//extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
#include <cmath>
#include <iostream>
#include <sstream>
#include <limits>
#include <cstring>
#include <vector>
#include <stdio.h>
#ifdef WIN32
#include <windows.h>
#else
#include <sys/time.h>
#endif
using std::vector;
extern "C"
freeEnergyGpuContext freeEnergyGpuInit( _gpuContext* gpu ){
// allocate and zero block
freeEnergyGpuContext freeEnergyGpu = new _freeEnergyGpuContext;
memset( freeEnergyGpu, 0, sizeof( struct _freeEnergyGpuContext ) );
freeEnergyGpu->gpuContext = gpu;
return freeEnergyGpu;
}
extern "C"
void gpuPrintCudaStream( std::string name,
unsigned int length, unsigned int subStreams, unsigned int stride,
unsigned int memoryFootprint,
void* pSysStream, void* pDevStream,
void* pSysData, void* pDevData, FILE* log)
{
(void) fprintf( log, " %-35s [%8u %5u %8u %8u] Stream[%p %p] Data[%16p %16p]\n",
name.c_str(), length, subStreams,
stride, memoryFootprint, pSysStream, pDevStream, pSysData, pDevData );
}
extern "C"
int gpuPrintCudaStreamFloat( CUDAStream<float>* cUDAStream, FILE* log )
{
if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( float ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*sizeof( float );
}
extern "C"
int gpuPrintCudaStreamFloat2( CUDAStream<float2>* cUDAStream, FILE* log )
{
if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( float2 ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( float );
}
extern "C"
int gpuPrintCudaStreamFloat4( CUDAStream<float4>* cUDAStream, FILE* log )
{
if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( float4 ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( float );
}
extern "C"
int gpuPrintCudaStreamUnsignedInt( CUDAStream<unsigned int>* cUDAStream, FILE* log )
{
if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*sizeof( unsigned int );
}
extern "C"
int gpuPrintCudaStreamInt( CUDAStream<int>* cUDAStream, FILE* log )
{
if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( int ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*sizeof( int );
}
extern "C"
int gpuPrintCudaStreamInt2( CUDAStream<int2>* cUDAStream, FILE* log )
{
if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( int2 ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*2*sizeof( int );
}
extern "C"
int gpuPrintCudaStreamInt4( CUDAStream<int4>* cUDAStream, FILE* log )
{
if( cUDAStream == NULL )return 0;
gpuPrintCudaStream( cUDAStream->_name.c_str(),
cUDAStream->_length, cUDAStream->_subStreams, cUDAStream->_stride,
cUDAStream->_length*cUDAStream->_subStreams*sizeof( int4 ),
static_cast<void*>(cUDAStream->_pSysStream), static_cast<void*>(cUDAStream->_pDevStream),
static_cast<void*>(cUDAStream->_pSysData), static_cast<void*>(cUDAStream->_pDevData), log );
return cUDAStream->_length*cUDAStream->_subStreams*4*sizeof( int );
}
extern "C"
void gpuPrintCudaFreeEnergyGmxSimulation(freeEnergyGpuContext freeEnergyGpu, FILE* log )
{
if( log == NULL )return;
_gpuContext* gpu = freeEnergyGpu->gpuContext;
int totalMemory = 0;
(void) fprintf( log, "cudaFreeEnergyGmxSimulation:\n\n" );
(void) fprintf( log, "\n" );
(void) fprintf( log, " numberOfAtoms %u\n", gpu->natoms );
(void) fprintf( log, " paddedNumberOfAtoms %u\n", gpu->sim.paddedNumberOfAtoms );
(void) fprintf( log, "\n\n" );
(void) fprintf( log, " gpuContext %p\n", freeEnergyGpu->gpuContext );
(void) fprintf( log, " log %p %s\n", freeEnergyGpu->log, freeEnergyGpu->log == stderr ? "is stderr" : "is not stderr");
(void) fprintf( log, " sm_version %u\n", gpu->sm_version );
(void) fprintf( log, " device %u\n", gpu->device );
(void) fprintf( log, " sharedMemoryPerBlock %u\n", gpu->sharedMemoryPerBlock );
(void) fprintf( log, " bOutputBufferPerWarp %d\n", gpu->bOutputBufferPerWarp );
(void) fprintf( log, " blocks %u\n", gpu->sim.blocks );
(void) fprintf( log, " threads_per_block %u\n", gpu->sim.threads_per_block);
(void) fprintf( log, " update_threads_per_block %u\n", gpu->sim.update_threads_per_block);
(void) fprintf( log, " nonbondBlocks %u\n", gpu->sim.nonbond_blocks );
(void) fprintf( log, " nonbondThreadsPerBlock %u\n", gpu->sim.nonbond_threads_per_block);
(void) fprintf( log, " bsf_reduce_threads_per_block %u\n", gpu->sim.bsf_reduce_threads_per_block);
(void) fprintf( log, " nonbondOutputBuffers %u\n", gpu->sim.nonbondOutputBuffers );
(void) fprintf( log, " outputBuffers %u\n", gpu->sim.outputBuffers );
totalMemory += gpuPrintCudaStreamFloat( freeEnergyGpu->gpuContext->psEnergy, log );
totalMemory += gpuPrintCudaStreamFloat4( freeEnergyGpu->gpuContext->psForce4, log );
(void) fflush( log );
}
extern "C"
void freeEnergyGpuShutDown( freeEnergyGpuContext freeEnergyGpu ){
if( freeEnergyGpu->log ){
(void) fprintf( freeEnergyGpu->log, "freeEnergyGpuShutDown called.\n" );
(void) fflush( freeEnergyGpu->log );
}
// free free energy Cuda arrays
delete freeEnergyGpu->psLJ14ID;
delete freeEnergyGpu->psLJ14Parameter;
delete freeEnergyGpu->psSigEps4;
delete freeEnergyGpu->psSwitchDerivative;
delete freeEnergyGpu->psNonPolarScalingFactors;
delete freeEnergyGpu;
return;
}
extern "C"
void freeEnergyGpuSetConstants( freeEnergyGpuContext freeEnergyGpu ){
if( freeEnergyGpu->log ){
(void) fprintf( freeEnergyGpu->log, "FreeEnergyGpuSetConstants called\n" );
(void) fflush( freeEnergyGpu->log );
}
SetCalculateLocalSoftcoreGpuSim( freeEnergyGpu );
SetCalculateCDLJSoftcoreGpuSim( freeEnergyGpu );
SetCalculateGBVISoftcoreBornSumGpuSim( freeEnergyGpu );
SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpu );
SetCalculateGBVISoftcoreForces2Sim( freeEnergyGpu );
SetCalculateObcGbsaSoftcoreBornSumSim( freeEnergyGpu );
SetCalculateObcGbsaSoftcoreForces2Sim( freeEnergyGpu );
}
extern "C"
void freeEnergyGpuSetPeriodicBoxSize( freeEnergyGpuContext freeEnergyGpu, float xsize, float ysize, float zsize)
{
freeEnergyGpu->freeEnergySim.periodicBoxSizeX = xsize;
freeEnergyGpu->freeEnergySim.periodicBoxSizeY = ysize;
freeEnergyGpu->freeEnergySim.periodicBoxSizeZ = zsize;
freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeX = 1.0f/xsize;
freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeY = 1.0f/ysize;
freeEnergyGpu->freeEnergySim.invPeriodicBoxSizeZ = 1.0f/zsize;
freeEnergyGpu->freeEnergySim.recipBoxSizeX = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeX;
freeEnergyGpu->freeEnergySim.recipBoxSizeY = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeY;
freeEnergyGpu->freeEnergySim.recipBoxSizeZ = 2.0f*PI/freeEnergyGpu->freeEnergySim.periodicBoxSizeZ;
freeEnergyGpu->freeEnergySim.cellVolume = freeEnergyGpu->freeEnergySim.periodicBoxSizeX*freeEnergyGpu->freeEnergySim.periodicBoxSizeY*freeEnergyGpu->freeEnergySim.periodicBoxSizeZ;
gpuSetPeriodicBoxSize( freeEnergyGpu->gpuContext, xsize, ysize, zsize );
}
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable: 4297)
#endif
extern "C"
void gpuSetNonbondedSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float epsfac, const std::vector<int>& atom, const std::vector<float>& c6,
const std::vector<float>& c12, const std::vector<float>& q,
const std::vector<float>& softcoreLJLambdaArray, const std::vector<char>& symbol,
const std::vector<std::vector<int> >& exclusions, CudaFreeEnergyNonbondedMethod method,
float cutoffDistance, float solventDielectric ) {
unsigned int numberOfParticles = c6.size();
gpuContext gpu = freeEnergyGpu->gpuContext;
int paddedNumberOfAtoms = gpu->sim.paddedNumberOfAtoms;
// sanity checks
if( paddedNumberOfAtoms < 1 ){
std::stringstream msg;
msg << "gpuSetNonbondedSoftcoreParameters: number of padded atoms=" << gpu->sim.paddedNumberOfAtoms << " is less than 1.";
throw OpenMM::OpenMMException( msg.str() );
}
if( freeEnergyGpu->gpuContext->sim.atoms != numberOfParticles ){
std::stringstream msg;
msg << "gpuSetNonbondedSoftcoreParameters: number of atoms in gpuContext does not match input count: " << freeEnergyGpu->gpuContext->sim.atoms << " " << numberOfParticles << ".";
throw OpenMM::OpenMMException( msg.str() );
}
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
freeEnergyGpu->freeEnergySim.epsfac = epsfac;
freeEnergyGpu->freeEnergySim.nonbondedMethod = method;
freeEnergyGpu->freeEnergySim.nonbondedCutoff = cutoffDistance;
freeEnergyGpu->freeEnergySim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
gpu->sim.nonbondedCutoff = cutoffDistance;
gpu->sim.nonbondedCutoffSqr = cutoffDistance*cutoffDistance;
if( cutoffDistance > 0.0f ){
freeEnergyGpu->freeEnergySim.reactionFieldK = pow(cutoffDistance, -3.0f)*(solventDielectric-1.0f)/(2.0f*solventDielectric+1.0f);
freeEnergyGpu->freeEnergySim.reactionFieldC = (1.0f / cutoffDistance)*(3.0f*solventDielectric)/(2.0f*solventDielectric+1.0f);
gpu->sim.reactionFieldK = freeEnergyGpu->freeEnergySim.reactionFieldK;
gpu->sim.reactionFieldC = freeEnergyGpu->freeEnergySim.reactionFieldC;
} else {
freeEnergyGpu->freeEnergySim.reactionFieldK = 0.0f;
freeEnergyGpu->freeEnergySim.reactionFieldC = 0.0f;
}
setExclusions( gpu, exclusions );
// parameters
freeEnergyGpu->psSigEps4 = new CUDAStream<float4>( paddedNumberOfAtoms, 1, "freeEnergyGpuSigEps4");
freeEnergyGpu->freeEnergySim.pSigEps4 = freeEnergyGpu->psSigEps4->_pDevData;
for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){
float p1 = 0.5f;
float p2 = 0.0f;
if( (c6[ii] > 0.0f) && (c12[ii] > 0.0f) ){
p1 = 0.5f * powf(c12[ii] / c6[ii], 1.0f / 6.0f);
p2 = c6[ii] * sqrtf(1.0f / c12[ii]);
}
/*
if (symbol.size() > 0)
freeEnergyGpu->pAtomSymbol[ii] = symbol[ii];
*/
(*freeEnergyGpu->psSigEps4)[ii].x = p1;
(*freeEnergyGpu->psSigEps4)[ii].y = p2;
(*freeEnergyGpu->psSigEps4)[ii].z = softcoreLJLambdaArray[ii];
(*freeEnergyGpu->psSigEps4)[ii].w = q[ii];
}
// Dummy out extra atom data
for( int ii = numberOfParticles; ii < paddedNumberOfAtoms; ii++ ){
(*freeEnergyGpu->psSigEps4)[ii].x = 1.0f;
(*freeEnergyGpu->psSigEps4)[ii].y = 0.0f;
(*freeEnergyGpu->psSigEps4)[ii].z = 0.0f;
(*freeEnergyGpu->psSigEps4)[ii].w = 0.0f;
(*gpu->psPosq4)[ii].x = 100000.0f + ii * 10.0f;
(*gpu->psPosq4)[ii].y = 100000.0f + ii * 10.0f;
(*gpu->psPosq4)[ii].z = 100000.0f + ii * 10.0f;
(*gpu->psPosq4)[ii].w = 0.0f;
}
if( freeEnergyGpu->log ){
(void) fprintf( freeEnergyGpu->log,"freeEnergyGpuSetNonbondedSoftcoreParameters: %5u padded=%u epsfac=%14.7e method=%d cutoffDistance=%9.2f solventDielectric=%9.2f\n",
numberOfParticles, freeEnergyGpu->gpuContext->sim.paddedNumberOfAtoms, epsfac, method, cutoffDistance, solventDielectric );
#ifdef PARAMETER_PRINT
int maxPrint = MAX_PARAMETER_PRINT;
for (unsigned int ii = 0; ii < numberOfParticles; ii++){
(void) fprintf( freeEnergyGpu->log,"%6u sig[%14.7e %14.7e] lambda=%10.3f q=%10.3f\n",
ii,
(*freeEnergyGpu->psSigEps4)[ii].x, (*freeEnergyGpu->psSigEps4)[ii].y, (*freeEnergyGpu->psSigEps4)[ii].z, (*freeEnergyGpu->psSigEps4)[ii].w );
if( ii == maxPrint && ii < freeEnergyGpu->gpuContext->sim.paddedNumberOfAtoms - maxPrint ){
ii = numberOfParticles - maxPrint;
}
}
unsigned int offset = paddedNumberOfAtoms - maxPrint;
if( offset > 0 ){
if( offset > numberOfParticles ){
(void) fprintf( freeEnergyGpu->log,"Dummy padded entries\n" );
for (int ii = offset; ii < paddedNumberOfAtoms; ii++){
(void) fprintf( freeEnergyGpu->log,"%6u sig[%14.7e %14.7e] lambda=%10.3f q=%10.3f\n",
ii,
(*freeEnergyGpu->psSigEps4)[ii].x, (*freeEnergyGpu->psSigEps4)[ii].y, (*freeEnergyGpu->psSigEps4)[ii].z, (*freeEnergyGpu->psSigEps4)[ii].w );
}
}
}
#endif
(void) fflush( freeEnergyGpu->log );
}
// upload data to board
freeEnergyGpu->psSigEps4->Upload();
gpu->psPosq4->Upload();
return;
}
/**---------------------------------------------------------------------------------------
Get threads/block
@param amoebaGpu amoebaGpuContext
@param sharedMemoryPerThread shared memory/thread
@param sharedMemoryPerBlock shared memory/block
@return threadsPerBlock
--------------------------------------------------------------------------------------- */
extern "C"
unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock )
{
unsigned int grid = freeEnergyGpu->gpuContext->grid;
unsigned int threadsPerBlock = (sharedMemoryPerBlock + grid -1)/(grid*sharedMemoryPerThread);
threadsPerBlock = threadsPerBlock < 1 ? 1 : threadsPerBlock;
threadsPerBlock *= grid;
return threadsPerBlock;
}
static int decodeCell( int cellCode, unsigned int* x, unsigned int* y, unsigned int* exclusion ){
*x = cellCode >> 17;
*y = (cellCode >> 2 ) & 0x7FFF;
*exclusion = (cellCode & 1) ? 1 : 0;
return 0;
}
void showWorkUnitsFreeEnergy( freeEnergyGpuContext freeEnergyGpu, int interactingWorkUnit ){
gpuContext gpu = freeEnergyGpu->gpuContext;
gpu->psWorkUnit->Download();
gpu->psInteractingWorkUnit->Download();
gpu->psInteractionFlag->Download();
unsigned int totalWarps = (gpu->sim.nonbond_blocks*gpu->sim.nonbond_threads_per_block)/GRID;
//unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
//unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int numWorkUnits = gpu->psInteractionCount->_pSysData[0];
(void) fprintf( stderr, "Total warps=%u blocks=%u threads=%u GRID=%u wus=%u warpOutput=%d\n",
totalWarps, gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block, GRID,
numWorkUnits, gpu->bOutputBufferPerWarp );
unsigned int maxPrint = 3;
std::stringstream message;
char buffer[2048];
unsigned int targetAtom = 18;
for( unsigned int ii = 0; ii < gpu->sim.nonbond_blocks; ii++ )
{
unsigned int blockId = ii;
for( unsigned int jj = 0; jj < gpu->sim.nonbond_threads_per_block; jj++ )
{
unsigned int warp = (ii*gpu->sim.nonbond_threads_per_block+jj)/GRID;
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int print = 0;
while( pos < end ){
unsigned int x, y, exclusion, flags;
int flagInt;
if( interactingWorkUnit ){
decodeCell( gpu->psInteractingWorkUnit->_pSysData[pos], &x, &y, &exclusion );
flags = gpu->psInteractionFlag->_pSysData[pos];
if( flags == 0xFFFFFFFF ){
flagInt = -2;
} else {
flagInt = flags;
}
} else {
decodeCell( gpu->psWorkUnit->_pSysData[pos], &x, &y, &exclusion );
flagInt = -1;
}
x *= GRID;
y *= GRID;
unsigned int bufferX = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (x >> GRIDBITS) * gpu->sim.stride;
unsigned int bufferY = gpu->bOutputBufferPerWarp ? warp*gpu->sim.stride : (y >> GRIDBITS) * gpu->sim.stride;
bufferX /= gpu->sim.paddedNumberOfAtoms;
bufferY /= gpu->sim.paddedNumberOfAtoms;
if( jj == 1 ){
//if( bufferX >= 62 || bufferY >= 62 ){
(void) sprintf( buffer, "Block %4u thread %4u warp=%4u pos[%4u %4u] bufXY[%4u %4u]", ii, jj, warp, pos, end, bufferX, bufferY );
message << buffer;
(void) sprintf( buffer, " x[%4u %4u] y[%4u %4u] excl=%u", x, x+32, y, y+32, exclusion );
message << buffer;
if( interactingWorkUnit ){
(void) sprintf( buffer, " Flg=%d (-2=all) %u", flagInt, flags );
}
message << buffer;
message << std::endl;
}
pos++;
}
}
}
(void) fprintf( stderr, "%s\n\n", message.str().c_str() );
}
#ifndef FREE_ENERGY_CUDA_TYPES_H
#define FREE_ENERGY_CUDA_TYPES_H
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include <kernels/cudatypes.h>
#include <stdarg.h>
#include <limits>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <builtin_types.h>
#include <vector_functions.h>
enum CudaFreeEnergyNonbondedMethod
{
FREE_ENERGY_NO_CUTOFF,
FREE_ENERGY_CUTOFF,
FREE_ENERGY_PERIODIC
};
struct cudaFreeEnergyGmxSimulation {
// Constants
unsigned int LJ14_count; // LJ count
int4* pLJ14ID; // LJ 14 particles ids
float4* pLJ14Parameter; // LJ 14 parameters
float epsfac; // Epsilon factor for CDLJ calculations
CudaFreeEnergyNonbondedMethod nonbondedMethod; // How to handle nonbonded interactions
float nonbondedCutoff; // Cutoff distance for nonbonded interactions
float nonbondedCutoffSqr; // Square of the cutoff distance for nonbonded interactions
float periodicBoxSizeX; // The X dimension of the periodic box
float periodicBoxSizeY; // The Y dimension of the periodic box
float periodicBoxSizeZ; // The Z dimension of the periodic box
float invPeriodicBoxSizeX; // The 1 over the X dimension of the periodic box
float invPeriodicBoxSizeY; // The 1 over the Y dimension of the periodic box
float invPeriodicBoxSizeZ; // The 1 over the Z dimension of the periodic box
float recipBoxSizeX; // The X dimension of the reciprocal box for Ewald summation
float recipBoxSizeY; // The Y dimension of the reciprocal box for Ewald summation
float recipBoxSizeZ; // The Z dimension of the reciprocal box for Ewald summation
float cellVolume; // Ewald parameter alpha (a.k.a. kappa)
float reactionFieldK; // Constant for reaction field correction
float reactionFieldC; // Constant for reaction field correction
float4* pSigEps4; // sigma, eps, lambda. charge
int bornRadiiScalingMethod; // flag for method to use scaling radii (0=none,1=quintic spline)
float quinticLowerLimitFactor; // lower limit factor for quintic spline
float quinticUpperLimit; // upper limit for quintic spline
float* pSwitchDerivative; // switch deriviatives for quintic spline
float* pNonPolarScalingFactors; // non-polar scaling factors
};
#endif // FREE_ENERGY_CUDA_TYPES_H
#ifndef __FREE_ENERGY_GPUTYPES_H__
#define __FREE_ENERGY_GPUTYPES_H__
/* -------------------------------------------------------------------------- *
* OpenMMFreeEnergy *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "kernels/gputypes.h"
#include "freeEnergyCudaTypes.h"
#include <map>
typedef std::map<int,float> MapIntFloat;
typedef MapIntFloat::const_iterator MapIntFloatCI;
struct _freeEnergyGpuContext {
_gpuContext* gpuContext;
cudaFreeEnergyGmxSimulation freeEnergySim;
std::vector<std::vector<int> > exclusions;
CUDAStream<float4>* psSigEps4;
CUDAStream<int4>* psLJ14ID;
CUDAStream<float4>* psLJ14Parameter;
CUDAStream<float>* psSwitchDerivative;
CUDAStream<float>* psNonPolarScalingFactors;
FILE* log;
};
typedef struct _freeEnergyGpuContext *freeEnergyGpuContext;
// Function prototypes
extern "C" freeEnergyGpuContext freeEnergyGpuInit( _gpuContext* gpu );
extern "C" void freeEnergyGpuShutDown(freeEnergyGpuContext gpu);
extern "C" void freeEnergyGpuSetConstants(freeEnergyGpuContext gpu);
extern "C" unsigned int getThreadsPerBlockFEP( freeEnergyGpuContext freeEnergyGpu, unsigned int sharedMemoryPerThread, unsigned int sharedMemoryPerBlock );
#endif // __FREE_ENERGY_GPUTYPES_H__
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "kernels/gputypes.h"
#include "kernels/cudatypes.h"
#include "kernels/cudaKernels.h"
#include "GpuFreeEnergyCudaKernels.h"
#include "openmm/OpenMMException.h"
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <iostream>
#include <sstream>
#define USE_SOFTCORE_LJ
struct Atom {
float x;
float y;
float z;
float q;
float sig;
float eps;
float br;
float softCoreLJLambda;
float fx;
float fy;
float fz;
float fb;
};
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaFreeEnergyGmxSimulation feSimDev;
void SetCalculateCDLJObcGbsaSoftcoreGpu1Sim( freeEnergyGpuContext freeEnergyGpu ){
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJObcGbsaSoftcoreGpu1Sim copy to cSim failed");
status = cudaMemcpyToSymbol( feSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJObcGbsaSoftcoreGpu1Sim copy to feSimDev failed");
}
// Include versions of the kernel for N^2 calculations with softcore LJ.
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2##b
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_SOFTCORE_LJ
#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
#undef USE_SOFTCORE_LJ
#undef USE_OUTPUT_BUFFER_PER_WARP
// Include versions of the kernel with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
// Include versions of the kernel with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateCDLJObcGbsaSoftcoreForces1.h"
/**
*
* Calculate Born radii and first GBSA loop forces/energy
*
* @param gpu gpu context
*
*/
void kCalculateCDLJObcGbsaSoftcoreForces1( freeEnergyGpuContext freeEnergyGpu )
{
unsigned int threadsPerBlock;
static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
static unsigned int natoms[3] = { 0, 0, 0 };
gpuContext gpu = freeEnergyGpu->gpuContext;
unsigned int methodIndex = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
if( methodIndex > 2 ){
throw OpenMM::OpenMMException( "kCalculateCDLJObcGbsaSoftcoreForces1 method index invalid." );
}
if( natoms[methodIndex] != gpu->natoms ){
unsigned int extra = methodIndex == 0 ? 0 : sizeof(float);
threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
natoms[methodIndex] = gpu->natoms;
}
threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
switch( freeEnergyGpu->freeEnergySim.nonbondedMethod )
{
case FREE_ENERGY_NO_CUTOFF:
// use softcore LJ potential
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcoreN2ByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit );
else
kCalculateCDLJObcGbsaSoftcoreN2Forces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit );
LAUNCHERROR("kCalculateCDLJObcGbsaSoftcoreForces1");
break;
case FREE_ENERGY_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcoreCutoffByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else
kCalculateCDLJObcGbsaSoftcoreCutoffForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
LAUNCHERROR("kCalculateCDLJObcGbsaSoftcoreCutoffForces1");
break;
case FREE_ENERGY_PERIODIC:
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJObcGbsaSoftcorePeriodicByWarpForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJObcGbsaSoftcorePeriodicForces1_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJObcGbsaSoftcorePeriodicForces1");
break;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for evalauating nonbonded forces and the first stage of GBSA.
* It is included several times in kCalculateCDLJObcGbsaForces1.cu with different #defines to generate
* different versions of the kernels.
*/
#define USE_SOFTCORE_LJ
#ifdef USE_SOFTCORE_LJ
#include "kSoftcoreLJ.h"
#endif
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
#ifdef DEBUG
void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit, float4* pdE1, float4* pdE2 )
#else
void METHOD_NAME(kCalculateCDLJObcGbsaSoftcore, Forces1_kernel)(unsigned int* workUnit )
#endif
{
extern __shared__ Atom sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
float CDLJObcGbsa_energy;
float energy = 0.0f;
#ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[blockDim.x];
#endif
unsigned int lasty = -0xFFFFFFFF;
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float4 a = feSimDev.pSigEps4[i];
float softCoreLJLambda = a.z;
float br = cSim.pBornRadii[i];
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
float4 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
af.w = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = a.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].br = br;
sA[threadIdx.x].softCoreLJLambda = softCoreLJLambda;
float q2 = cSim.preFactor*a.w;
a.w *= cSim.epsfac;
if (!bExclusionFlag)
{
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float invR = 1.0f / sqrtf(r2);
float sig = a.x + psA[j].sig;
float eps = a.y * psA[j].eps;
#ifdef USE_SOFTCORE_LJ
float dEdR = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
#else
// CDLJ part
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#endif
#ifdef USE_CUTOFF
dEdR += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
CDLJObcGbsa_energy += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
#else
float factorX = a.w * psA[j].q * invR;
dEdR += factorX;
CDLJObcGbsa_energy += factorX;
#endif
dEdR *= invR * invR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = expf(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrtf(denominator2);
float Gpol = (q2 * psA[j].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
#ifdef USE_CUTOFF
if ( i >= cSim.atoms || (x+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if ( i >= cSim.atoms || (x+j) >= cSim.atoms)
#endif
{
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
}
af.w += dGpol_dalpha2_ij * psA[j].br;
energy += 0.5f*CDLJObcGbsa_energy;
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
} else {
unsigned int xi = x>>GRIDBITS;
unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float invR = 1.0f / sqrtf(r2);
float sig = a.x + psA[j].sig;
float eps = a.y * psA[j].eps;
#ifdef USE_SOFTCORE_LJ
float dEdR = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
//float dEdR = getSoftCoreLJMod( (invR*sig), eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
#else
// CDLJ part
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#endif
#ifdef USE_CUTOFF
dEdR += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
CDLJObcGbsa_energy += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
#else
float factorX = a.w * psA[j].q * invR;
dEdR += factorX;
CDLJObcGbsa_energy += factorX;
#endif
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = expf(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrtf(denominator2);
float Gpol = (q2 * psA[j].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
#if defined USE_CUTOFF
if (i >= cSim.atoms || x+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || x+j >= cSim.atoms )
#endif
{
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
}
af.w += dGpol_dalpha2_ij * psA[j].br;
energy += 0.5f*CDLJObcGbsa_energy;
// Add Forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
float4 of = cSim.pForce4[offset];
float bf = cSim.pBornForce[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
bf += af.w;
cSim.pForce4[offset] = of;
cSim.pBornForce[offset] = bf;
} else {
// Read fixed atom data into registers and GRF
if (lasty != y)
{
unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j];
float4 temp1 = feSimDev.pSigEps4[j];
sA[threadIdx.x].br = cSim.pBornRadii[j];
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp1.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].softCoreLJLambda = temp1.z;
}
sA[threadIdx.x].fx = 0.0f;
sA[threadIdx.x].fy = 0.0f;
sA[threadIdx.x].fz = 0.0f;
sA[threadIdx.x].fb = 0.0f;
float q2 = a.w * cSim.preFactor;
a.w *= cSim.epsfac;
if (!bExclusionFlag)
{
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
//else if (flags == 0xFFFFFFFF)
else if (flags)
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float invR = 1.0f / sqrtf(r2);
float sig = a.x + psA[tj].sig;
float eps = a.y * psA[tj].eps;
#ifdef USE_SOFTCORE_LJ
float dEdR = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJObcGbsa_energy );
#else
// CDLJ part
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#endif
#ifdef USE_CUTOFF
dEdR += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
CDLJObcGbsa_energy += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
#else
float factorX = a.w * psA[tj].q * invR;
dEdR += factorX;
CDLJObcGbsa_energy += factorX;
#endif
dEdR *= invR * invR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = expf(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrtf(denominator2);
float Gpol = (q2 * psA[tj].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[tj].q) / denominator;
#ifdef USE_CUTOFF
if ( i >= cSim.atoms || (y+tj) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if ( i >= cSim.atoms || (y+tj) >= cSim.atoms)
#endif
{
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
}
psA[tj].fb += dGpol_dalpha2_ij * br;
af.w += dGpol_dalpha2_ij * psA[tj].br;
energy += CDLJObcGbsa_energy;
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = (tj + 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else {
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float invR = 1.0f / sqrtf(r2);
float sig = a.x + psA[j].sig;
float eps = a.y * psA[j].eps;
#ifdef USE_SOFTCORE_LJ
float dEdR = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJObcGbsa_energy );
#else
// CDLJ part
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#endif
#ifdef USE_CUTOFF
dEdR += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
CDLJObcGbsa_energy += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
#else
float factorX = a.w * psA[j].q * invR;
dEdR += factorX;
CDLJObcGbsa_energy += factorX;
#endif
dEdR *= invR * invR;
// ObcGbsaForce1 part
float alpha2_ij = br * psA[j].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = expf(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrtf(denominator2);
float Gpol = (q2 * psA[j].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[j].q) / denominator;
#ifdef USE_CUTOFF
if ( i >= cSim.atoms || (y+j) >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if ( i >= cSim.atoms || (y+j) >= cSim.atoms)
#endif
{
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
}
af.w += dGpol_dalpha2_ij * psA[j].br;
// Sum the Born forces.
tempBuffer[threadIdx.x] = dGpol_dalpha2_ij * br;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fb += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
energy += CDLJObcGbsa_energy;
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
tempBuffer[threadIdx.x] = dx;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fx += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
tempBuffer[threadIdx.x] = dy;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fy += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
tempBuffer[threadIdx.x] = dz;
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].fz += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
}
}
}
#endif
} else {
unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float invR = 1.0f / sqrtf(r2);
float sig = a.x + psA[tj].sig;
float eps = a.y * psA[tj].eps;
#ifdef USE_SOFTCORE_LJ
float dEdR = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJObcGbsa_energy );
#else
// CDLJ part
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJObcGbsa_energy = eps * (sig6 - 1.0f) * sig6;
#endif
#ifdef USE_CUTOFF
dEdR += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
CDLJObcGbsa_energy += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
#else
float factorX = a.w * psA[tj].q * invR;
dEdR += factorX;
CDLJObcGbsa_energy += factorX;
#endif
dEdR *= invR * invR;
if (!(excl & 0x1))
{
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
}
// ObcGbsaForce1 part
float alpha2_ij = br * psA[tj].br;
float D_ij = r2 / (4.0f * alpha2_ij);
float expTerm = expf(-D_ij);
float denominator2 = r2 + alpha2_ij * expTerm;
float denominator = sqrtf(denominator2);
float Gpol = (q2 * psA[tj].q) / (denominator * denominator2);
float dGpol_dalpha2_ij = -0.5f * Gpol * expTerm * (1.0f + D_ij);
dEdR += Gpol * (1.0f - 0.25f * expTerm);
CDLJObcGbsa_energy += (q2 * psA[tj].q) / denominator;
#if defined USE_CUTOFF
if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || y+tj >= cSim.atoms)
#endif
{
dEdR = 0.0f;
CDLJObcGbsa_energy = 0.0f;
dGpol_dalpha2_ij = 0.0f;
}
af.w += dGpol_dalpha2_ij * psA[tj].br;
psA[tj].fb += dGpol_dalpha2_ij * br;
energy += CDLJObcGbsa_energy;
// Add forces
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = (tj + 1) & (GRID - 1);
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif
float4 of = cSim.pForce4[offset];
float bf = cSim.pBornForce[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
bf += af.w;
cSim.pForce4[offset] = of;
cSim.pBornForce[offset] = bf;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset = y + tgx + warp*cSim.stride;
#else
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
bf = cSim.pBornForce[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
bf += sA[threadIdx.x].fb;
cSim.pForce4[offset] = of;
cSim.pBornForce[offset] = bf;
lasty = y;
}
pos++;
}
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "GpuFreeEnergyCudaKernels.h"
#include "freeEnergyGpuTypes.h"
#include "openmm/OpenMMException.h"
#include <cuda.h>
#include <iostream>
#include <sstream>
#define PARAMETER_PRINT 0
#define MAX_PARAMETER_PRINT 10
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaFreeEnergyGmxSimulation gbviSimDev;
void SetCalculateGBVISoftcoreBornSumGpuSim( freeEnergyGpuContext freeEnergyGpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol( cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreBornSumGpuSim copy to cSim failed");
status = cudaMemcpyToSymbol( gbviSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreBornSumGpuSim copy to feSim failed");
}
// create, initialize and enter BornRadiusScaleFactors values (used to scale contribution of atoms to Born sum of other atoms)
extern "C"
void gpuSetGBVISoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float innerDielectric, float solventDielectric, const std::vector<int>& atom,
const std::vector<float>& radius, const std::vector<float>& gamma,
const std::vector<float>& scaledRadii, const std::vector<float>& bornRadiusScaleFactors,
const std::vector<float>& quinticSplineParameters ){
unsigned int numberOfParticles = radius.size();
gpuContext gpu = freeEnergyGpu->gpuContext;
static const float electricConstant = -166.02691f;
double tau = ((1.0f/innerDielectric)-(1.0f/solventDielectric));
freeEnergyGpu->psSwitchDerivative = new CUDAStream<float>( gpu->sim.paddedNumberOfAtoms, 1, "SwitchDerivative");
freeEnergyGpu->freeEnergySim.pSwitchDerivative = freeEnergyGpu->psSwitchDerivative->_pDevData;
// create gpuGBVISoftcore, load parameters, and track minimum softcore value
// gpuGBVISoftcore is not really being used (it was in the initial implementation) --
// will be removed in future once confirmed not needed
// check if quintic scaling to be applied
if( quinticSplineParameters.size() == 2 ){
freeEnergyGpu->freeEnergySim.bornRadiiScalingMethod = 1;
freeEnergyGpu->freeEnergySim.quinticLowerLimitFactor = quinticSplineParameters[0];
freeEnergyGpu->freeEnergySim.quinticUpperLimit = quinticSplineParameters[1];
} else {
freeEnergyGpu->freeEnergySim.bornRadiiScalingMethod = 0;
freeEnergyGpu->freeEnergySim.quinticLowerLimitFactor = 0.8f;
freeEnergyGpu->freeEnergySim.quinticUpperLimit = 5.0f;
}
for( unsigned int ii = 0; ii < bornRadiusScaleFactors.size(); ii++ ){
(*gpu->psGBVIData)[ii].x = radius[ii];
(*gpu->psGBVIData)[ii].y = scaledRadii[ii];
(*gpu->psGBVIData)[ii].z = tau*gamma[ii];
(*gpu->psGBVIData)[ii].w = bornRadiusScaleFactors[ii];
(*gpu->psBornRadii)[ii] = 0.0f;
(*freeEnergyGpu->psSwitchDerivative)[ii] = 0.0f;
}
// Dummy out extra atom data
for( unsigned int ii = bornRadiusScaleFactors.size(); ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(*gpu->psGBVIData)[ii].x = 0.01f;
(*gpu->psGBVIData)[ii].y = 0.01f;
(*gpu->psGBVIData)[ii].z = 0.0f;
(*gpu->psGBVIData)[ii].w = 0.0f;
(*gpu->psBornRadii)[ii] = 0.0f;
(*freeEnergyGpu->psSwitchDerivative)[ii] = 0.0f;
}
gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
// diagnostics
if( freeEnergyGpu->log ){
(void) fprintf( freeEnergyGpu->log,"GBVISoftcore: part.=%u padded=%u sclMeth=%d\n",
static_cast<unsigned int>(bornRadiusScaleFactors.size()), static_cast<unsigned int>(gpu->sim.paddedNumberOfAtoms),
freeEnergyGpu->freeEnergySim.bornRadiiScalingMethod );
if( quinticSplineParameters.size() == 2 ){
(void) fprintf( freeEnergyGpu->log,"QuinticScaling: LwFct=%8.3f UpLmt=[%12.5e (nm) %12.5e]\n",
freeEnergyGpu->freeEnergySim.quinticLowerLimitFactor,
powf( freeEnergyGpu->freeEnergySim.quinticUpperLimit, -0.3333333f ), freeEnergyGpu->freeEnergySim.quinticUpperLimit );
}
(void) fprintf( freeEnergyGpu->log, "gpuSetGBVISoftcoreParameters: preFactor=%14.6e elecCnstnt=%.4f frcCnvrsnFctr=%.4f tau=%.4f.\n",
gpu->sim.preFactor, 2.0f*electricConstant, gpu->sim.forceConversionFactor, ((1.0f/innerDielectric)-(1.0f/solventDielectric)) );
#ifdef PARAMETER_PRINT
int maxPrint = MAX_PARAMETER_PRINT;
(void) fprintf( freeEnergyGpu->log, " radius scaled radius tau*gamma lambda\n" );
for( unsigned int ii = 0; ii < bornRadiusScaleFactors.size(); ii++ ){
(void) fprintf( freeEnergyGpu->log,"%6u %14.7e %14.7e %14.7e %14.7e\n",
ii, (*gpu->psGBVIData)[ii].x, (*gpu->psGBVIData)[ii].y, (*gpu->psGBVIData)[ii].z, (*gpu->psGBVIData)[ii].w );
if( ii == maxPrint ){
ii = bornRadiusScaleFactors.size() - maxPrint;
if( ii < maxPrint )ii = maxPrint;
}
}
unsigned int offset = gpu->sim.paddedNumberOfAtoms - MAX_PARAMETER_PRINT;
if( offset > 0 && gpu->sim.paddedNumberOfAtoms > bornRadiusScaleFactors.size() ){
for( unsigned int ii = offset; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( freeEnergyGpu->log,"%6u %14.7e %14.7e %14.7e %14.7e\n",
ii, (*gpu->psGBVIData)[ii].x, (*gpu->psGBVIData)[ii].y, (*gpu->psGBVIData)[ii].z, (*gpu->psGBVIData)[ii].w );
}
}
#endif
}
gpu->psGBVIData->Upload();
gpu->psBornRadii->Upload();
freeEnergyGpu->psSwitchDerivative->Upload();
return;
}
struct Atom {
float x;
float y;
float z;
float r;
float sr;
float sum;
float gamma;
float bornRadiusScaleFactor;
};
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kClearGBVISoftcoreBornSum_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
{
((float*)cSim.pBornSum)[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
void kClearGBVISoftcoreBornSum(gpuContext gpu) {
kClearGBVISoftcoreBornSum_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVISoftcoreBornForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f;
while (pos < cSim.atoms)
{
float bornRadius = cSim.pBornRadii[pos];
float4 gbviData = cSim.pGBVIData[pos];
float totalForce = 0.0f;
float* pFt = cSim.pBornForce + pos;
int i = cSim.nonbondOutputBuffers;
while (i >= 4)
{
float f1 = *pFt;
pFt += cSim.stride;
float f2 = *pFt;
pFt += cSim.stride;
float f3 = *pFt;
pFt += cSim.stride;
float f4 = *pFt;
pFt += cSim.stride;
totalForce += f1 + f2 + f3 + f4;
i -= 4;
}
if (i >= 2)
{
float f1 = *pFt;
pFt += cSim.stride;
float f2 = *pFt;
pFt += cSim.stride;
totalForce += f1 + f2;
i -= 2;
}
if (i > 0)
{
totalForce += *pFt;
}
float ratio = (gbviData.x/bornRadius);
float ratio3 = ratio*ratio*ratio;
energy -= gbviData.z*ratio3; // gbviData.z = gamma*tau
totalForce += (3.0f*gbviData.z*ratio3)/bornRadius; // 'cavity' term
float br2 = bornRadius*bornRadius;
totalForce *= (1.0f/3.0f)*br2*br2;
pFt = cSim.pBornForce + pos;
*pFt = totalForce;
pos += gridDim.x * blockDim.x;
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
}
void kReduceGBVISoftcoreBornForces( freeEnergyGpuContext freeEnergyGpu )
{
gpuContext gpu = freeEnergyGpu->gpuContext;
kReduceGBVISoftcoreBornForces_kernel<<<gpu->sim.blocks, gpu->sim.bf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceGBVISoftcoreBornForces");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVISoftcoreBornSum_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
while (pos < cSim.atoms)
{
float sum = 0.0f;
float* pSt = cSim.pBornSum + pos;
float4 atom = cSim.pGBVIData[pos];
// Get summed Born data
for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
{
sum += *pSt;
pSt += cSim.stride;
}
// Now calculate Born radius
float Rinv = 1.0f/atom.x;
sum = Rinv*Rinv*Rinv - sum;
cSim.pBornRadii[pos] = pow( sum, (-1.0f/3.0f) );
pos += gridDim.x * blockDim.x;
}
}
void kReduceGBVISoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
{
gpuContext gpu = freeEnergyGpu->gpuContext;
kReduceGBVISoftcoreBornSum_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
LAUNCHERROR("kReduceGBVISoftcoreBornSum");
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateGBVISoftcoreBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateGBVISoftcoreBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateGBVISoftcoreBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateGBVISoftcoreBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateGBVISoftcoreBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateGBVISoftcoreBornSum.h"
__device__ void quinticSpline( float x, float rl, float ru, float* outValue, float* outDerivative )
{
float numerator = x - rl;
float denominator = ru - rl;
float ratio = numerator/denominator;
float ratio2 = ratio*ratio;
float ratio3 = ratio2*ratio;
*outValue = 1.0f + ratio3*(-10.f + 3.0f*ratio*(5.0f - 2.0f*ratio));
*outDerivative = -30.0f*ratio2*( 1.0f + ratio*(ratio - 2.0f))/denominator;
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVIBornSumQuinticScaling_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
while (pos < cSim.atoms)
{
float sum = 0.0f;
float* pSt = cSim.pBornSum + pos;
float4 atom = cSim.pGBVIData[pos];
// Get summed Born data
for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
{
sum += *pSt;
pSt += cSim.stride;
}
// Now calculate Born radius
float Rinv = 1.0f/atom.x;
float r3 = Rinv*Rinv*Rinv;
float splineL = gbviSimDev.quinticLowerLimitFactor*r3;
float switchDeriviative;
if( sum > splineL ){
if( sum < r3 ){
float splineValue, splineDerivative;
quinticSpline( sum, splineL, r3, &splineValue, &splineDerivative );
switchDeriviative = splineValue - (r3 - sum)*splineDerivative;
sum = (r3 - sum)*splineValue + gbviSimDev.quinticUpperLimit;
} else {
sum = gbviSimDev.quinticUpperLimit;
switchDeriviative = 0.0f;
}
} else {
sum = r3 - sum;
switchDeriviative = 1.0f;
}
cSim.pBornRadii[pos] = pow( sum, (-1.0f/3.0f) );
gbviSimDev.pSwitchDerivative[pos] = switchDeriviative;
pos += gridDim.x * blockDim.x;
}
}
void kReduceGBVIBornSumQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
{
gpuContext gpu = freeEnergyGpu->gpuContext;
kReduceGBVIBornSumQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
LAUNCHERROR("kReduceGBVIBornSumQuinticScaling_kernel");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceGBVIBornForcesQuinticScaling_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f;
while (pos < cSim.atoms)
{
float bornRadius = cSim.pBornRadii[pos];
float4 gbviData = cSim.pGBVIData[pos];
float switchDeriv = gbviSimDev.pSwitchDerivative[pos];
float totalForce = 0.0f;
float* pFt = cSim.pBornForce + pos;
int i = cSim.nonbondOutputBuffers;
while (i >= 4)
{
float f1 = *pFt;
pFt += cSim.stride;
float f2 = *pFt;
pFt += cSim.stride;
float f3 = *pFt;
pFt += cSim.stride;
float f4 = *pFt;
pFt += cSim.stride;
totalForce += f1 + f2 + f3 + f4;
i -= 4;
}
if (i >= 2)
{
float f1 = *pFt;
pFt += cSim.stride;
float f2 = *pFt;
pFt += cSim.stride;
totalForce += f1 + f2;
i -= 2;
}
if (i > 0)
{
totalForce += *pFt;
}
float ratio = (gbviData.x/bornRadius);
float ratio3 = ratio*ratio*ratio;
energy -= gbviData.z*ratio3;
totalForce += (3.0f*gbviData.z*ratio3)/bornRadius; // 'cavity' term
float br2 = bornRadius*bornRadius;
totalForce *= (1.0f/3.0f)*br2*br2*switchDeriv;
pFt = cSim.pBornForce + pos;
*pFt = totalForce;
pos += gridDim.x * blockDim.x;
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
}
void kReduceGBVIBornForcesQuinticScaling( freeEnergyGpuContext freeEnergyGpu )
{
gpuContext gpu = freeEnergyGpu->gpuContext;
//(void) fprintf( stderr, "kReduceObcGbsaBornForces %6d blks=%u bsf_reduce_threads_per_block=%5u %5u %5u %5u %5u\n",
// gpu->natoms, gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block, gpu->sim.bf_reduce_threads_per_block,
// GF1XX_THREADS_PER_BLOCK, GT2XX_THREADS_PER_BLOCK, G8X_THREADS_PER_BLOCK); fflush( stderr );
kReduceGBVIBornForcesQuinticScaling_kernel<<<gpu->sim.blocks, gpu->sim.bsf_reduce_threads_per_block>>>();
LAUNCHERROR("kReduceGBVIBornForcesQuinticScaling");
}
void kPrintGBVISoftcore( freeEnergyGpuContext freeEnergyGpu, std::string callId, int call, FILE* log)
{
gpuContext gpu = freeEnergyGpu->gpuContext;
//int maxPrint = gpu->natoms;
gpu->psGBVIData->Download();
gpu->psBornRadii->Download();
gpu->psBornForce->Download();
gpu->psPosq4->Download();
CUDAStream<float>* switchDeriviative = freeEnergyGpu->psSwitchDerivative;
CUDAStream<float4>* sigEps4 = freeEnergyGpu->psSigEps4;
switchDeriviative->Download();
sigEps4->Download();
(void) fprintf( log, "kPrintGBVISoftcore Cuda Softcore bR bF swd prm sigeps4\n" );
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( log, "%6d %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e %15.7e \n",
ii,
gpu->psBornRadii->_pSysData[ii],
gpu->psBornForce->_pSysData[ii],
switchDeriviative->_pSysData[ii],
gpu->psGBVIData->_pSysData[ii].x,
gpu->psGBVIData->_pSysData[ii].y,
gpu->psGBVIData->_pSysData[ii].z,
gpu->psGBVIData->_pSysData[ii].w,
sigEps4->_pSysData[ii].x,
sigEps4->_pSysData[ii].y,
sigEps4->_pSysData[ii].z,
sigEps4->_pSysData[ii].w );
}
}
extern __global__ void kFindBlockBoundsCutoff_kernel();
extern __global__ void kFindBlockBoundsPeriodic_kernel();
extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
void kCalculateGBVISoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
{
unsigned int threadsPerBlock;
static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
static unsigned int natoms[3] = { 0, 0, 0 };
gpuContext gpu = freeEnergyGpu->gpuContext;
unsigned int methodIndex = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
if( methodIndex > 2 ){
throw OpenMM::OpenMMException( "kCalculateGBVISoftcoreBornSum method index invalid." );
}
if( natoms[methodIndex] != gpu->natoms ){
unsigned int extra = methodIndex == 0 ? 0 : sizeof(float);
threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
natoms[methodIndex] = gpu->natoms;
}
threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
kClearGBVISoftcoreBornSum( gpu );
LAUNCHERROR("kClearGBVIBornSum from kCalculateGBVISoftcoreBornSum");
switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
{
case FREE_ENERGY_NO_CUTOFF:
if (gpu->bOutputBufferPerWarp){
kCalculateGBVISoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
} else {
kCalculateGBVISoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
}
break;
case FREE_ENERGY_CUTOFF:
kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsCutoff");
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(unsigned int)*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateGBVISoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
break;
case FREE_ENERGY_PERIODIC:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
sizeof(unsigned int)*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else
kCalculateGBVISoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
break;
default:
throw OpenMM::OpenMMException( "Nonbonded softcore method not recognized." );
}
LAUNCHERROR("kCalculateGBVISoftcoreBornSum");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for calculating Born sums. It is included
* several times in kCalculateGBVIBornSum.cu with different #defines to generate
* different versions of the kernels.
*/
#include "kCalculateGBVIAux.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateGBVISoftcore, BornSum_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif
while ( pos < end )
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
float dx;
float dy;
float dz;
float r2;
float r;
// forces tgx into interval [0,31]
// forces tbx 0
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float4 ar = cSim.pGBVIData[i]; // Local atom vr, sr
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].r = ar.x;
sA[threadIdx.x].sr = ar.y;
sA[threadIdx.x].bornRadiusScaleFactor = ar.w;
float bSum = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#if defined USE_CUTOFF
if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr && j != tgx)
#else
if (i < cSim.atoms && x+j < cSim.atoms && j != tgx )
#endif
{
bSum += psA[j].bornRadiusScaleFactor*getGBVI_Volume( sqrtf(r2), ar.x, psA[j].sr );
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += bSum;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = bSum;
#endif
} else {
// Read fixed atom data into registers and GRF
unsigned int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float4 temp1 = cSim.pGBVIData[j];
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float4 ar = cSim.pGBVIData[i]; // Local atom vr, sr
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].bornRadiusScaleFactor = temp1.w;
sA[threadIdx.x].sum = 0.0f;
apos.w = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
//else if (flags )
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_CUTOFF
if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#else
if (i < cSim.atoms && y+tj < cSim.atoms )
#endif
{
r = sqrtf(r2);
// psA[tj].sr = Sj
// ar.x = Ri
apos.w += psA[tj].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[tj].sr );
psA[tj].sum += ar.w*getGBVI_Volume( r, psA[tj].r, ar.y );
}
tj = (tj - 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
tempBuffer[threadIdx.x] = 0.0f;
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_CUTOFF
if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#else
if (i < cSim.atoms && y+j < cSim.atoms)
#endif
{
r = sqrtf(r2);
tempBuffer[threadIdx.x] = ar.w*getGBVI_Volume( r, psA[j].r, ar.y );
apos.w += psA[j].bornRadiusScaleFactor*getGBVI_Volume( r, ar.x, psA[j].sr );
}
// Sum the terms.
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
}
}
}
#endif
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += apos.w;
offset = y + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += sA[threadIdx.x].sum;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = sA[threadIdx.x].sum;
#endif
}
pos++;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
#include "GpuFreeEnergyCudaKernels.h"
#include "openmm/OpenMMException.h"
#include <stdio.h>
#include <cuda.h>
#include <vector_functions.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
using namespace std;
struct Atom {
float x;
float y;
float z;
float r;
float sr;
float fx;
float fy;
float fz;
float fb;
float bornRadiusScaleFactor;
};
static __constant__ cudaGmxSimulation cSim;
void SetCalculateGBVISoftcoreForces2Sim( freeEnergyGpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->gpuContext->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetCalculateGBVISoftcoreForces2Sim copy to cSim failed");
}
#include "kCalculateGBVIAux.h"
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateGBVISoftcoreForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateGBVISoftcoreForces2.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateGBVISoftcoreForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateGBVISoftcoreForces2.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateGBVISoftcoreForces2.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateGBVISoftcoreForces2.h"
void kCalculateGBVISoftcoreForces2( freeEnergyGpuContext freeEnergyGpu )
{
unsigned int threadsPerBlock;
static unsigned int threadsPerBlockPerMethod[3] = { 0, 0, 0 };
static unsigned int natoms[3] = { 0, 0, 0 };
gpuContext gpu = freeEnergyGpu->gpuContext;
unsigned int methodIndex = static_cast<unsigned int>(freeEnergyGpu->freeEnergySim.nonbondedMethod);
if( methodIndex > 2 ){
throw OpenMM::OpenMMException( "kCalculateGBVISoftcoreForces2 method index invalid." );
}
if( natoms[methodIndex] != gpu->natoms ){
unsigned int extra = methodIndex == 0 ? 0 : sizeof(float3);
threadsPerBlockPerMethod[methodIndex] = std::min(getThreadsPerBlockFEP( freeEnergyGpu, (sizeof(Atom) + extra), gpu->sharedMemoryPerBlock ), gpu->sim.nonbond_threads_per_block );
natoms[methodIndex] = gpu->natoms;
}
threadsPerBlock = threadsPerBlockPerMethod[methodIndex];
switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
{
case FREE_ENERGY_NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreN2ByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
else
kCalculateGBVISoftcoreN2Forces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
sizeof(Atom)*threadsPerBlock>>>(gpu->sim.pWorkUnit);
break;
case FREE_ENERGY_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcoreCutoffByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else
kCalculateGBVISoftcoreCutoffForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
break;
case FREE_ENERGY_PERIODIC:
if (gpu->bOutputBufferPerWarp)
kCalculateGBVISoftcorePeriodicByWarpForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
else
kCalculateGBVISoftcorePeriodicForces2_kernel<<<gpu->sim.bornForce2_blocks, threadsPerBlock,
(sizeof(Atom)+sizeof(float3))*threadsPerBlock>>>(gpu->sim.pInteractingWorkUnit );
break;
}
LAUNCHERROR("kCalculateGBVISoftcoreForces2");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "kCalculateGBVIAux.h"
/**
* This file contains the kernel for evaluating the second stage of GB/VI. It is included
* several times in kCalculateGBVIForces2.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_BORNFORCE2_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_BORNFORCE2_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateGBVISoftcore, Forces2_kernel)(unsigned int* workUnit )
{
extern __shared__ Atom sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[blockDim.x];
#endif
unsigned int lasty = -0xFFFFFFFF;
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i];
float4 ar = cSim.pGBVIData[i];
float fb = cSim.pBornForce[i];
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
sA[threadIdx.x].fx = 0.0f;
sA[threadIdx.x].fy = 0.0f;
sA[threadIdx.x].fz = 0.0f;
float3 af;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].r = ar.x;
sA[threadIdx.x].sr = ar.y;
sA[threadIdx.x].bornRadiusScaleFactor = ar.w;
sA[threadIdx.x].fb = fb;
for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrtf(r2);
// Atom I Born forces and sum
float dE = psA[j].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[j].sr, fb );
#if defined USE_CUTOFF
if (i >= cSim.atoms || x+j >= cSim.atoms || (i == (x+j)) || r2 > cSim.nonbondedCutoffSqr)
#else
if(i >= cSim.atoms || x+j >= cSim.atoms || (i == (x+j)) )
#endif
{
dE = 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[j].fx += d;
d = dy * dE;
af.y -= d;
psA[j].fy += d;
d = dz * dE;
af.z -= d;
psA[j].fz += d;
}
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4[offset];
of.x += af.x + sA[threadIdx.x].fx;
of.y += af.y + sA[threadIdx.x].fy;
of.z += af.z + sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
of = cSim.pForce4[offset];
of.x += af.x + sA[threadIdx.x].fx;
of.y += af.y + sA[threadIdx.x].fy;
of.z += af.z + sA[threadIdx.x].fz;
of.w = 0.0f;
cSim.pForce4[offset] = of;
#endif
}
else
{
// Read fixed atom data into registers and GRF
if (lasty != y)
{
unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j];
float4 temp1 = cSim.pGBVIData[j];
float fb = cSim.pBornForce[j];
sA[threadIdx.x].fb = fb;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].bornRadiusScaleFactor = temp1.w;
}
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
//else if (flags == 0xFFFFFFFF)
else if (flags)
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
float dx = psA[tj].x - apos.x;
float dy = psA[tj].y - apos.y;
float dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrtf(r2);
float dE = psA[tj].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[tj].sr, fb );
#if defined USE_CUTOFF
if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || y+tj >= cSim.atoms )
#endif
{
dE = 0.0f;
}
float d = dx * dE;
af.x -= d;
psA[tj].fx += d;
d = dy * dE;
af.y -= d;
psA[tj].fy += d;
d = dz * dE;
af.z -= d;
psA[tj].fz += d;
// Atom J Born sum term
dE = ar.w*getGBVI_dE2( r, psA[tj].r, ar.y, psA[tj].fb );
#if defined USE_CUTOFF
if (i >= cSim.atoms || y+tj >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || y+tj >= cSim.atoms )
#endif
{
dE = 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
tj = (tj + 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
float dx = psA[j].x - apos.x;
float dy = psA[j].y - apos.y;
float dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
float r2 = dx * dx + dy * dy + dz * dz;
float r = sqrtf(r2);
// Interleaved Atom I and J Born Forces and sum components
float dE = psA[j].bornRadiusScaleFactor*getGBVI_dE2( r, ar.x, psA[j].sr, fb );
#if defined USE_CUTOFF
if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || y+j >= cSim.atoms )
#endif
{
dE = 0.0f;
}
float d = dx * dE;
af.x -= d;
tempBuffer[threadIdx.x].x = d;
d = dy * dE;
af.y -= d;
tempBuffer[threadIdx.x].y = d;
d = dz * dE;
af.z -= d;
tempBuffer[threadIdx.x].z = d;
// Atom J Born sum term
dE = ar.w*getGBVI_dE2( r, psA[j].r, ar.y, psA[j].fb );
#if defined USE_CUTOFF
if (i >= cSim.atoms || y+j >= cSim.atoms || r2 > cSim.nonbondedCutoffSqr)
#else
if (i >= cSim.atoms || y+j >= cSim.atoms )
#endif
{
dE = 0.0f;
}
dx *= dE;
dy *= dE;
dz *= dE;
tempBuffer[threadIdx.x].x += dx;
tempBuffer[threadIdx.x].y += dy;
tempBuffer[threadIdx.x].z += dz;
af.x -= dx;
af.y -= dy;
af.z -= dz;
// Sum the forces on atom j.
if (tgx % 2 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
}
if (tgx % 4 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
}
if (tgx % 8 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
}
if (tgx % 16 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
}
if (tgx == 0)
{
psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
}
}
}
}
#endif
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
cSim.pForce4[offset] = of;
offset = y + tgx + warp*cSim.stride;
of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
of.w = 0.0f;
cSim.pForce4[offset] = of;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
#endif
}
lasty = y;
pos++;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "GpuFreeEnergyCudaKernels.h"
#include "freeEnergyGpuTypes.h"
#include <cudatypes.h>
#include "kSoftcoreLJ.h"
#define PARAMETER_PRINT 0
#define MAX_PARAMETER_PRINT 10
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaFreeEnergyGmxSimulation feSim;
void SetCalculateLocalSoftcoreGpuSim( freeEnergyGpuContext gpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &gpu->gpuContext->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetCalculateLocalSoftcoreGpuSim copy to cSim failed");
status = cudaMemcpyToSymbol(feSim, &gpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetCalculateLocalSoftcoreGpuSim copy to feSim failed");
}
extern "C"
void gpuSetLJ14SoftcoreParameters( freeEnergyGpuContext gpu, float epsfac, const std::vector<int>& atom1, const std::vector<int>& atom2,
const std::vector<float>& c6, const std::vector<float>& c12, const std::vector<float>& qProd,
const std::vector<float>& softcoreLJLambdaArray ){
unsigned int LJ14s = atom1.size();
gpu->freeEnergySim.LJ14_count = LJ14s;
gpu->psLJ14ID = new CUDAStream<int4>(LJ14s, 1, "LJ14SoftcoreID");
CUDAStream<int4>* psLJ14ID = gpu->psLJ14ID;
gpu->freeEnergySim.pLJ14ID = psLJ14ID->_pDevData;
gpu->psLJ14Parameter = new CUDAStream<float4>(LJ14s, 1, "LJ14SoftcoreParameter");
CUDAStream<float4>* psLJ14Parameter = gpu->psLJ14Parameter;
gpu->freeEnergySim.pLJ14Parameter = psLJ14Parameter->_pDevData;
std::vector<int> outputBufferCounter( gpu->gpuContext->sim.atoms, 0 );
for( int ii = 0; ii < LJ14s; ii++ ){
(*psLJ14ID)[ii].x = atom1[ii];
(*psLJ14ID)[ii].y = atom2[ii];
(*psLJ14ID)[ii].z = outputBufferCounter[atom1[ii]]++;
(*psLJ14ID)[ii].w = outputBufferCounter[atom2[ii]]++;
float p0, p1, p2, p3;
if( c12[ii] == 0.0f ){
p0 = 0.0f;
p1 = 1.0f;
} else {
p0 = c6[ii] * c6[ii] / c12[ii];
p1 = pow(c12[ii] / c6[ii], 1.0f / 6.0f);
}
p2 = epsfac*qProd[ii];
p3 = softcoreLJLambdaArray[ii];
(*psLJ14Parameter)[ii].x = p0;
(*psLJ14Parameter)[ii].y = p1;
(*psLJ14Parameter)[ii].z = p2;
(*psLJ14Parameter)[ii].w = p3;
}
// logging info
if( gpu->log ){
(void) fprintf( gpu->log, "gpuSetLJ14SoftcoreParameters: number of 1-4 bonds=%5u\n", LJ14s );
#ifdef PARAMETER_PRINT
unsigned int maxPrint = MAX_PARAMETER_PRINT;
for( unsigned int ii = 0; ii < LJ14s; ii++ ){
(void) fprintf( gpu->log, " %5d [%5d %5d %5d %5d] %15.7e %15.7e %15.7e %15.7e\n",
ii, (*psLJ14ID)[ii].x, (*psLJ14ID)[ii].y, (*psLJ14ID)[ii].z, (*psLJ14ID)[ii].w,
(*psLJ14Parameter)[ii].x, (*psLJ14Parameter)[ii].y,
(*psLJ14Parameter)[ii].z/epsfac, (*psLJ14Parameter)[ii].w );
if( ii == maxPrint ){
(void) fprintf( gpu->log, "\n" );
ii = LJ14s - maxPrint;
if( ii < maxPrint )ii = maxPrint;
}
}
(void) fprintf( gpu->log, "\n" );
#endif
(void) fflush( gpu->log );
}
psLJ14ID->Upload();
psLJ14Parameter->Upload();
return;
}
#define DOT3(v1, v2) (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z)
__global__ void kCalculateLocalSoftcoreForces_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
float energy = 0.0f;
if (feSim.nonbondedMethod == NO_CUTOFF)
{
while (pos < feSim.LJ14_count)
{
int4 atom = feSim.pLJ14ID[pos];
float4 LJ14 = feSim.pLJ14Parameter[pos];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrtf(r2);
#ifdef USE_SOFTCORE_LJ
float CDLJ_energy = 0.0f;
float dEdR = getSoftCoreLJ( r2, LJ14.y, LJ14.x, LJ14.w, LJ14.w, &CDLJ_energy );
energy += CDLJ_energy;
#else
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
energy += LJ14.x * (sig6 - 1.0f) * sig6;
#endif
energy += LJ14.z * inverseR;
dEdR += LJ14.z * inverseR;
dEdR *= inverseR * inverseR;
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = cSim.pForce4[offsetA];
float4 forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
pos += blockDim.x * gridDim.x;
}
} else if (feSim.nonbondedMethod == CUTOFF) {
float LJ14_energy;
while (pos < feSim.LJ14_count ){
int4 atom = feSim.pLJ14ID[pos];
float4 LJ14 = feSim.pLJ14Parameter[pos];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrtf(r2);
#ifdef USE_SOFTCORE_LJ
float dEdR = getSoftCoreLJ( r2, LJ14.y, LJ14.x, LJ14.w, LJ14.w, &LJ14_energy);
#else
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
LJ14_energy = LJ14.x * (sig6 - 1.0f) * sig6;
#endif
LJ14_energy += LJ14.z * (inverseR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
dEdR *= inverseR * inverseR;
if (r2 > feSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
LJ14_energy = 0.0f;
}
energy += LJ14_energy;
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = cSim.pForce4[offsetA];
float4 forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
pos += blockDim.x * gridDim.x;
}
} else if (feSim.nonbondedMethod == PERIODIC ){
float LJ14_energy;
while (pos < feSim.LJ14_count ){
int4 atom = feSim.pLJ14ID[pos];
float4 LJ14 = feSim.pLJ14Parameter[pos];
float4 a1 = cSim.pPosq[atom.x];
float4 a2 = cSim.pPosq[atom.y];
float3 d;
d.x = a1.x - a2.x;
d.y = a1.y - a2.y;
d.z = a1.z - a2.z;
d.x -= floorf(d.x/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
d.y -= floorf(d.y/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
d.z -= floorf(d.z/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = DOT3(d, d);
float inverseR = 1.0f / sqrtf(r2);
#ifdef USE_SOFTCORE_LJ
float dEdR = getSoftCoreLJ( r2, LJ14.y, LJ14.x, LJ14.w, LJ14.w, &LJ14_energy);
#else
float sig2 = inverseR * LJ14.y;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
float dEdR = LJ14.x * (12.0f * sig6 - 6.0f) * sig6;
LJ14_energy = LJ14.x * (sig6 - 1.0f) * sig6;
#endif
LJ14_energy += LJ14.z * (inverseR + cSim.reactionFieldK * r2 - cSim.reactionFieldC);
dEdR += LJ14.z * (inverseR - 2.0f * cSim.reactionFieldK * r2);
dEdR *= inverseR * inverseR;
if (r2 > feSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
LJ14_energy = 0.0f;
}
energy += LJ14_energy;
unsigned int offsetA = atom.x + atom.z * cSim.stride;
unsigned int offsetB = atom.y + atom.w * cSim.stride;
float4 forceA = cSim.pForce4[offsetA];
float4 forceB = cSim.pForce4[offsetB];
d.x *= dEdR;
d.y *= dEdR;
d.z *= dEdR;
forceA.x += d.x;
forceA.y += d.y;
forceA.z += d.z;
forceB.x -= d.x;
forceB.y -= d.y;
forceB.z -= d.z;
cSim.pForce4[offsetA] = forceA;
cSim.pForce4[offsetB] = forceB;
pos += blockDim.x * gridDim.x;
}
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy;
}
void kCalculateLocalSoftcoreForces( freeEnergyGpuContext freeEnergyGpuContext )
{
gpuContext gpu = freeEnergyGpuContext->gpuContext;
kCalculateLocalSoftcoreForces_kernel<<<gpu->sim.blocks, gpu->sim.localForces_threads_per_block, gpu->sim.localForces_threads_per_block * sizeof(Vectors)>>>();
LAUNCHERROR("kCalculateLocalSoftcoreForces");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "freeEnergyGpuTypes.h"
#include "GpuFreeEnergyCudaKernels.h"
#include "openmm/OpenMMException.h"
#include <iostream>
#include <sstream>
#define PARAMETER_PRINT 0
#define MAX_PARAMETER_PRINT 10
// device handles
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaFreeEnergyGmxSimulation feSimDev;
// write address of structs to devices
void SetCalculateCDLJSoftcoreGpuSim( freeEnergyGpuContext freeEnergyGpu ){
cudaError_t status;
status = cudaMemcpyToSymbol(cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJSoftcoreGpuSim copy to cSim failed");
status = cudaMemcpyToSymbol( feSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetCalculateCDLJSoftcoreGpuSim copy to feSimDev failed");
}
extern "C"
bool gpuIsAvailableSoftcore()
{
int deviceCount;
cudaGetDeviceCount(&deviceCount);
return (deviceCount > 0);
}
struct Atom {
float x;
float y;
float z;
float q;
float sig;
float eps;
float softCoreLJLambda;
float fx;
float fy;
float fz;
};
// Include versions of the kernels for N^2 calculations with softcore LJ.
#define USE_SOFTCORE_LJ
#ifdef USE_SOFTCORE_LJ
#include "kSoftcoreLJ.h"
#endif
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2SoftcoreLJ##b
#undef USE_OUTPUT_BUFFER_PER_WARP
#include "kCalculateNonbondedSoftcore.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2SoftcoreLJByWarp##b
#include "kCalculateNonbondedSoftcore.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateNonbondedSoftcore.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateNonbondedSoftcore.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateNonbondedSoftcore.h"
#include "kFindInteractingBlocks.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateNonbondedSoftcore.h"
void kCalculateCDLJSoftcoreForces( freeEnergyGpuContext freeEnergyGpu )
{
gpuContext gpu = freeEnergyGpu->gpuContext;
switch (freeEnergyGpu->freeEnergySim.nonbondedMethod)
{
case FREE_ENERGY_NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJSoftcoreN2SoftcoreLJByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else
kCalculateCDLJSoftcoreN2SoftcoreLJForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit );
LAUNCHERROR("kCalculateCDLJSoftcoreN2Forces");
break;
case FREE_ENERGY_CUTOFF:
kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsCutoff");
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJSoftcoreCutoffByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJSoftcoreCutoffForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJSoftcoreCutoffForces");
break;
case FREE_ENERGY_PERIODIC:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateCDLJSoftcorePeriodicByWarpForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateCDLJSoftcorePeriodicForces_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float3))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
LAUNCHERROR("kCalculateCDLJSoftcorePeriodicForces");
break;
default:
throw OpenMM::OpenMMException( "Nonbonded softcore method not recognized." );
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernels for evaluating nonbonded softcore forces. It is included
* several times in kCalculateNonbondedSoftcore.cu with different #defines to generate
* different versions of the kernels.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateCDLJSoftcore, Forces_kernel)(unsigned int* workUnit )
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
float CDLJ_energy;
float energy = 0.0f;
#ifdef USE_CUTOFF
float3* tempBuffer = (float3*) &sA[cSim.nonbond_threads_per_block];
#endif
unsigned int lasty = 0xFFFFFFFF;
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
bool bExclusionFlag = (x & 0x1);
x = (x >> 17) << GRIDBITS;
float4 apos; // Local atom x, y, z, q
float3 af; // Local atom fx, fy, fz
float dx;
float dy;
float dz;
float r2;
float invR;
float sig;
float eps;
float dEdR;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
unsigned int i = x + tgx;
apos = cSim.pPosq[i];
float4 a = feSimDev.pSigEps4[i];
float softCoreLJLambda = a.z;
af.x = 0.0f;
af.y = 0.0f;
af.z = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].q = a.w;
sA[threadIdx.x].sig = a.x;
sA[threadIdx.x].eps = a.y;
sA[threadIdx.x].softCoreLJLambda = a.z;
a.w *= cSim.epsfac;
if (!bExclusionFlag)
{
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrtf(r2);
sig = a.x + psA[j].sig;
eps = a.y * psA[j].eps;
#ifdef USE_SOFTCORE_LJ
dEdR = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJ_energy );
#else
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#endif
#ifdef USE_CUTOFF
dEdR += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
CDLJ_energy += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
#else
dEdR += a.w * psA[j].q * invR;
CDLJ_energy += a.w * psA[j].q * invR;
#endif
dEdR *= invR * invR;
#ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
CDLJ_energy = 0.0f;
}
#endif
energy += 0.5f*CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
}
} else {
unsigned int xi = x>>GRIDBITS;
unsigned int cell = xi+xi*cSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrtf(r2);
sig = a.x + psA[j].sig;
eps = a.y * psA[j].eps;
#ifdef USE_SOFTCORE_LJ
dEdR = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJ_energy );
#else
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#endif
#ifdef USE_CUTOFF
dEdR += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
CDLJ_energy += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
#else
dEdR += a.w * psA[j].q * invR;
CDLJ_energy += a.w * psA[j].q * invR;
#endif
dEdR *= invR * invR;
#ifdef USE_CUTOFF
if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
#else
if (!(excl & 0x1))
#endif
{
dEdR = 0.0f;
CDLJ_energy = 0.0f;
}
energy += 0.5f*CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
excl >>= 1;
}
}
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
cSim.pForce4[offset] = of;
#else
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4[offset] = of;
#endif
} else {
// Read fixed atom data into registers and GRF
if (lasty != y)
{
unsigned int j = y + tgx;
float4 temp = cSim.pPosq[j];
//float2 temp1 = cSim.pAttr[j];
float4 temp1 = feSimDev.pSigEps4[j];
//float temp3 = cSim.pSoftCoreLJLambda[j];
//float temp3 = softCoreLJLambdaArray[j];
float temp3 = temp1.z;
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].q = temp1.w;
sA[threadIdx.x].sig = temp1.x;
sA[threadIdx.x].eps = temp1.y;
sA[threadIdx.x].softCoreLJLambda = temp3;
}
sA[threadIdx.x].fx = 0.0f;
sA[threadIdx.x].fy = 0.0f;
sA[threadIdx.x].fz = 0.0f;
a.w *= cSim.epsfac;
if (!bExclusionFlag)
{
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
else if (flags == 0xFFFFFFFF)
#endif
{
// Compute all interactions within this block.
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrtf(r2);
sig = a.x + psA[tj].sig;
eps = a.y * psA[tj].eps;
#ifdef USE_SOFTCORE_LJ
dEdR = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJ_energy );
#else
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#endif
#ifdef USE_CUTOFF
dEdR += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
CDLJ_energy += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
#else
dEdR += a.w * psA[tj].q * invR;
CDLJ_energy += a.w * psA[tj].q * invR;
#endif
dEdR *= invR * invR;
#ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
CDLJ_energy = 0.0f;
}
#endif
energy += CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
tj = (tj + 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrtf(r2);
sig = a.x + psA[j].sig;
eps = a.y * psA[j].eps;
#ifdef USE_SOFTCORE_LJ
dEdR = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[j].softCoreLJLambda, &CDLJ_energy );
#else
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#endif
#ifdef USE_CUTOFF
dEdR += a.w * psA[j].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
CDLJ_energy += a.w * psA[j].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
#else
dEdR += a.w * psA[j].q * invR;
CDLJ_energy += a.w * psA[j].q * invR;
#endif
dEdR *= invR * invR;
#ifdef USE_CUTOFF
if (r2 > cSim.nonbondedCutoffSqr)
{
dEdR = 0.0f;
CDLJ_energy = 0.0f;
}
#endif
energy += CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
tempBuffer[threadIdx.x].x = dx;
tempBuffer[threadIdx.x].y = dy;
tempBuffer[threadIdx.x].z = dz;
// Sum the forces on atom j.
if (tgx % 2 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+1].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+1].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+1].z;
}
if (tgx % 4 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+2].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+2].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+2].z;
}
if (tgx % 8 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+4].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+4].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+4].z;
}
if (tgx % 16 == 0)
{
tempBuffer[threadIdx.x].x += tempBuffer[threadIdx.x+8].x;
tempBuffer[threadIdx.x].y += tempBuffer[threadIdx.x+8].y;
tempBuffer[threadIdx.x].z += tempBuffer[threadIdx.x+8].z;
}
if (tgx == 0)
{
psA[j].fx += tempBuffer[threadIdx.x].x + tempBuffer[threadIdx.x+16].x;
psA[j].fy += tempBuffer[threadIdx.x].y + tempBuffer[threadIdx.x+16].y;
psA[j].fz += tempBuffer[threadIdx.x].z + tempBuffer[threadIdx.x+16].z;
}
}
}
}
#endif
}
else // bExclusion
{
// Read fixed atom data into registers and GRF
unsigned int xi = x>>GRIDBITS;
unsigned int yi = y>>GRIDBITS;
unsigned int cell = xi+yi*cSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
unsigned int excl = cSim.pExclusion[cSim.pExclusionIndex[cell]+tgx];
excl = (excl >> tgx) | (excl << (GRID - tgx));
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
invR = 1.0f / sqrtf(r2);
sig = a.x + psA[tj].sig;
eps = a.y * psA[tj].eps;
#ifdef USE_SOFTCORE_LJ
dEdR = getSoftCoreLJ( r2, sig, eps, softCoreLJLambda, psA[tj].softCoreLJLambda, &CDLJ_energy );
#else
float sig2 = invR * sig;
sig2 *= sig2;
float sig6 = sig2 * sig2 * sig2;
dEdR = eps * (12.0f * sig6 - 6.0f) * sig6;
CDLJ_energy = eps * (sig6 - 1.0f) * sig6;
#endif
#ifdef USE_CUTOFF
dEdR += a.w * psA[tj].q * (invR - 2.0f * feSimDev.reactionFieldK * r2);
CDLJ_energy += a.w * psA[tj].q * (invR + feSimDev.reactionFieldK * r2 - feSimDev.reactionFieldC);
#else
dEdR += a.w * psA[tj].q * invR;
CDLJ_energy += a.w * psA[tj].q * invR;
#endif
dEdR *= invR * invR;
#ifdef USE_CUTOFF
if (!(excl & 0x1) || r2 > cSim.nonbondedCutoffSqr)
#else
if (!(excl & 0x1))
#endif
{
dEdR = 0.0f;
CDLJ_energy = 0.0f;
}
energy += CDLJ_energy;
dx *= dEdR;
dy *= dEdR;
dz *= dEdR;
af.x -= dx;
af.y -= dy;
af.z -= dz;
psA[tj].fx += dx;
psA[tj].fy += dy;
psA[tj].fz += dz;
excl >>= 1;
tj = (tj + 1) & (GRID - 1);
}
}
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
of = cSim.pForce4[offset];
of.x += af.x;
of.y += af.y;
of.z += af.z;
cSim.pForce4[offset] = of;
offset = y + tgx + warp*cSim.stride;
of = cSim.pForce4[offset];
of.x += sA[threadIdx.x].fx;
of.y += sA[threadIdx.x].fy;
of.z += sA[threadIdx.x].fz;
cSim.pForce4[offset] = of;
#else
of.x = af.x;
of.y = af.y;
of.z = af.z;
of.w = 0.0f;
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pForce4[offset] = of;
of.x = sA[threadIdx.x].fx;
of.y = sA[threadIdx.x].fy;
of.z = sA[threadIdx.x].fz;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pForce4[offset] = of;
#endif
lasty = y;
}
pos++;
}
cSim.pEnergy[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "gputypes.h"
#include "freeEnergyGpuTypes.h"
#include "GpuFreeEnergyCudaKernels.h"
#include "kernels/cudaKernels.h"
#include "openmm/OpenMMException.h"
#include <cuda.h>
#define PARAMETER_PRINT 0
#define MAX_PARAMETER_PRINT 10
struct Atom {
float x;
float y;
float z;
float r;
float sr;
float sum;
float polarScaleData;
};
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaFreeEnergyGmxSimulation gbsaSimDev;
extern "C" void SetCalculateObcGbsaSoftcoreBornSumSim( freeEnergyGpuContext freeEnergyGpu)
{
cudaError_t status;
status = cudaMemcpyToSymbol( cSim, &freeEnergyGpu->gpuContext->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreBornSumSim copy to cSim failed.");
status = cudaMemcpyToSymbol( gbsaSimDev, &freeEnergyGpu->freeEnergySim, sizeof(cudaFreeEnergyGmxSimulation));
RTERROR(status, "cudaMemcpyToSymbol: SetCalculateObcGbsaSoftcoreBornSumSim copy to gbsaSimDev failed.");
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kClearObcGbsaSoftcoreBornSum_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
{
((float*)cSim.pBornSum)[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kClearSoftcoreBornForces_kernel()
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
while (pos < cSim.stride * cSim.nonbondOutputBuffers)
{
((float*)cSim.pBornForce)[pos] = 0.0f;
pos += gridDim.x * blockDim.x;
}
}
void kClearSoftcoreBornForces(gpuContext gpu)
{
// printf("kClearSoftcoreBornForces\n");
kClearSoftcoreBornForces_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
LAUNCHERROR("kClearSoftcoreBornForces");
}
void kClearObcGbsaSoftcoreBornSum(gpuContext gpu)
{
// printf("kClearObcGbsaBornSum\n");
kClearObcGbsaSoftcoreBornSum_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceObcGbsaSoftcoreBornForces_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
float energy = 0.0f;
while (pos < cSim.atoms)
{
float bornRadius = cSim.pBornRadii[pos];
float obcChain = cSim.pObcChain[pos];
float2 obcData = cSim.pObcData[pos];
float nonPolarScaleData = gbsaSimDev.pNonPolarScalingFactors[pos];
float totalForce = 0.0f;
float* pFt = cSim.pBornForce + pos;
int i = cSim.nonbondOutputBuffers;
while (i >= 4)
{
float f1 = *pFt;
pFt += cSim.stride;
float f2 = *pFt;
pFt += cSim.stride;
float f3 = *pFt;
pFt += cSim.stride;
float f4 = *pFt;
pFt += cSim.stride;
totalForce += f1 + f2 + f3 + f4;
i -= 4;
}
if (i >= 2)
{
float f1 = *pFt;
pFt += cSim.stride;
float f2 = *pFt;
pFt += cSim.stride;
totalForce += f1 + f2;
i -= 2;
}
if (i > 0)
{
totalForce += *pFt;
}
float r = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
float ratio6 = pow((obcData.x + cSim.dielectricOffset) / bornRadius, 6.0f);
float saTerm = nonPolarScaleData*cSim.surfaceAreaFactor * r * r * ratio6;
totalForce += saTerm / bornRadius; // 1.102 == Temp mysterious fudge factor, FIX FIX FIX
energy += saTerm;
totalForce *= bornRadius * bornRadius * obcChain;
pFt = cSim.pBornForce + pos;
*pFt = totalForce;
pos += gridDim.x * blockDim.x;
}
// correct for surface area factor of -6
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
}
void kReduceObcGbsaSoftcoreBornForces( gpuContext gpu ){
kReduceObcGbsaSoftcoreBornForces_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
LAUNCHERROR("kReduceObcGbsaSoftcoreBornForces");
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateObcGbsaSoftcoreBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateObcGbsaSoftcoreBornSum.h"
// Include versions of the kernels with cutoffs.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_CUTOFF
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateObcGbsaSoftcoreBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateObcGbsaSoftcoreBornSum.h"
// Include versions of the kernels with periodic boundary conditions.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define USE_PERIODIC
#define METHOD_NAME(a, b) a##Periodic##b
#include "kCalculateObcGbsaSoftcoreBornSum.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##PeriodicByWarp##b
#include "kCalculateObcGbsaSoftcoreBornSum.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceObcGbsaSoftcoreBornSum_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
while (pos < cSim.atoms)
{
float sum = 0.0f;
float* pSt = cSim.pBornSum + pos;
float2 atom = cSim.pObcData[pos];
// Get summed Born data
for( int i = 0; i < cSim.nonbondOutputBuffers; i++ ){
sum += *pSt;
pSt += cSim.stride;
}
// Now calculate Born radius and OBC term.
sum *= 0.5f * atom.x;
float sum2 = sum * sum;
float sum3 = sum * sum2;
float tanhSum = tanh(cSim.alphaOBC * sum - cSim.betaOBC * sum2 + cSim.gammaOBC * sum3);
float nonOffsetRadii = atom.x + cSim.dielectricOffset;
float bornRadius = 1.0f / (1.0f / atom.x - tanhSum / nonOffsetRadii);
float obcChain = atom.x * (cSim.alphaOBC - 2.0f * cSim.betaOBC * sum + 3.0f * cSim.gammaOBC * sum2);
obcChain = (1.0f - tanhSum * tanhSum) * obcChain / nonOffsetRadii;
cSim.pBornRadii[pos] = bornRadius;
cSim.pObcChain[pos] = obcChain;
pos += gridDim.x * blockDim.x;
}
}
void kReduceObcGbsaSoftcoreBornSum(gpuContext gpu)
{
// printf("kReduceObcGbsaSoftcoreBornSum\n");
kReduceObcGbsaSoftcoreBornSum_kernel<<<gpu->sim.blocks, gpu->sim.threads_per_block>>>();
gpu->bRecalculateBornRadii = false;
LAUNCHERROR("kReduceObcGbsaSoftcoreBornSum");
}
/**
* Initialize parameters for Cuda Obc softcore
*
* @param freeEnergyGpu freeEnergyGpu context
* @param innerDielectric solute dielectric
* @param solventDielectric solvent dielectric
* @param radius intrinsic Born radii
* @param scale Obc scaling factors
* @param charge atomic charges (possibly overwritten by other methods?)
* @param nonPolarScalingFactors non-polar scaling factors
*
*/
extern "C"
void gpuSetObcSoftcoreParameters( freeEnergyGpuContext freeEnergyGpu, float innerDielectric, float solventDielectric, float nonPolarPrefactor,
const std::vector<float>& radius, const std::vector<float>& scale,
const std::vector<float>& charge, const std::vector<float>& nonPolarScalingFactors)
{
// ---------------------------------------------------------------------------------------
static const float dielectricOffset = 0.009f;
static const float electricConstant = -166.02691f;
static const std::string methodName = "gpuSetObcSoftcoreParameters";
// ---------------------------------------------------------------------------------------
unsigned int numberOfParticles = radius.size();
gpuContext gpu = freeEnergyGpu->gpuContext;
// initialize parameters
freeEnergyGpu->psNonPolarScalingFactors = new CUDAStream<float>( gpu->sim.paddedNumberOfAtoms, 1, "ObcSoftcoreNonPolarScaling");
freeEnergyGpu->freeEnergySim.pNonPolarScalingFactors = freeEnergyGpu->psNonPolarScalingFactors->_pDevData;
gpu->sim.surfaceAreaFactor = -6.0f*PI*4.0f*nonPolarPrefactor;
gpu->sim.preFactor = 2.0f*electricConstant*((1.0f/innerDielectric)-(1.0f/solventDielectric))*gpu->sim.forceConversionFactor;
for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){
(*gpu->psObcData)[ii].x = radius[ii] - dielectricOffset;
(*gpu->psObcData)[ii].y = scale[ii] * (*gpu->psObcData)[ii].x;
(*gpu->psPosq4)[ii].w = charge[ii];
(*gpu->psBornRadii)[ii] = 0.0f;
(*freeEnergyGpu->psNonPolarScalingFactors)[ii] = nonPolarScalingFactors[ii];
}
// diagnostics
if( freeEnergyGpu->log ){
(void) fprintf( freeEnergyGpu->log, "%s %u %u\n", methodName.c_str(), gpu->natoms, gpu->sim.paddedNumberOfAtoms );
(void) fprintf( freeEnergyGpu->log, "surfaceAreaFactor=%15.7e preFactor=%15.7e\n", gpu->sim.surfaceAreaFactor, gpu->sim.preFactor);
#ifdef PARAMETER_PRINT
int maxPrint = MAX_PARAMETER_PRINT;
for( unsigned int ii = 0; ii < numberOfParticles; ii++ ){
(void) fprintf( freeEnergyGpu->log, "%6u %13.6e %13.6e %8.3f %8.3f\n", ii,
(*gpu->psObcData)[ii].x, (*gpu->psObcData)[ii].y, (*gpu->psPosq4)[ii].w, (*freeEnergyGpu->psNonPolarScalingFactors)[ii] );
if( ii == maxPrint ){
ii = numberOfParticles - maxPrint;
if( ii < maxPrint )ii = maxPrint;
}
}
#endif
}
// dummy out extra atom data
for (unsigned int ii = gpu->natoms; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
(*gpu->psObcData)[ii].x = 0.01f;
(*gpu->psObcData)[ii].y = 0.01f;
(*freeEnergyGpu->psNonPolarScalingFactors)[ii] = 0.0f;
(*gpu->psBornRadii)[ii] = 0.0f;
}
// load data to board
gpu->psObcData->Upload();
gpu->psPosq4->Upload();
gpu->psBornRadii->Upload();
freeEnergyGpu->psNonPolarScalingFactors->Upload();
return;
}
void kPrintObcGbsaSoftcore( freeEnergyGpuContext freeEnergyGpu, std::string callId, int call, FILE* log){
gpuContext gpu = freeEnergyGpu->gpuContext;
int maxPrint = gpu->natoms;
(void) fprintf( log, "kPrintObcGbsaSoftcore %s %d\n", callId.c_str(), call );
gpu->psObcData->Download();
gpu->psBornRadii->Download();
gpu->psBornForce->Download();
gpu->psPosq4->Download();
freeEnergyGpu->psNonPolarScalingFactors->Download();
CUDAStream<float4>* sigEps4 = freeEnergyGpu->psSigEps4;
sigEps4->Download();
(void) fprintf( log, "BornSum Born radii & params\n" );
for( int ii = 0; ii < gpu->sim.paddedNumberOfAtoms; ii++ ){
//(void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e %15.7e] [%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e swDrv=%3.1f x[%8.3f %8.3f %8.3f %15.7f]\n",
(void) fprintf( log, "%6d prm[%15.7e %15.7e %15.7e] sig/eps4[%15.7e %15.7e %15.7e %15.7e] bR=%15.7e bF=%15.7e\n",
ii,
gpu->psObcData->_pSysData[ii].x,
gpu->psObcData->_pSysData[ii].y,
freeEnergyGpu->psNonPolarScalingFactors->_pSysData[ii],
sigEps4->_pSysData[ii].x,
sigEps4->_pSysData[ii].y,
sigEps4->_pSysData[ii].z,
sigEps4->_pSysData[ii].w,
gpu->psBornRadii->_pSysData[ii],
gpu->psBornForce->_pSysData[ii]
/*
gpu->psPosq4->_pSysData[ii].x,
gpu->psPosq4->_pSysData[ii].y,
gpu->psPosq4->_pSysData[ii].z,
gpu->psPosq4->_pSysData[ii].w );
*/
);
if( (ii == maxPrint) && ( ii < (gpu->natoms - maxPrint)) ){
ii = gpu->natoms - maxPrint;
}
}
}
extern __global__ void kFindBlockBoundsCutoff_kernel();
extern __global__ void kFindBlockBoundsPeriodic_kernel();
extern __global__ void kFindBlocksWithInteractionsCutoff_kernel();
extern __global__ void kFindBlocksWithInteractionsPeriodic_kernel();
extern __global__ void kFindInteractionsWithinBlocksCutoff_kernel(unsigned int*);
extern __global__ void kFindInteractionsWithinBlocksPeriodic_kernel(unsigned int*);
void kCalculateObcGbsaSoftcoreBornSum( freeEnergyGpuContext freeEnergyGpu )
{
// printf("kCalculateObcGbsaSoftcoreBornSum\n");
gpuContext gpu = freeEnergyGpu->gpuContext;
kClearObcGbsaSoftcoreBornSum(gpu);
LAUNCHERROR("kClearBornSum from kCalculateObcGbsaSoftcoreBornSum");
switch ( freeEnergyGpu->freeEnergySim.nonbondedMethod )
{
case FREE_ENERGY_NO_CUTOFF:
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaSoftcoreN2ByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
else
kCalculateObcGbsaSoftcoreN2BornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(Atom)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pWorkUnit);
break;
case FREE_ENERGY_CUTOFF:
kFindBlockBoundsCutoff_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsCutoff");
kFindBlocksWithInteractionsCutoff_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsCutoff");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksCutoff_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaSoftcoreCutoffByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateObcGbsaSoftcoreCutoffBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break;
case FREE_ENERGY_PERIODIC:
kFindBlockBoundsPeriodic_kernel<<<(gpu->psGridBoundingBox->_length+63)/64, 64>>>();
LAUNCHERROR("kFindBlockBoundsPeriodic");
kFindBlocksWithInteractionsPeriodic_kernel<<<gpu->sim.interaction_blocks, gpu->sim.interaction_threads_per_block>>>();
LAUNCHERROR("kFindBlocksWithInteractionsPeriodic");
compactStream(gpu->compactPlan, gpu->sim.pInteractingWorkUnit, gpu->sim.pWorkUnit, gpu->sim.pInteractionFlag, gpu->sim.workUnits, gpu->sim.pInteractionCount);
kFindInteractionsWithinBlocksPeriodic_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
sizeof(unsigned int)*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
if (gpu->bOutputBufferPerWarp)
kCalculateObcGbsaSoftcorePeriodicByWarpBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
else
kCalculateObcGbsaSoftcorePeriodicBornSum_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.nonbond_threads_per_block,
(sizeof(Atom)+sizeof(float))*gpu->sim.nonbond_threads_per_block>>>(gpu->sim.pInteractingWorkUnit);
break;
default:
throw OpenMM::OpenMMException( "Nonbonded softcore method not recognized." );
}
LAUNCHERROR("kCalculateObcGbsaSoftcoreBornSum");
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
/**
* This file contains the kernel for calculating Born sums. It is included
* several times in kCalculateObcGbsaBornSum.cu with different #defines to generate
* different versions of the kernels.
*/
#undef TARGET
//#define TARGET 1
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateObcGbsaSoftcore, BornSum_kernel)(unsigned int* workUnit)
{
extern __shared__ Atom sA[];
unsigned int totalWarps = cSim.nonbond_blocks*cSim.nonbond_threads_per_block/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
#ifdef USE_CUTOFF
float* tempBuffer = (float*) &sA[cSim.nonbond_threads_per_block];
#endif
while (pos < end)
{
// Extract cell coordinates from appropriate work unit
//unsigned int x = workUnit[pos + (blockIdx.x*numWorkUnits)/gridDim.x];
unsigned int x = workUnit[pos];
unsigned int y = ((x >> 2) & 0x7fff) << GRIDBITS;
x = (x >> 17) << GRIDBITS;
float dx;
float dy;
float dz;
float r2;
float r;
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
Atom* psA = &sA[tbx];
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// Read fixed atom data into registers and GRF
unsigned int i = x + tgx;
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
float2 ar = cSim.pObcData[i]; // Local atom vr, sr
float polarScaleData = gbsaSimDev.pNonPolarScalingFactors[i]; // scale contribution
sA[threadIdx.x].x = apos.x;
sA[threadIdx.x].y = apos.y;
sA[threadIdx.x].z = apos.z;
sA[threadIdx.x].r = ar.x;
sA[threadIdx.x].sr = ar.y;
sA[threadIdx.x].polarScaleData = polarScaleData;
apos.w = 0.0f;
for (unsigned int j = 0; j < GRID; j++)
{
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#if defined USE_CUTOFF
if (i < cSim.atoms && x+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#else
if (i < cSim.atoms && x+j < cSim.atoms )
#endif
{
r = sqrtf(r2);
float rInverse = 1.0f/r;
float rScaledRadiusJ = r + psA[j].sr;
if( (j != tgx) && (ar.x < rScaledRadiusJ) ){
float l_ij = 1.0f / max(ar.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = logf(u_ij / l_ij);
float term = l_ij - u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[j].sr * psA[j].sr * rInverse) *
(l_ij2 - u_ij2);
float rj = psA[j].sr;
if( ar.x < (rj - r) ){
term += 2.0f * ((1.0f / ar.x) - l_ij);
}
apos.w += psA[j].polarScaleData*term;
}
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += apos.w;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
#endif
} else {
// Read fixed atom data into registers and GRF
unsigned int j = y + tgx;
unsigned int i = x + tgx;
float4 temp = cSim.pPosq[j];
float2 temp1 = cSim.pObcData[j];
float polarScaleDataJ = gbsaSimDev.pNonPolarScalingFactors[j]; // scale contribution
float4 apos = cSim.pPosq[i]; // Local atom x, y, z, sum
apos.w = 0.0f;
float2 ar = cSim.pObcData[i]; // Local atom vr, sr
float polarScaleDataI = gbsaSimDev.pNonPolarScalingFactors[i]; // scale contribution
sA[threadIdx.x].x = temp.x;
sA[threadIdx.x].y = temp.y;
sA[threadIdx.x].z = temp.z;
sA[threadIdx.x].r = temp1.x;
sA[threadIdx.x].sr = temp1.y;
sA[threadIdx.x].polarScaleData = polarScaleDataJ;
sA[threadIdx.x].sum = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0)
{
// No interactions in this block.
}
// else if (flags == 0xFFFFFFFF)
else if (flags )
#endif
{
// Compute all interactions within this block.
for( unsigned int j = 0; j < GRID; j++ ){
dx = psA[tj].x - apos.x;
dy = psA[tj].y - apos.y;
dz = psA[tj].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_CUTOFF
if (i < cSim.atoms && y+tj < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#else
if (i < cSim.atoms && y+tj < cSim.atoms )
#endif
{
r = sqrtf(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[tj].sr;
if (ar.x < rScaledRadiusJ)
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[tj].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = logf(u_ij / l_ij);
float term = l_ij - u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[tj].sr * psA[tj].sr * rInverse) *
(l_ij2 - u_ij2);
float srj = psA[tj].sr;
float scale = psA[tj].polarScaleData;
if (ar.x < (srj - r))
{
term += 2.0f * ((1.0f / ar.x) - l_ij);
}
apos.w += (scale*term);
}
float rScaledRadiusI = r + ar.y;
if (psA[tj].r < rScaledRadiusI)
{
float l_ij = 1.0f / max(psA[tj].r, fabs(r - ar.y));
float u_ij = 1.0f / rScaledRadiusI;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = logf(u_ij / l_ij);
float term = l_ij - u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * ar.y * ar.y * rInverse) *
(l_ij2 - u_ij2);
float rj = psA[tj].r;
if (rj < (ar.y - r))
{
term += 2.0f * ((1.0f / psA[tj].r) - l_ij);
}
psA[tj].sum += polarScaleDataI*term;
}
}
tj = (tj - 1) & (GRID - 1);
}
}
#ifdef USE_CUTOFF
else
{
// Compute only a subset of the interactions in this block.
for (unsigned int j = 0; j < GRID; j++)
{
if ((flags&(1<<j)) != 0)
{
tempBuffer[threadIdx.x] = 0.0f;
dx = psA[j].x - apos.x;
dy = psA[j].y - apos.y;
dz = psA[j].z - apos.z;
#ifdef USE_PERIODIC
dx -= floorf(dx/cSim.periodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
dy -= floorf(dy/cSim.periodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
dz -= floorf(dz/cSim.periodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
#endif
r2 = dx * dx + dy * dy + dz * dz;
#ifdef USE_CUTOFF
if (i < cSim.atoms && y+j < cSim.atoms && r2 < cSim.nonbondedCutoffSqr)
#else
if (i < cSim.atoms && y+j < cSim.atoms )
#endif
{
r = sqrtf(r2);
float rInverse = 1.0f / r;
float rScaledRadiusJ = r + psA[j].sr;
if (ar.x < rScaledRadiusJ)
{
float l_ij = 1.0f / max(ar.x, fabs(r - psA[j].sr));
float u_ij = 1.0f / rScaledRadiusJ;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = logf(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * psA[j].sr * psA[j].sr * rInverse) *
(l_ij2 - u_ij2);
float srj = psA[j].sr;
if (ar.x < (srj - r))
{
term += 2.0f * ((1.0f / ar.x) - l_ij);
}
apos.w += psA[j].polarScaleData*term;
}
float rScaledRadiusI = r + ar.y;
if (psA[j].r < rScaledRadiusI)
{
float l_ij = 1.0f / max(psA[j].r, fabs(r - ar.y));
float u_ij = 1.0f / rScaledRadiusI;
float l_ij2 = l_ij * l_ij;
float u_ij2 = u_ij * u_ij;
float ratio = logf(u_ij / l_ij);
float term = l_ij -
u_ij +
0.25f * r * (u_ij2 - l_ij2) +
(0.50f * rInverse * ratio) +
(0.25f * ar.y * ar.y * rInverse) *
(l_ij2 - u_ij2);
float rj = psA[j].r;
if (rj < (ar.y - r))
{
term += 2.0f * ((1.0f / psA[j].r) - l_ij);
}
tempBuffer[threadIdx.x] = polarScaleDataI*term;
}
}
// Sum the terms.
if (tgx % 2 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1];
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+2];
if (tgx % 8 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+4];
if (tgx % 16 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+8];
if (tgx == 0)
psA[j].sum += tempBuffer[threadIdx.x] + tempBuffer[threadIdx.x+16];
}
}
}
#endif
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += apos.w;
offset = y + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += sA[threadIdx.x].sum;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = apos.w;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = sA[threadIdx.x].sum;
#endif
}
pos++;
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment