Commit 25058a77 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Replaced calculation of Born radii using OBC algorithm w/ Grycuk algorithm

TorsionTorsion grids reordered, if needed, so that first angle is the 'slow' index
Fixes for MonteCarloBarostat 
parent e772769e
......@@ -18,6 +18,7 @@
# ----------------------------------------------------------------------------
#SET(CREATE_SERIALIZABLE_OPENMM_AMOEBA OFF CACHE BOOL "Build verison of OpenMMAmoeba w/ backdoor serialization capability")
#SET(CREATE_SERIALIZABLE_OPENMM_AMOEBA TRUE )
SET(CREATE_SERIALIZABLE_OPENMM_AMOEBA FALSE )
# ----------------------------------------------------------------------------
......
......@@ -41,6 +41,7 @@
namespace OpenMM {
typedef std::vector< std::vector< std::vector<double> > > TorsionTorsionGrid;
typedef std::vector< std::vector< std::vector<float> > > TorsionTorsionGridFloat;
/**
* This class implements the Amoeba torsion-torsion interaction
......@@ -198,6 +199,13 @@ public:
}
}
}
/*
for( unsigned int kk = 0; kk < grid.size(); kk++ ){
for( unsigned int jj = 0; jj < grid[kk].size(); jj++ ){
fprintf( stderr, "xGrid %4d %4d %12.3f %12.3f %15.7e %15.7e\n", kk, jj, grid[kk][jj][0], grid[kk][jj][1], grid[kk][jj][2], grid[kk][jj][2]/4.184 );
}
}
*/
_startValues[0] = _grid[0][0][0];
_startValues[1] = _grid[0][0][1];
......
......@@ -60,6 +60,8 @@ public:
return std::map<std::string, double>(); // This force field doesn't define any parameters.
}
std::vector<std::string> getKernelNames();
static void reorderGrid( const TorsionTorsionGrid& grid, TorsionTorsionGrid& reorderedGrid );
private:
AmoebaTorsionTorsionForce& owner;
Kernel kernel;
......
......@@ -54,6 +54,121 @@ double AmoebaTorsionTorsionForceImpl::calcForcesAndEnergy(ContextImpl& context,
return dynamic_cast<CalcAmoebaTorsionTorsionForceKernel&>(kernel.getImpl()).execute(context, includeForces, includeEnergy);
}
struct IntPair {
unsigned int index1;
unsigned int index2;
};
typedef std::map< double, struct IntPair > Map_Double_IntPair;
typedef Map_Double_IntPair::iterator Map_Double_IntPairI;
typedef Map_Double_IntPair::const_iterator Map_Double_IntPairCI;
typedef std::map< double, Map_Double_IntPair > Map_Double_MapDoubleIntPair;
typedef Map_Double_MapDoubleIntPair::iterator Map_Double_MapDoubleIntPairI;
typedef Map_Double_MapDoubleIntPair::const_iterator Map_Double_MapDoubleIntPairCI;
void AmoebaTorsionTorsionForceImpl::reorderGrid( const TorsionTorsionGrid& grid, TorsionTorsionGrid& reorderedGrid ){
reorderedGrid.resize( grid.size() );
std::vector<Map_Double_IntPair> map_Double_IntPair_Vector( grid.size() );
Map_Double_MapDoubleIntPair mapAngles;
//(void) fprintf( stderr, "AmoebaTorsionTorsionForceImpl::reorder grid\n" );
// (1) set dimensions for reorderd grid
// (2) build map:
// map[angleX][angleY] = <ii, jj> indices
// assume map keys are sorted from least to greatest
for (unsigned int ii = 0; ii < grid.size(); ii++) {
reorderedGrid[ii].resize( grid[ii].size() );
for (unsigned int jj = 0; jj < grid[ii].size(); jj++) {
reorderedGrid[ii][jj].resize( grid[ii][jj].size() );
double angleX = grid[ii][jj][0];
double angleY = grid[ii][jj][1];
if( mapAngles.find( angleX ) == mapAngles.end() ){
if( map_Double_IntPair_Vector[ii].size() > 0 ){
char buffer[1024];
(void) sprintf( buffer, "TorsionTorsion grid reorder: x-angle not set correctly: x=%15.7e y=%15.7e size=%u should be zero; ii/jj indies=%u %u.\n",
angleX, angleY, static_cast<unsigned int>(map_Double_IntPair_Vector[ii].size()), ii, jj );
throw OpenMMException(buffer);
}
mapAngles[angleX] = map_Double_IntPair_Vector[ii];
}
Map_Double_IntPair& map_Double_IntPair = mapAngles[angleX];
if( map_Double_IntPair.find( angleY ) != map_Double_IntPair.end() ){
char buffer[1024];
(void) sprintf( buffer, "TorsionTorsion grid reorder: angle pair found twice: %15.7e %15.7e %u\n", angleX, angleY, static_cast<unsigned int>(map_Double_IntPair.size()) );
throw OpenMMException(buffer);
}
struct IntPair pair;
pair.index1 = ii;
pair.index2 = jj;
map_Double_IntPair[angleY] = pair;
}
}
#if 0
(void) fprintf( stderr, "TorsionTorsion grid reorder map\n" );
for( Map_Double_MapDoubleIntPairCI ii = mapAngles.begin(); ii != mapAngles.end(); ii++ ){
double angleX = ii->first;
Map_Double_IntPair map_Double_IntPair = ii->second;
(void) fprintf( stderr, " %15.7e %u \n", angleX, static_cast<unsigned int>(map_Double_IntPair.size()) );
}
for( Map_Double_MapDoubleIntPairCI ii = mapAngles.begin(); ii != mapAngles.end(); ii++ ){
double angleX = ii->first;
Map_Double_IntPair map_Double_IntPair = ii->second;
for( Map_Double_IntPairCI jj = map_Double_IntPair.begin(); jj != map_Double_IntPair.end(); jj++ ){
double angle = jj->first;
struct IntPair pair = jj->second;
(void) fprintf( stderr, " %15.7e %15.7e %d %d\n", angleX, angle, pair.index1, pair.index2 );
}
}
#endif
// load reordered entries
Map_Double_MapDoubleIntPairCI mapII = mapAngles.begin();
Map_Double_IntPair map_Double_IntPair = mapII->second;
Map_Double_IntPairCI mapJJ = map_Double_IntPair.begin();
for (unsigned int ii = 0; ii < grid.size(); ii++) {
for (unsigned int jj = 0; jj < grid[ii].size(); jj++) {
struct IntPair pair = mapJJ->second;
int index1 = pair.index1;
int index2 = pair.index2;
//(void) fprintf( stderr, " %3d %3d %15.7e %15.7e %3d %3d zzz\n", ii, jj, mapII->first, mapJJ->first, index1, index2 );
for (unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
reorderedGrid[ii][jj][kk] = static_cast<float>(grid[index1][index2][kk]);
}
// increment map iterators
mapJJ++;
if( mapJJ == map_Double_IntPair.end() ){
mapII++;
if( mapII == mapAngles.end() ){
if( (jj != (grid[ii].size()-1)) && (ii != (grid.size()-1)) ){
char buffer[1024];
(void) sprintf( buffer, "AmoebaTorsionTorsionForceImpl::reorderGrid: error detected with map iterators.\n" );
throw OpenMMException(buffer);
}
} else {
map_Double_IntPair = mapII->second;
mapJJ = map_Double_IntPair.begin();
}
}
}
}
return;
}
std::vector<std::string> AmoebaTorsionTorsionForceImpl::getKernelNames() {
std::vector<std::string> names;
names.push_back(CalcAmoebaTorsionTorsionForceKernel::Name());
......
......@@ -37,6 +37,7 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
hasAmoebaBonds = false;
hasAmoebaGeneralizedKirkwood = false;
hasAmoebaMultipole = false;
useGrycuk = true;
amoebaGpu = amoebaGpuInit( cudaPlatformData.gpu );
localForceKernel = NULL;
log = NULL;
......@@ -45,6 +46,10 @@ AmoebaCudaData::AmoebaCudaData( CudaPlatform::PlatformData& data ) : cudaPlatfor
applyMultipoleCutoff = 0;
useVdwNeighborList = 0;
multipoleForceCount = 0;
boxDimensions[0] = 0.0;
boxDimensions[1] = 0.0;
boxDimensions[2] = 0.0;
}
AmoebaCudaData::~AmoebaCudaData() {
......@@ -74,6 +79,10 @@ bool AmoebaCudaData::getHasAmoebaMultipole( void ) const {
return hasAmoebaMultipole;
}
int AmoebaCudaData::getUseGrycuk( void ) const {
return useGrycuk;
}
void AmoebaCudaData::setHasAmoebaGeneralizedKirkwood( bool inputHasAmoebaGeneralizedKirkwood ) {
hasAmoebaGeneralizedKirkwood = inputHasAmoebaGeneralizedKirkwood;
}
......@@ -108,6 +117,7 @@ void AmoebaCudaData::setContextImpl( void* inputContextImpl ) {
}
void AmoebaCudaData::initializeGpu( void ) {
if( !gpuInitialized ){
if( getHasAmoebaGeneralizedKirkwood() && !getHasAmoebaMultipole() ){
throw OpenMMException("GK force requires Multipole force\n");
......@@ -117,16 +127,33 @@ void AmoebaCudaData::initializeGpu( void ) {
amoebaGpuBuildThreadBlockWorkList( amoebaGpu );
amoebaGpuBuildVdwExclusionList( amoebaGpu );
amoebaGpuBuildScalingList( amoebaGpu );
amoebaGpuSetConstants( amoebaGpu );
amoebaGpuSetConstants( amoebaGpu, 0 );
gpuInitialized = true;
boxDimensions[0] = amoebaGpu->gpuContext->sim.periodicBoxSizeX;
boxDimensions[1] = amoebaGpu->gpuContext->sim.periodicBoxSizeY;
boxDimensions[2] = amoebaGpu->gpuContext->sim.periodicBoxSizeZ;
gpuInitialized = true;
if( log ){
gpuPrintCudaAmoebaGmxSimulation( amoebaGpu, getLog() );
(void) fprintf( log, "Gpu initialized\n" );
(void) fflush( log );
}
} else {
if( boxDimensions[0] != amoebaGpu->gpuContext->sim.periodicBoxSizeX ||
boxDimensions[1] != amoebaGpu->gpuContext->sim.periodicBoxSizeY ||
boxDimensions[2] != amoebaGpu->gpuContext->sim.periodicBoxSizeZ ){
amoebaGpuSetConstants( amoebaGpu, 1 );
boxDimensions[0] = amoebaGpu->gpuContext->sim.periodicBoxSizeX;
boxDimensions[1] = amoebaGpu->gpuContext->sim.periodicBoxSizeY;
boxDimensions[2] = amoebaGpu->gpuContext->sim.periodicBoxSizeZ;
}
}
return;
}
......
......@@ -76,6 +76,13 @@ public:
*/
bool getHasAmoebaMultipole( void ) const;
/**
* Get use Grycuk flag
*
* @return value of useGrycuk
*/
int getUseGrycuk( void ) const;
/**
* Set value of hasAmoebaGeneralizedKirkwood
*
......@@ -190,11 +197,13 @@ private:
int multipoleForceCount;
int applyMultipoleCutoff;
int useVdwNeighborList;
int useGrycuk;
KernelImpl* localForceKernel;
unsigned int kernelCount;
void* contextImpl;
FILE* log;
bool gpuInitialized;
double boxDimensions[3];
};
......
......@@ -31,6 +31,7 @@
#include "kernels/amoebaCudaKernels.h"
#include "openmm/internal/AmoebaMultipoleForceImpl.h"
#include "openmm/internal/AmoebaWcaDispersionForceImpl.h"
#include "openmm/internal/AmoebaTorsionTorsionForceImpl.h"
#include "openmm/internal/NonbondedForceImpl.h"
#include "CudaForceInfo.h"
......@@ -738,22 +739,39 @@ void CudaCalcAmoebaTorsionTorsionForceKernel::initialize(const System& system, c
// torsion-torsion grids
numTorsionTorsionGrids = force.getNumTorsionTorsionGrids();
std::vector< std::vector< std::vector< std::vector<float> > > > floatGrids;
std::vector<TorsionTorsionGridFloat> floatGrids;
floatGrids.resize(numTorsionTorsionGrids);
for (int gridIndex = 0; gridIndex < numTorsionTorsionGrids; gridIndex++) {
const TorsionTorsionGrid& grid = force.getTorsionTorsionGrid( gridIndex );
floatGrids[gridIndex].resize( grid.size() );
// check if grid needs to be reordered: x-angle should be 'slow' index
TorsionTorsionGrid reorderedGrid;
int reorder = 0;
if( grid[0][0][0] != grid[0][1][0] ){
AmoebaTorsionTorsionForceImpl::reorderGrid( grid, reorderedGrid );
reorder = 1;
if( data.getLog() ){
(void) fprintf( data.getLog(), "CudaCalcAmoebaTorsionTorsionForceKernel::initialize: reordering torsion-torsion grid %d.\n", gridIndex );
}
}
for (unsigned int ii = 0; ii < grid.size(); ii++) {
floatGrids[gridIndex][ii].resize( grid[ii].size() );
for (unsigned int jj = 0; jj < grid[ii].size(); jj++) {
floatGrids[gridIndex][ii][jj].resize( grid[ii][jj].size() );
for (unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
floatGrids[gridIndex][ii][jj][kk] = static_cast<float>(grid[ii][jj][kk]);
if( reorder ){
for( unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
floatGrids[gridIndex][ii][jj][kk] = static_cast<float>(reorderedGrid[ii][jj][kk]);
}
} else {
for( unsigned int kk = 0; kk < grid[ii][jj].size(); kk++) {
floatGrids[gridIndex][ii][jj][kk] = static_cast<float>(grid[ii][jj][kk]);
}
}
}
}
......@@ -788,12 +806,17 @@ static void computeAmoebaMultipoleForce( AmoebaCudaData& data ) {
data.initializeGpu();
// calculate Born radii
// calculate Born radii using either the Grycuk or OBC algorithm if GK is active
if( data.getHasAmoebaGeneralizedKirkwood() ){
kClearBornSum( gpu->gpuContext );
kCalculateObcGbsaBornSum(gpu->gpuContext);
kReduceObcGbsaBornSum(gpu->gpuContext);
if( data.getUseGrycuk() ){
kCalculateAmoebaGrycukBornRadii( gpu );
kReduceGrycukGbsaBornSum( gpu );
} else {
kCalculateObcGbsaBornSum(gpu->gpuContext);
kReduceObcGbsaBornSum(gpu->gpuContext);
}
}
// multipoles
......@@ -990,23 +1013,27 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
zsize = pmeGridDimension[2];
pmeParametersSetBasedOnEwaldErrorTolerance = 0;
}
gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
if( data.getLog() ){
(void) fprintf( data.getLog(), "AmoebaMultipoleForce: PME parameters tol=%12.3e cutoff=%12.3f alpha=%12.3f [%d %d %d] -",
(void) fprintf( data.getLog(), "AmoebaMultipoleForce: PME parameters tol=%12.3e cutoff=%12.3f alpha=%12.3f [%d %d %d]\n",
force.getEwaldErrorTolerance(), force.getCutoffDistance(), alpha, xsize, ysize, zsize );
if( pmeParametersSetBasedOnEwaldErrorTolerance ){
(void) fprintf( data.getLog(), " parameters set based on error tolerance and OpenMM algorithm.\n" );
(void) fprintf( data.getLog(), "Parameters based on error tolerance and OpenMM algorithm.\n" );
} else {
double alphaT;
int xsizeT, ysizeT, zsizeT;
NonbondedForceImpl::calcPMEParameters(system, nb, alphaT, xsizeT, ysizeT, zsizeT);
double impliedTolerance = alpha*force.getCutoffDistance();
impliedTolerance = 0.5*exp( -(impliedTolerance*impliedTolerance) );
(void) fprintf( data.getLog(), " using input parameters implied tolerance=%12.3e;", impliedTolerance );
(void) fprintf( data.getLog(), "Using input parameters implied tolerance=%12.3e;", impliedTolerance );
(void) fprintf( data.getLog(), "OpenMM param: aEwald=%12.3f [%6d %6d %6d]\n", alphaT, xsizeT, ysizeT, zsizeT);
}
(void) fprintf( data.getLog(), "\n" );
(void) fflush( data.getLog() );
}
gpuSetAmoebaPMEParameters(data.getAmoebaGpu(), (float) alpha, xsize, ysize, zsize);
data.setApplyMultipoleCutoff( 1 );
data.cudaPlatformData.nonbondedMethod = PARTICLE_MESH_EWALD;
......@@ -1067,12 +1094,25 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst
scale[ii] = static_cast<float>( scalingFactor );
charge[ii] = static_cast<float>( particleCharge );
}
gpuSetAmoebaObcParameters( data.getAmoebaGpu(), static_cast<float>(force.getSoluteDielectric() ),
static_cast<float>( force.getSolventDielectric() ),
static_cast<float>( force.getDielectricOffset() ), radius, scale, charge,
force.getIncludeCavityTerm(),
static_cast<float>( force.getProbeRadius() ),
static_cast<float>( force.getSurfaceAreaFactor() ) );
if( data.getUseGrycuk() ){
gpuSetAmoebaGrycukParameters( data.getAmoebaGpu(), static_cast<float>(force.getSoluteDielectric() ),
static_cast<float>( force.getSolventDielectric() ),
static_cast<float>( force.getDielectricOffset() ), radius, scale, charge,
force.getIncludeCavityTerm(),
static_cast<float>( force.getProbeRadius() ),
static_cast<float>( force.getSurfaceAreaFactor() ) );
} else {
gpuSetAmoebaObcParameters( data.getAmoebaGpu(), static_cast<float>(force.getSoluteDielectric() ),
static_cast<float>( force.getSolventDielectric() ),
static_cast<float>( force.getDielectricOffset() ), radius, scale, charge,
force.getIncludeCavityTerm(),
static_cast<float>( force.getProbeRadius() ),
static_cast<float>( force.getSurfaceAreaFactor() ) );
}
data.getAmoebaGpu()->gpuContext->forces.push_back(new ForceInfo(force));
}
......
......@@ -124,9 +124,15 @@ extern void GetCalculateAmoebaKirkwoodEDiffSim( amoebaGpuContext amoebaGpu );
//extern void cudaComputeAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu );
extern void kCalculateAmoebaKirkwoodEDiff( amoebaGpuContext amoebaGpu );
extern void SetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
extern void GetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
extern void cudaComputeAmoebaBornRadii( amoebaGpuContext amoebaGpu );
//extern void SetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
//extern void GetCalculateAmoebaObcGbsaBornSumSim( gpuContext gpu );
//extern void cudaComputeAmoebaBornRadii( amoebaGpuContext amoebaGpu );
extern void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu );
extern void kReduceGrycukGbsaBornSum( amoebaGpuContext gpu );
extern void SetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu );
extern void GetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu );
extern void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu );
// OBC -- Part 1
//extern void SetCalculateObcGbsaForces1Sim(gpuContext gpu);
......
......@@ -288,6 +288,11 @@ void gpuSetAmoebaObcParameters( amoebaGpuContext amoebaGpu , float innerDielectr
const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge,
int includeCavityTerm, float probeRadius, float surfaceAreaFactor);
extern "C"
void gpuSetAmoebaGrycukParameters( amoebaGpuContext amoebaGpu , float innerDielectric, float solventDielectric, float dielectricOffset,
const std::vector<float>& radius, const std::vector<float>& scale, const std::vector<float>& charge,
int includeCavityTerm, float probeRadius, float surfaceAreaFactor);
extern "C"
void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
const std::vector<int>& indexIVs,
......@@ -313,7 +318,7 @@ void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,
const float awater, const float shctd, const float dispoff );
extern "C"
void amoebaGpuSetConstants(amoebaGpuContext gpu);
void amoebaGpuSetConstants(amoebaGpuContext gpu, int updateFlag );
extern "C"
void gpuSetAmoebaBondOffsets(amoebaGpuContext gpu);
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
void SetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaGrycukSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaGrycukSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}
void GetCalculateAmoebaGrycukSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaGrycukSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaGrycukSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
struct GrycukParticle {
float x;
float y;
float z;
float radius;
float scaledRadius;
float bornSum;
};
__device__ void loadGrycukShared( struct GrycukParticle* sA, unsigned int atomI )
{
// coordinates, radii and scaled radii
sA->x = cSim.pPosq[atomI].x;
sA->y = cSim.pPosq[atomI].y;
sA->z = cSim.pPosq[atomI].z;
sA->radius = cSim.pObcData[atomI].x;
sA->scaledRadius = cSim.pObcData[atomI].y;
}
__device__ void calculateGrycukBornRadiiPairIxn_kernel( GrycukParticle& atomI, GrycukParticle& atomJ, float* bornSum ){
/*
* radius: radius (TINKER rsolv)
* scaledRadius: radius*overlap scale factor (TINKER rsolv*shct)
*
*/
float xr,yr,zr;
float r,r2;
float sk, sk2;
float lik, uik;
float lik3, uik3;
float l2, l4, lr, l4r;
float u2, u4, ur, u4r;
float term;
// decide whether to compute the current interaction;
*bornSum = 0.0f;
if( atomI.radius <= 0.0f ){
return;
}
xr = atomJ.x - atomI.x;
yr = atomJ.y - atomI.y;
zr = atomJ.z - atomI.z;
r2 = xr*xr + yr*yr + zr*zr;
r = sqrt(r2);
sk = atomJ.scaledRadius;
sk2 = sk*sk;
if( (atomI.radius + r) < sk ){
lik = atomI.radius;
uik = sk - r;
lik3 = lik*lik*lik;
uik3 = uik*uik*uik;
*bornSum -= (1.0f/uik3 - 1.0f/lik3);
}
uik = r + sk;
if( (atomI.radius + r) < sk ){
lik = sk - r;
} else if( r < (atomI.radius + sk) ){
lik = atomI.radius;
} else {
lik = r - sk;
}
l2 = lik*lik;
l4 = l2*l2;
lr = lik*r;
l4r = l4*r;
u2 = uik*uik;
u4 = u2*u2;
ur = uik*r;
u4r = u4*r;
term = (3.0f*(r2-sk2)+6.0f*u2-8.0f*ur)/u4r - (3.0f*(r2-sk2)+6.0f*l2-8.0f*lr)/l4r;
*bornSum += term/16.0f;
}
__device__ void zeroGrycukParticleSharedField( struct GrycukParticle* sA )
{
sA->bornSum = 0.0f;
}
__global__
__launch_bounds__(384, 1)
void kReduceGrycukGbsaBornSum_kernel()
{
unsigned int pos = (blockIdx.x * blockDim.x + threadIdx.x);
while (pos < cSim.atoms)
{
float sum = 0.0f;
float* pSt = cSim.pBornSum + pos;
// Get summed Born data
for (int i = 0; i < cSim.nonbondOutputBuffers; i++)
{
sum += *pSt;
pSt += cSim.stride;
}
// Now calculate Born radius
float radius = cSim.pObcData[pos].x;
radius = 1.0f/(radius*radius*radius);
sum = radius - sum;
sum = sum <= 0.0f ? 1000.0f : pow( sum, -1.0f/3.0f );
cSim.pBornRadii[pos] = sum;
pos += gridDim.x * blockDim.x;
}
}
/**---------------------------------------------------------------------------------------
Reduce Born radii
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
void kReduceGrycukGbsaBornSum( amoebaGpuContext amoebaGpu )
{
kReduceGrycukGbsaBornSum_kernel<<<amoebaGpu->gpuContext->sim.blocks, 384>>>();
LAUNCHERROR("kReduceGrycukGbsaBornSum");
if( 1 ){
static int callId = 0;
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( callId++ );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloatArray( gpu->natoms, 1, gpu->psBornRadii, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "BornRGry", fileId, outputVector );
}
}
// Include versions of the kernels for N^2 calculations.
#undef USE_OUTPUT_BUFFER_PER_WARP
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateAmoebaCudaGrycukBornRadii.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaGrycukBornRadii.h"
/**---------------------------------------------------------------------------------------
Compute Born radii using Grycuk algorithm
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
void kCalculateAmoebaGrycukBornRadii( amoebaGpuContext amoebaGpu )
{
#ifdef AMOEBA_DEBUG
static const char* methodName = "kCalculateAmoebaGrycukBornRadii";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
amoebaGpu->scalingDistanceCutoff );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 0;
gpu->psBornRadii->Download();
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Grycuk input\n" ); (void) fflush( amoebaGpu->log );
for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
gpu->psBornRadii->_pSysData[ii] );
}
}
#endif
// on first pass, set threads/block and based on that setting the energy buffer array
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
//maxThreads = 384;
maxThreads = 512;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukParticle), gpu->sharedMemoryPerBlock ), maxThreads);
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
sizeof(GrycukParticle), sizeof(GrycukParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycukN2Forces%swarp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
(gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(GrycukParticle), sizeof(GrycukParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaGrycukBornRadiiN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
} else {
kCalculateAmoebaGrycukBornRadiiN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData);
}
LAUNCHERROR("kCalculateAmoebaCudaGrycukN2Forces");
// ---------------------------------------------------------------------------------------
}
// Born radius chain rule component for Grycuk
struct GrycukChainRuleParticle {
float x;
float y;
float z;
float radius;
float scaledRadius;
float bornRadius;
float bornForce;
float force[3];
};
__device__ void loadGrycukChainRuleParticleShared( struct GrycukChainRuleParticle* sA, unsigned int atomI )
{
// coordinates, radii and scaled radii
sA->x = cSim.pPosq[atomI].x;
sA->y = cSim.pPosq[atomI].y;
sA->z = cSim.pPosq[atomI].z;
sA->radius = cSim.pObcData[atomI].x;
sA->scaledRadius = cSim.pObcData[atomI].y;
sA->bornRadius = cSim.pBornRadii[atomI];
sA->bornForce = cSim.pBornForce[atomI];
}
__device__ void zeroGrycukChainRuleParticleSharedField( struct GrycukChainRuleParticle* sA )
{
// zero force
sA->force[0] = 0.0f;
sA->force[1] = 0.0f;
sA->force[2] = 0.0f;
}
//#define AMOEBA_DEBUG
__device__ void calculateGrycukChainRulePairIxn_kernel( GrycukChainRuleParticle& atomI, GrycukChainRuleParticle& atomJ, float force[3]
#ifdef AMOEBA_DEBUG
, float4 pullDebug[5]
#endif
){
const float pi = 3.1415926535897f;
float third = 1.0f/3.0f;
float pi43 = 4.0f*third*pi;
float lik, uik;
float lik4, uik4;
float factor = -pow(pi,third)*pow(6.0f,(2.0f*third))/9.0f;
float term = pi43/(atomI.bornRadius*atomI.bornRadius*atomI.bornRadius);
term = factor/pow( term, (4.0f*third) );
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z;
float sk = atomJ.scaledRadius;
float sk2 = sk*sk;
float r2 = xr*xr + yr*yr + zr*zr;
float r = sqrt(r2);
float de = 0.0f;
if( (atomI.radius + r) < sk ){
float uik4;
uik = sk - r;
uik4 = uik*uik;
uik4 = uik4*uik4;
de = -4.0f*pi/uik4;
}
if( (atomI.radius + r) < sk){
lik = sk - r;
lik4 = lik*lik;
lik4 = lik4*lik4;
de += 0.25f*pi*(sk2-4.0f*sk*r+17.0f*r2)/ (r2*lik4);
} else if( r < (atomI.radius +sk) ){
lik = atomI.radius;
lik4 = lik*lik;
lik4 = lik4*lik4;
de += 0.25f*pi*(2.0f*atomI.radius*atomI.radius-sk2-r2)/ (r2*lik4);
} else {
lik = r - sk;
lik4 = lik*lik;
lik4 = lik4*lik4;
de += 0.25f*pi*(sk2-4.0f*sk*r+r2)/ (r2*lik4);
}
uik = r + sk;
uik4 = uik*uik;
uik4 = uik4*uik4;
de -= 0.25f*pi*(sk2+4.0f*sk*r+r2)/ (r2*uik4);
float dbr = term * de/r;
de = dbr*atomI.bornForce;
#ifdef AMOEBA_DEBUG
pullDebug[0].x = de;
pullDebug[0].y = r;
pullDebug[0].z = factor;
pullDebug[0].w = -4.0f;
pullDebug[1].x = atomI.bornForce/4.184f;
pullDebug[1].y = atomI.bornRadius;
pullDebug[1].z = atomJ.bornForce/4.184f;
pullDebug[1].w = -5.0f;
#endif
force[0] = xr*de;
force[1] = yr*de;
force[2] = zr*de;
}
// Include versions of the kernels for N^2 calculations.
#undef METHOD_NAME
#undef USE_OUTPUT_BUFFER_PER_WARP
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateAmoebaCudaGrycukChainRule.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaGrycukChainRule.h"
/**---------------------------------------------------------------------------------------
Compute Grycuk chain rule contribution to force
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
void kCalculateGrycukGbsaForces2( amoebaGpuContext amoebaGpu )
{
#ifdef AMOEBA_DEBUG
static const char* methodName = "kCalculateGrycukGbsaForces2";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d ZZZ\n",
methodName, gpu->natoms, amoebaGpu->maxCovalentDegreeSz );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(20*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysData, 0, sizeof( float )*4*20*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 0;
gpu->psBornRadii->Download();
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Grycuk input\n" ); (void) fflush( amoebaGpu->log );
for( int ii = 0; ii < amoebaGpu->gpuContext->sim.paddedNumberOfAtoms; ii++ ){
(void) fprintf( amoebaGpu->log,"Born %6d %16.9e\n", ii,
gpu->psBornRadii->_pSysData[ii] );
}
}
#endif
// on first pass, set threads/block and based on that setting the energy buffer array
static unsigned int threadsPerBlock = 0;
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
//maxThreads = 384;
maxThreads = 512;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(GrycukChainRuleParticle), gpu->sharedMemoryPerBlock ), maxThreads);
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycuk: blcks=%u tds=%u %u bPrWrp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, maxThreads, gpu->bOutputBufferPerWarp,
sizeof(GrycukChainRuleParticle), sizeof(GrycukChainRuleParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaGrycukN2Forces%swarp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
(gpu->bOutputBufferPerWarp ? " " : " no "), gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(GrycukChainRuleParticle), sizeof(GrycukChainRuleParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
}
#endif
//kClearForces( gpu );
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaGrycukChainRuleN2ByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG
,debugArray->_pDevData, targetAtom
#endif
);
} else {
kCalculateAmoebaGrycukChainRuleN2_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(GrycukChainRuleParticle)*threadsPerBlock>>>( gpu->psWorkUnit->_pDevData
#ifdef AMOEBA_DEBUG
,debugArray->_pDevData, targetAtom
#endif
);
}
LAUNCHERROR("kCalculateAmoebaCudaGrycukN2Forces");
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
debugArray->Download();
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%5d %5d DebugGrycukChain\n", targetAtom, jj );
for( int kk = 0; kk < 7; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysData[debugIndex].x, debugArray->_pSysData[debugIndex].y,
debugArray->_pSysData[debugIndex].z, debugArray->_pSysData[debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
}
#endif
if( 0 ){
static int callId = 0;
gpuContext gpu = amoebaGpu->gpuContext;
std::vector<int> fileId;
fileId.push_back( callId++ );
VectorOfDoubleVectors outputVector;
//cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
//cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psLabFrameDipole, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
CUDAStream<float>* temp = new CUDAStream<float>(3*gpu->sim.paddedNumberOfAtoms, 1, "Temp1");
reduceAndCopyCUDAStreamFloat4( gpu->psForce4, temp, 1.0 );
cudaLoadCudaFloatArray( gpu->natoms, 3, temp, outputVector, gpu->psAtomIndex->_pSysData, 1.0f/4.184f );
cudaLoadCudaFloatArray( gpu->natoms, 1, gpu->psBornForce, outputVector, gpu->psAtomIndex->_pSysData, 1.0f/4.184f );
cudaLoadCudaFloatArray( gpu->natoms, 1, gpu->psBornRadii, outputVector, gpu->psAtomIndex->_pSysData, 1.0f );
cudaWriteVectorOfDoubleVectorsToFile( "GryF", fileId, outputVector );
delete temp;
exit(0);
}
// ---------------------------------------------------------------------------------------
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaScaleFactors.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaGrycukBornRadii, _kernel)( unsigned int* workUnit ){
extern __shared__ GrycukParticle sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF;
while (pos < end)
{
unsigned int x;
unsigned int y;
bool bExclusionFlag;
// Extract cell coordinates
decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
GrycukParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx;
GrycukParticle localParticle;
loadGrycukShared( &localParticle, atomI );
float bornSum = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// load shared data
loadGrycukShared( &(sA[threadIdx.x]), atomI );
for (unsigned int j = 0; j < GRID; j++)
{
float localBornSum;
calculateGrycukBornRadiiPairIxn_kernel( localParticle, psA[j], &localBornSum );
bornSum += ( (atomI == (y + j)) || (atomI >= cSim.atoms) || ((y+j) >= cSim.atoms) ) ? 0.0 : localBornSum;
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += bornSum;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = bornSum;
#endif
} else {
if (lasty != y) {
unsigned int atomJ = y + tgx;
loadGrycukShared( &(sA[threadIdx.x]), atomJ );
}
// zero shared fields
zeroGrycukParticleSharedField( &(sA[threadIdx.x]) );
for (unsigned int j = 0; j < GRID; j++)
{
float localBornSum;
calculateGrycukBornRadiiPairIxn_kernel( localParticle, psA[tj], &localBornSum );
bornSum += ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0.0 : localBornSum;
calculateGrycukBornRadiiPairIxn_kernel( psA[tj], localParticle, &localBornSum );
psA[tj].bornSum += ( (atomI >= cSim.atoms) || ((y+tj) >= cSim.atoms) ) ? 0.0 : localBornSum;
tj = (tj + 1) & (GRID - 1);
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += bornSum;
offset = y + tgx + warp*cSim.stride;
cSim.pBornSum[offset] += sA[threadIdx.x].bornSum;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = bornSum;
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
cSim.pBornSum[offset] = sA[threadIdx.x].bornSum;
#endif
lasty = y;
}
pos++;
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaScaleFactors.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaGrycukChainRule, _kernel)( unsigned int* workUnit
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
extern __shared__ GrycukChainRuleParticle sAChainRule[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF;
#ifdef AMOEBA_DEBUG
float4 pullDebug[5];
#endif
while (pos < end)
{
unsigned int x;
unsigned int y;
bool bExclusionFlag;
// Extract cell coordinates
decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
GrycukChainRuleParticle* psAChainRule = &sAChainRule[tbx];
unsigned int atomI = x + tgx;
GrycukChainRuleParticle localParticle;
loadGrycukChainRuleParticleShared( &localParticle, atomI );
zeroGrycukChainRuleParticleSharedField( &localParticle );
if (x == y){
// load shared data and zero force
loadGrycukChainRuleParticleShared( &(sAChainRule[threadIdx.x]), atomI );
zeroGrycukChainRuleParticleSharedField( &(sAChainRule[threadIdx.x]));
for (unsigned int j = (tgx+1)&(GRID-1); j != tgx; j = (j+1)&(GRID-1))
{
float localForce[3];
calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[j], localForce
#ifdef AMOEBA_DEBUG
, pullDebug
#endif
);
if( (atomI != (y + j)) && (atomI < cSim.atoms) && ((y+j) < cSim.atoms) ){
localParticle.force[0] -= localForce[0];
localParticle.force[1] -= localForce[1];
localParticle.force[2] -= localForce[2];
psAChainRule[j].force[0] += localForce[0];
psAChainRule[j].force[1] += localForce[1];
psAChainRule[j].force[2] += localForce[2];
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+j) == targetAtom ){
unsigned int index = (atomI == targetAtom) ? (y + j) : atomI;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = -1.0f;
debugArray[index].w = -1.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) tgx;
debugArray[index].w = -2.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = pullDebug[0].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = localForce[0];
debugArray[index].y = localForce[1];
debugArray[index].z = localForce[2];
debugArray[index].w = -12.0f;
calculateGrycukChainRulePairIxn_kernel( psAChainRule[j], localParticle, localForce , pullDebug );
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = -13.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = localForce[0];
debugArray[index].y = localForce[1];
debugArray[index].z = localForce[2];
debugArray[index].w = -14.0f;
}
#endif
}
}
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += localParticle.force[0] + sAChainRule[threadIdx.x].force[0];
of.y += localParticle.force[1] + sAChainRule[threadIdx.x].force[1];
of.z += localParticle.force[2] + sAChainRule[threadIdx.x].force[2];
cSim.pForce4[offset] = of;
} else {
if (lasty != y) {
unsigned int atomJ = y + tgx;
loadGrycukChainRuleParticleShared( &(sAChainRule[threadIdx.x]), atomJ );
}
// zero shared fields
zeroGrycukChainRuleParticleSharedField( &(sAChainRule[threadIdx.x]) );
for (unsigned int j = 0; j < GRID; j++)
{
if( (atomI < cSim.atoms) && ((y+tj) < cSim.atoms) ){
float localForce[3];
calculateGrycukChainRulePairIxn_kernel( localParticle, psAChainRule[tj], localForce
#ifdef AMOEBA_DEBUG
, pullDebug
#endif
);
localParticle.force[0] -= localForce[0];
localParticle.force[1] -= localForce[1];
localParticle.force[2] -= localForce[2];
psAChainRule[tj].force[0] += localForce[0];
psAChainRule[tj].force[1] += localForce[1];
psAChainRule[tj].force[2] += localForce[2];
#ifdef AMOEBA_DEBUG
unsigned int index = (atomI == targetAtom) ? (y + tj) : atomI;
if( atomI == targetAtom || (y+tj) == targetAtom ){
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = -1.0f;
debugArray[index].w = -1.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = (float) x;
debugArray[index].y = (float) y;
debugArray[index].z = (float) tgx;
debugArray[index].w = -2.0f;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = pullDebug[0].y;
debugArray[index].z = pullDebug[0].z;
debugArray[index].w = pullDebug[0].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[1].x;
debugArray[index].y = pullDebug[1].y;
debugArray[index].z = pullDebug[1].z;
debugArray[index].w = pullDebug[1].w;
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = localForce[0];
debugArray[index].y = localForce[1];
debugArray[index].z = localForce[2];
debugArray[index].w = -10.0f;
}
#endif
calculateGrycukChainRulePairIxn_kernel( psAChainRule[tj], localParticle, localForce
#ifdef AMOEBA_DEBUG
, pullDebug
#endif
);
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+tj) == targetAtom ){
index += cSim.paddedNumberOfAtoms;
debugArray[index].x = pullDebug[0].x;
debugArray[index].y = localForce[1];
debugArray[index].z = localForce[2];
debugArray[index].w = -11.0f;
}
#endif
localParticle.force[0] += localForce[0];
localParticle.force[1] += localForce[1];
localParticle.force[2] += localForce[2];
psAChainRule[tj].force[0] -= localForce[0];
psAChainRule[tj].force[1] -= localForce[1];
psAChainRule[tj].force[2] -= localForce[2];
}
tj = (tj + 1) & (GRID - 1);
}
// Write results
float4 of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = x + tgx + warp*cSim.stride;
#else
unsigned int offset = x + tgx + (y >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += localParticle.force[0];
of.y += localParticle.force[1];
of.z += localParticle.force[2];
cSim.pForce4[offset] = of;
#ifdef USE_OUTPUT_BUFFER_PER_WARP
offset = y + tgx + warp*cSim.stride;
#else
offset = y + tgx + (x >> GRIDBITS) * cSim.stride;
#endif
of = cSim.pForce4[offset];
of.x += sAChainRule[threadIdx.x].force[0];
of.y += sAChainRule[threadIdx.x].force[1];
of.z += sAChainRule[threadIdx.x].force[2];
cSim.pForce4[offset] = of;
lasty = y;
}
pos++;
}
}
......@@ -1666,8 +1666,8 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceToBornForcePrefactor_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
float* fieldOut )
void kReduceToObcBornForcePrefactor_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
float* fieldOut )
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -1723,8 +1723,8 @@ __launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceToBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
float* fieldOut )
void kReduceToObcBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
float* fieldOut )
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -1788,17 +1788,137 @@ void kReduceToBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, un
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
}
/*
static void kReduceAndCombine_dBorn(amoebaGpuContext amoebaGpu )
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceToBornForcePrefactor_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
float* fieldOut )
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
// Reduce field
while (pos < fieldComponents)
{
float totalField = 0.0f;
float* pFt1 = fieldIn1 + pos;
float* pFt2 = fieldIn2 + pos;
//float bornRadius = cSim.pBornRadii[pos];
//float obcChain = cSim.pObcChain[pos];
unsigned int i = outputBuffers;
while (i >= 4)
{
totalField += pFt1[0] + pFt1[fieldComponents] + pFt1[2*fieldComponents] + pFt1[3*fieldComponents];
totalField += pFt2[0] + pFt2[fieldComponents] + pFt2[2*fieldComponents] + pFt2[3*fieldComponents];
pFt1 += fieldComponents*4;
pFt2 += fieldComponents*4;
i -= 4;
}
if (i >= 2)
{
totalField += pFt1[0] + pFt1[fieldComponents];
totalField += pFt2[0] + pFt2[fieldComponents];
pFt1 += fieldComponents*2;
pFt2 += fieldComponents*2;
i -= 2;
}
if (i > 0)
{
totalField += pFt1[0];
totalField += pFt2[0];
}
//fieldOut[pos] = totalField*bornRadius*bornRadius*obcChain;
fieldOut[pos] = totalField;
pos += gridDim.x * blockDim.x;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceToBornForcePrefactorAndSASA_kernel( unsigned int fieldComponents, unsigned int outputBuffers, float* fieldIn1, float* fieldIn2,
float* fieldOut )
{
unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
float energy = 0.0f;
// Reduce field
while (pos < fieldComponents)
{
float totalForce = 0.0f;
float* pFt1 = fieldIn1 + pos;
float* pFt2 = fieldIn2 + pos;
float bornRadius = cSim.pBornRadii[pos];
//float obcChain = cSim.pObcChain[pos];
float2 obcData = cSim.pObcData[pos];
unsigned int i = outputBuffers;
while (i >= 4)
{
totalForce += pFt1[0] + pFt1[fieldComponents] + pFt1[2*fieldComponents] + pFt1[3*fieldComponents];
totalForce += pFt2[0] + pFt2[fieldComponents] + pFt2[2*fieldComponents] + pFt2[3*fieldComponents];
pFt1 += fieldComponents*4;
pFt2 += fieldComponents*4;
i -= 4;
}
if (i >= 2)
{
totalForce += pFt1[0] + pFt1[fieldComponents];
totalForce += pFt2[0] + pFt2[fieldComponents];
pFt1 += fieldComponents*2;
pFt2 += fieldComponents*2;
i -= 2;
}
if (i > 0)
{
totalForce += pFt1[0];
totalForce += pFt2[0];
}
kReduceAndCombineFields_kernel<<<gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_1_1->_pDevData,
amoebaGpu->psWorkArray_1_2->_pDevData,
amoebaGpu->psBorn->_pDevData );
LAUNCHERROR("kReduce_dBorn");
} */
float r = (obcData.x + cSim.dielectricOffset + cSim.probeRadius);
float ratio6 = ( (obcData.x + cSim.dielectricOffset) / bornRadius);
ratio6 = ratio6*ratio6*ratio6;
ratio6 = ratio6*ratio6;
float saTerm = cSim.surfaceAreaFactor * r * r * ratio6;
totalForce += saTerm / bornRadius;
//totalForce *= bornRadius * bornRadius * obcChain;
fieldOut[pos] = totalForce;
energy += saTerm;
pos += gridDim.x * blockDim.x;
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += energy / -6.0f;
}
static void kReduceToBornForcePrefactor( amoebaGpuContext amoebaGpu )
{
......@@ -1982,7 +2102,8 @@ void kCalculateAmoebaKirkwood( amoebaGpuContext amoebaGpu )
// Tinker's Born1 && E-diff
kCalculateObcGbsaForces2( amoebaGpu->gpuContext );
//kCalculateObcGbsaForces2( amoebaGpu->gpuContext );
kCalculateGrycukGbsaForces2( amoebaGpu );
kCalculateAmoebaKirkwoodEDiff( amoebaGpu );
// ---------------------------------------------------------------------------------------
......
......@@ -36,6 +36,7 @@
#include "AmoebaReferenceMultipoleForce.h"
#include "AmoebaReferenceVdwForce.h"
#include "AmoebaReferenceWcaDispersionForce.h"
#include "openmm/internal/AmoebaTorsionTorsionForceImpl.h"
#include "openmm/internal/AmoebaWcaDispersionForceImpl.h"
#include "AmoebaReferenceUreyBradleyForce.h"
#include "ReferencePlatform.h"
......@@ -412,16 +413,31 @@ void ReferenceCalcAmoebaTorsionTorsionForceKernel::initialize(const System& syst
for (int ii = 0; ii < numTorsionTorsionGrids; ii++) {
const TorsionTorsionGrid grid = force.getTorsionTorsionGrid( ii );
torsionTorsionGrids[ii].resize( grid.size() );
// check if grid needs to be reordered: x-angle should be 'slow' index
TorsionTorsionGrid reorderedGrid;
int reorder = 0;
if( grid[0][0][0] != grid[0][1][0] ){
AmoebaTorsionTorsionForceImpl::reorderGrid( grid, reorderedGrid );
reorder = 1;
}
for (unsigned int kk = 0; kk < grid.size(); kk++) {
torsionTorsionGrids[ii][kk].resize( grid[kk].size() );
for (unsigned int jj = 0; jj < grid[kk].size(); jj++) {
torsionTorsionGrids[ii][kk][jj].resize( grid[kk][jj].size() );
for (unsigned int ll = 0; ll < grid[ll][jj].size(); ll++) {
torsionTorsionGrids[ii][kk][jj][ll] = static_cast<RealOpenMM>(grid[kk][jj][ll]);
if( reorder ){
for (unsigned int ll = 0; ll < grid[ll][jj].size(); ll++) {
torsionTorsionGrids[ii][kk][jj][ll] = static_cast<RealOpenMM>(reorderedGrid[kk][jj][ll]);
}
} else {
for (unsigned int ll = 0; ll < grid[ll][jj].size(); ll++) {
torsionTorsionGrids[ii][kk][jj][ll] = static_cast<RealOpenMM>(grid[kk][jj][ll]);
}
}
}
}
......
......@@ -27,8 +27,8 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "openmm/amoebaKernels.h"
#include "openmm/System.h"
#include "openmm/amoebaKernels.h"
#include "SimTKUtilities/SimTKOpenMMRealType.h"
namespace OpenMM {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment