Commit cb130f92 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Mods

parent cc8b4de0
......@@ -1536,3 +1536,276 @@ std::string BrookBonded::getContentsString( int level ) const {
return message.str();
}
/*
* Helper functions for building inverse maps for
* torsions, impropers and angles.
*
* For each atom, calculates the positions at which it's
* forces are to be picked up from and stores the position
* in the appropriate index.
*
* Input: number of dihedrals, the atom indices, and a flag indicating
* whether we're doing i(0), j(1), k(2) or l(3)
* Output: an array of counts per atom
* arrays of inversemaps
* nimaps - the number of invmaps actually used.
*
* @param posflag 0-niatoms-1
* @param niatoms 3 for angles, 4 for torsions, impropers
* @param nints number of interactions
* @param natoms number of atoms
* @param *atoms gromacs interaction list
* @param nmaps maximum number of inverse maps
* @param counts[] output counts of how many places each atom occurs
* @param *invmaps[] output array of nmaps inverse maps
* @param *nimaps, output max number of inverse maps actually used
*
* @return DefaultReturnValue, unless error in which case exits w/ OpenMM exception
*
**/
int BrookBonded::gpuCalcInvMap( int posflag, int niatoms, int nints, int natoms,
int *atoms, int nmaps, int counts[], float4 *invmaps[],
int *nimaps ){
// ---------------------------------------------------------------------------------------
int i, j;
int atom;
int mapnum, mapcomp;
static const std::string methodName = "BrookBonded::gpuCalcInvMap";
static const unsigned int MAX_LINE_CHARS = 256;
//char value[MAX_LINE_CHARS];
static const char* Set = "Set";
static const char* NotSet = "Not set";
static const int PrintOn = 0;
// ---------------------------------------------------------------------------------------
memset( counts, 0, sizeof( int )*natoms );
for( i = 0; i < nmaps; i++ ){
for( j = 0; j < natoms; j++ ){
invmaps[i][j] = float4( -1.0, -1.0, -1.0, -1.0 );
}
}
//This will hold the number of imaps actually used
*nimaps = -1;
//Now note down the positions where each atom occurs
if( PrintOn && getLog() ){
(void) fprintf( getLog(), "%s: pos=%d ni=%d nints=%d natoms=%d nmaps=<%d>\n", methodName.c_str(), posflag, niatoms, nints, natoms, nmaps );
(void) fflush( getLog() );
}
int atomRange[2] = { 90000000, -90000000 };
int mapnumRange[2] = { 90000000, -90000000 };
for( i = 0; i < nints; i++ ){
//This is our atom
atom = atoms[ (niatoms + 1) * i + posflag + 1 ];
//Special for merged bondeds
if ( atom == -1 ){
continue;
}
if( atom < atomRange[0] ){
atomRange[0] = atom;
}
if( atom > atomRange[1] ){
atomRange[1] = atom;
}
//Check to make sure we're inside the limits
if ( counts[atom] > nmaps * 4 ){
if( PrintOn && getLog() ){
(void) fprintf( getLog(), "%s Atom %d has too many proper dihedrals(%d, max %d)\n",
methodName.c_str(), atom, counts[atom], nmaps*4 );
(void) fflush( getLog() );
}
std::stringstream message;
message << methodName << " Atom " << atom << " has too many proper dihedrals; valid range:(" << counts[atom] << ", " << nmaps*4 << ")";
throw OpenMMException( message.str() );
}
//Which invmap will this go into
mapnum = counts[atom] / 4;
if ( mapnum > *nimaps )
*nimaps = mapnum;
//Which component will it be
mapcomp = counts[atom] % 4;
//Set it
//This is silly, but otherwise I have to declare it as float*
//and things get even more confusing. :)
switch (mapcomp){
case 0: invmaps[mapnum][atom].x = (float) i; break;
case 1: invmaps[mapnum][atom].y = (float) i; break;
case 2: invmaps[mapnum][atom].z = (float) i; break;
case 3: invmaps[mapnum][atom].w = (float) i; break;
default:
if( PrintOn && getLog() ){
(void) fprintf( getLog(), "mapcomp %d invalid -- impossible!\n", mapcomp );
(void) fflush( getLog() );
}
std::stringstream message;
message << methodName << " mapcomp " << mapcomp << " invalid -- actually impossible!";
throw OpenMMException( message.str() );
break;
}
counts[atom]++;
if( mapnum < mapnumRange[0] ){
mapnumRange[0] = mapnum;
}
if( mapnum > mapnumRange[1] ){
mapnumRange[1] = mapnum;
}
//fprintf( gpu->log, "%d atom=%d mapcomp=%d counts[]=%d mapnum=%d\n", i, atom, mapcomp, counts[atom], mapnum );
}
(*nimaps)++;
if( PrintOn && getLog() ){
(void) fprintf( getLog(), "%s mnmaps=%d Ranges: atom [%d %d] mapnum [%d %d]\n",
methodName.c_str(), *nimaps, atomRange[0], atomRange[1], mapnumRange[0], mapnumRange[1] );
(void) fflush( getLog() );
}
return DefaultReturnValue;
}
void BrookBonded::gpuPrintInvMaps( int nmaps, int natoms, int counts[], float4 *invmap[], FILE* logFile ){
int i;
int j;
for( i = 0; i < natoms; i++ ){
fprintf( logFile, "%d %d ", i, counts[i] );
for( j = 0; j < nmaps; j++ ){
fprintf( logFile, "%6.0f %6.0f %6.0f %6.0f", invmap[j][i].x, invmap[j][i].y,
invmap[j][i].z, invmap[j][i].w );
}
fprintf( logFile, "\n");
}
}
/* We are still plagued by kernel call overheads. This is for a big fat
* merged inverse gather kernel:
* Since we have 32 bit floats, we have 23 bits of mantissa or the largest
* integer we can represent is 2^23. So it should be quite safe to add
* 100000 * n to the index where n is the stream in which we should do the
* lookup. This assumes that nints < 100000, preferably nints << 100000
* which should always be true
* */
int BrookBonded::gpuCalcInvMap_merged(
int nints, //number of interactions
int natoms, //number of atoms
int *atoms, //ijkl,ijkl,ijkl...
int nmaps, //maximum number of inverse maps
int counts[], //output counts of how many places each atom occurs
float4 *invmaps[], //output array of nmaps inverse maps
int *nimaps //output max number of inverse maps actually used
){
int i, j;
int atom;
int mapnum, mapcomp;
int pos;
for( i = 0; i < natoms; i++ )
counts[i] = 0;
for( i = 0; i < nmaps; i++ ){
for( j = 0; j < natoms; j++ ){
invmaps[i][j] = float4( -1.0, -1.0, -1.0, -1.0 );
}
}
//This will hold the number of imaps actually used
*nimaps = -1;
//For each atom
for( i = 0; i < nints; i++ ){
for( j = 0; j < 4; j++ ){
atom = atoms[ i * 4 + j ];
if ( atom == -1 ){
//Nothing to be done for this atom, go to next
continue;
}
//Which map
mapnum = counts[ atom ] / 4;
//Make sure we have space
if ( mapnum >= nmaps ){
printf( "Atom %d has too many bondeds(%d, max %d)\n",
atom, counts[atom], nmaps * 4 );
return 0;
}
if ( mapnum > *nimaps ){
*nimaps = mapnum;
}
//Which component
mapcomp = counts[ atom ] % 4;
//Encode target stream and position
pos = 100000 * j + i;
switch ( mapcomp ){
case 0: invmaps[mapnum][atom].x = (float) pos; break;
case 1: invmaps[mapnum][atom].y = (float) pos; break;
case 2: invmaps[mapnum][atom].z = (float) pos; break;
case 3: invmaps[mapnum][atom].w = (float) pos; break;
}
counts[ atom ]++;
}
}
(*nimaps)++;
return 1;
}
/* Repacks the invmap streams for more efficient access in the
* merged inverse gather kernel
*
* buf should be nimaps * natoms large.
* */
int BrookBonded::gpuRepackInvMap_merged( int natoms, int nmaps, int *counts,
float4 *invmaps[], float4 *buf ){
int i, j;
int nmaps_i;
for( i = 0; i < natoms; i++ ){
for( j = 0; j < nmaps; j++ ){
buf[ i + j*natoms ] = float4( -1.0f, -1.0f, -1.0f, -1.0f );
}
}
for( i = 0; i < natoms; i++ ){
nmaps_i = counts[i] / 4;
if ( counts[i] % 4 )
nmaps_i++;
for( j = 0; j < nmaps_i; j++ ){
buf[ i + j * natoms ] = invmaps[j][i];
}
}
return 1;
}
......@@ -421,6 +421,58 @@ class BrookBonded : public BrookCommon {
int validateInverseMapStreamCount( int index, int count ) const;
/*
* Helper functions for building inverse maps for
* torsions, impropers and angles.
*
* For each atom, calculates the positions at which it's
* forces are to be picked up from and stores the position
* in the appropriate index.
*
* Input: number of dihedrals, the atom indices, and a flag indicating
* whether we're doing i(0), j(1), k(2) or l(3)
* Output: an array of counts per atom
* arrays of inversemaps
* nimaps - the number of invmaps actually used.
*
* @param posflag 0-niatoms-1
* @param niatoms 3 for angles, 4 for torsions, impropers
* @param nints number of interactions
* @param natoms number of atoms
* @param *atoms gromacs interaction list
* @param nmaps maximum number of inverse maps
* @param counts[] output counts of how many places each atom occurs
* @param *invmaps[] output array of nmaps inverse maps
* @param *nimaps, output max number of inverse maps actually used
*
* @return DefaultReturnValue, unless error in which case exits w/ OpenMM exception
*
**/
int gpuCalcInvMap( int posflag, int niatoms, int nints, int natoms,
int *atoms, int nmaps, int counts[], float4 *invmaps[],
int *nimaps );
void gpuPrintInvMaps( int nmaps, int natoms, int counts[], float4 *invmap[], FILE* logFile );
/* We are still plagued by kernel call overheads. This is for a big fat
* merged inverse gather kernel:
* Since we have 32 bit floats, we have 23 bits of mantissa or the largest
* integer we can represent is 2^23. So it should be quite safe to add
* 100000 * n to the index where n is the stream in which we should do the
* lookup. This assumes that nints < 100000, preferably nints << 100000
* which should always be true
* */
int gpuCalcInvMap_merged( int nints, int natoms, int *atoms, int nmaps, int counts[], float4 *invmaps[], int *nimaps );
/* Repacks the invmap streams for more efficient access in the
* merged inverse gather kernel
*
* buf should be nimaps * natoms large.
* */
int gpuRepackInvMap_merged( int natoms, int nmaps, int *counts, float4 *invmaps[], float4 *buf );
};
} // namespace OpenMM
......
......@@ -36,9 +36,8 @@
#include "BrookStreamImpl.h"
#include "BrookCalcGBSAOBCForceFieldKernel.h"
#include "force.h"
#include "kgbsa.h"
#include "kforce.h"
#include "gpu/kgbsa.h"
#include "gpu/kforce.h"
#include "math.h"
using namespace OpenMM;
......
......@@ -33,7 +33,7 @@
* -------------------------------------------------------------------------- */
#include "kernels.h"
#include "SimTKUtilities/SimTKOpenMMRealType.h"
#include "../../reference/src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "BrookGbsa.h"
namespace OpenMM {
......
......@@ -36,10 +36,9 @@
#include "BrookStreamImpl.h"
#include "BrookCalcStandardMMForceFieldKernel.h"
#include "kforce.h"
#include "kinvmap_gather.h"
#include "gpu/kforce.h"
#include "gpu/kinvmap_gather.h"
#include "ReferencePlatform.h"
#include "ReferenceFloatStreamImpl.h"
#include "VerletIntegrator.h"
#include "StandardMMForceField.h"
......@@ -254,7 +253,7 @@ void BrookCalcStandardMMForceFieldKernel::initialize(
const vector<int> periodicTorsionInd = *periodicTorsionI_it;
const vector<double> periodicTorsionPrm = *periodicTorsionP_it;
_refForceField->setPeriodicTorsionParameters( ii, periodicTorsionInd[0], periodicTorsionInd[1], periodicTorsionInd[2], periodicTorsionInd[3],
periodicTorsionPrm[2], periodicTorsionPrm[1], periodicTorsionPrm[0] );
(int) (periodicTorsionPrm[2] + 0.001), periodicTorsionPrm[1], periodicTorsionPrm[0] );
/*
printf( "PeriodicTor: [%d %d %d %d] [%.5e %.5e %.5e]\n", periodicTorsionInd[0], periodicTorsionInd[1], periodicTorsionInd[2], periodicTorsionInd[3],
periodicTorsionPrm[2], periodicTorsionPrm[1], periodicTorsionPrm[0] ); fflush( stdout );
......
......@@ -33,7 +33,7 @@
* -------------------------------------------------------------------------- */
#include "kernels.h"
#include "SimTKUtilities/SimTKOpenMMRealType.h"
#include "../../reference/src/SimTKUtilities/SimTKOpenMMRealType.h"
#include "BrookBonded.h"
#include "BrookNonBonded.h"
#include "StandardMMForceField.h"
......
......@@ -34,7 +34,7 @@
#include "BrookPlatform.h"
#include "BrookStreamInternal.h"
#include "SimTKUtilities/SimTKOpenMMRealType.h"
#include "../../reference/src/SimTKUtilities/SimTKOpenMMRealType.h"
namespace OpenMM {
......
......@@ -33,7 +33,7 @@
#include "BrookKernelFactory.h"
#include "OpenMMException.h"
#include "kernels.h"
#include "SimTKUtilities/SimTKOpenMMRealType.h"
#include "../../reference/src/SimTKUtilities/SimTKOpenMMRealType.h"
#include <brook/brook.hpp>
#include <stdlib.h>
#include <sstream>
......@@ -83,10 +83,20 @@ BrookPlatform::BrookPlatform( ){
// get Brook runtime
#ifdef WIN32
char* runtime;
size_t numberOfEnv;
_dupenv_s( &runtime, &numberOfEnv, "brt_runtime" );
#else
char* runtime = getenv( "brt_runtime" );
#endif
_initializeKernelFactory( );
_setBrookRuntime( runtime );
#ifdef WIN32
free( runtime );
#endif
}
/**
......
......@@ -32,11 +32,7 @@
#include <sstream>
#include "BrookRandomNumberGenerator.h"
#include "OpenMMException.h"
#include "kupdatesd.h"
// use random number generator
#include "SimTKOpenMMUtilities.h"
#include "gpu/kupdatesd.h"
using namespace OpenMM;
using namespace std;
......@@ -244,7 +240,7 @@ inline int MaxInt( unsigned int x, unsigned int y ){ return x > y ? x : y; }
*/
void BrookRandomNumberGenerator::_generateRandomsKiss( float* randomV1, float* randomV2, float* randomV3,
unsigned int state[4] ){
unsigned int state[4] ){
// ---------------------------------------------------------------------------------------
......
......@@ -35,10 +35,6 @@
#include "OpenMMException.h"
#include "BrookStreamImpl.h"
// use random number generator
#include "SimTKOpenMMUtilities.h"
using namespace OpenMM;
using namespace std;
......
......@@ -34,13 +34,13 @@
#include "BrookPlatform.h"
#include "OpenMMException.h"
#include "BrookStreamImpl.h"
#include "kshakeh.h"
#include "kupdatesd.h"
#include "kcommon.h"
#include "gpu/kshakeh.h"
#include "gpu/kupdatesd.h"
#include "gpu/kcommon.h"
// use random number generator
#include "SimTKOpenMMUtilities.h"
#include "../../reference/src/SimTKUtilities/SimTKOpenMMUtilities.h"
using namespace OpenMM;
using namespace std;
......
......@@ -33,7 +33,6 @@
* -------------------------------------------------------------------------- */
#include <brook/brook.hpp>
#include "SimTKUtilities/SimTKOpenMMRealType.h"
namespace OpenMM {
......
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
#include <stdio.h>
#include <brook/brook.hpp>
// #include "typedefs.h"
#include "invmap.h"
/*
* Helper functions for building inverse maps for
* torsions, impropers and angles.
*
* */
/*
* For each atom, calculates the positions at which it's
* forces are to be picked up from and stores the position
* in the appropriate index.
*
* Input: number of dihedrals, the atom indices, and a flag indicating
* whether we're doing i(0), j(1), k(2) or l(3)
* Output: an array of counts per atom
* arrays of inversemaps
* nimaps - the number of invmaps actually used.
*
* */
int
gpuCalcInvMap(
int posflag, //0-niatoms-1
int niatoms, //3 for angles, 4 for torsions, impropers
int nints, //number of interactions
int natoms, //number of atoms
int *atoms, //gromacs interaction list
int nmaps, //maximum number of inverse maps
int counts[], //output counts of how many places each atom occurs
float4 *invmaps[], //output array of nmaps inverse maps
int *nimaps //output max number of inverse maps actually used
)
{
int i, j;
int atom;
int mapnum, mapcomp;
for ( i = 0; i < natoms; i++ )
counts[i] = 0;
for ( i = 0; i < nmaps; i++ ) {
for ( j = 0; j < natoms; j++ ) {
invmaps[i][j] = float4( -1.0, -1.0, -1.0, -1.0 );
}
}
//This will hold the number of imaps actually used
*nimaps = -1;
printf( "gpuCalcInvMap: posflag=%d niatoms=%d nints=%d natoms=%d nmaps=%d\n",
posflag, niatoms, nints, natoms, nmaps );
//Now note down the positions where each atom occurs
for ( i = 0; i < nints; i++ ) {
//This is our atom
atom = atoms[ (niatoms + 1) * i + posflag + 1 ];
//Special for merged bondeds
if ( atom == -1 ) {
continue;
}
//Check to make sure we're inside the limits
if ( counts[atom] > nmaps * 4 ) {
printf( "Atom %d has too many proper dihedrals(%d, max %d)\n",
atom, counts[atom], nmaps * 4 );
return 0;
}
//Which invmap will this go into
mapnum = counts[atom] / 4;
if ( mapnum > *nimaps )
*nimaps = mapnum;
//Which component will it be
mapcomp = counts[atom] % 4;
//Set it
//This is silly, but otherwise I have to declare it as float*
//and things get even more confusing. :)
switch (mapcomp) {
case 0: invmaps[mapnum][atom].x = (float) i; break;
case 1: invmaps[mapnum][atom].y = (float) i; break;
case 2: invmaps[mapnum][atom].z = (float) i; break;
case 3: invmaps[mapnum][atom].w = (float) i; break;
}
counts[atom]++;
printf( "Atom %d count=%d max %d mapcomp=%d val=%d mapnum=%d\n", atom, counts[atom],
nmaps * 4, mapcomp, i, mapnum );
}
(*nimaps)++;
return 1;
}
void
gpuPrintInvMaps( int nmaps, int natoms, int counts[], float4 *invmap[], FILE* logFile )
{
int i;
int j;
for ( i = 0; i < natoms; i++ ) {
fprintf( logFile, "%d %d ", i, counts[i] );
for ( j = 0; j < nmaps; j++ ) {
fprintf( logFile, "%6.0f %6.0f %6.0f %6.0f", invmap[j][i].x, invmap[j][i].y,
invmap[j][i].z, invmap[j][i].w );
}
fprintf( logFile, "\n");
}
}
/* We are still plagued by kernel call overheads. This is for a big fat
* merged inverse gather kernel:
* Since we have 32 bit floats, we have 23 bits of mantissa or the largest
* integer we can represent is 2^23. So it should be quite safe to add
* 100000 * n to the index where n is the stream in which we should do the
* lookup. This assumes that nints < 100000, preferably nints << 100000
* which should always be true
* */
int
gpuCalcInvMap_merged(
int nints, //number of interactions
int natoms, //number of atoms
int *atoms, //ijkl,ijkl,ijkl...
int nmaps, //maximum number of inverse maps
int counts[], //output counts of how many places each atom occurs
float4 *invmaps[], //output array of nmaps inverse maps
int *nimaps //output max number of inverse maps actually used
)
{
int i, j;
int atom;
int mapnum, mapcomp;
int pos;
for ( i = 0; i < natoms; i++ )
counts[i] = 0;
for ( i = 0; i < nmaps; i++ ) {
for ( j = 0; j < natoms; j++ ) {
invmaps[i][j] = float4( -1.0, -1.0, -1.0, -1.0 );
}
}
//This will hold the number of imaps actually used
*nimaps = -1;
//For each atom
for ( i = 0; i < nints; i++ ) {
for ( j = 0; j < 4; j++ ) {
atom = atoms[ i * 4 + j ];
if ( atom == -1 ) {
//Nothing to be done for this atom, go to next
continue;
}
//Which map
mapnum = counts[ atom ] / 4;
//Make sure we have space
if ( mapnum >= nmaps ) {
printf( "Atom %d has too many bondeds(%d, max %d)\n",
atom, counts[atom], nmaps * 4 );
return 0;
}
if ( mapnum > *nimaps ) {
*nimaps = mapnum;
}
//Which component
mapcomp = counts[ atom ] % 4;
//Encode target stream and position
pos = 100000 * j + i;
switch ( mapcomp ) {
case 0: invmaps[mapnum][atom].x = (float) pos; break;
case 1: invmaps[mapnum][atom].y = (float) pos; break;
case 2: invmaps[mapnum][atom].z = (float) pos; break;
case 3: invmaps[mapnum][atom].w = (float) pos; break;
}
counts[ atom ]++;
}
}
(*nimaps)++;
return 1;
}
/* Repacks the invmap streams for more efficient access in the
* merged inverse gather kernel
*
* buf should be nimaps * natoms large.
* */
int
gpuRepackInvMap_merged( int natoms, int nmaps, int *counts,
float4 *invmaps[], float4 *buf )
{
int i, j;
int nmaps_i;
for ( i = 0; i < natoms; i++ ) {
for ( j = 0; j < nmaps; j++ ) {
buf[ i + j*natoms ] = float4( -1.0f, -1.0f, -1.0f, -1.0f );
}
}
for ( i = 0; i < natoms; i++ ) {
nmaps_i = counts[i] / 4;
if ( counts[i] % 4 )
nmaps_i++;
for ( j = 0; j < nmaps_i; j++ ) {
buf[ i + j * natoms ] = invmaps[j][i];
}
}
return 1;
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
*
* This kernel was developed in collaboration with
*
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
kernel void loop2Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled,
float bornForce, float iAtomicRadii, out float4 de<>, out float4 bornSum<> ){
// ---------------------------------------------------------------------------------------
float4 r, rInverse, r2, r2Inverse;
float4 l_ij, l_ij2;
float4 u_ij, u_ij2;
float4 t3;
float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs;
float4 iAtomicRadii4;
float4 diff_u2_l2;
const float bigValue = 1000000.0f;
const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue );
const float smallValue = 0.000001f;
const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
float4 mask;
// ---------------------------------------------------------------------------------------
iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii );
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
// ---------------------------------------------------------------------------------------
// not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0
// r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work
mask = r2 < smallValue4 ? zero4 : one4;
// r2 = r2 < smallValue4 ? bigValue4 : r2;
// ---------------------------------------------------------------------------------------
r = sqrt( r2 );
rInverse = rsqrt( r2 );
r2Inverse = rInverse*rInverse;
rPlusScaledJ = r + jAtomicRadiiScaled;
rMinusScaledJ = r - jAtomicRadiiScaled;
rMinusScaledJAbs = abs( rMinusScaledJ );
// ---------------------------------------------------------------------------------------
mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask;
l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs;
// ---------------------------------------------------------------------------------------
l_ij = 1.0f/l_ij;
l_ij2 = l_ij*l_ij;
u_ij = 1.0f/rPlusScaledJ;
u_ij2 = u_ij*u_ij;
diff_u2_l2 = u_ij2 - l_ij2;
scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled;
logRatio = log(u_ij/l_ij);
// terms associated w/ dL/dr & dU/dr are zero and not included
t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse;
de = mask*bornForce*t3*rInverse;
// de = float4( 1.0f, 1.0f, 1.0f, 1.0f );
// de = scaledRadiusJ2;
// de = mask*t3;
// de = mask*rPlusScaledJ;
// de = mask*r2;
// de = mask;
// de = mask*rInverse;
// de = mask*bornForce;
// Born radius term
// bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4;
bornSum += t3;
bornSum *= mask;
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
*
* This kernel was developed in collaboration with
*
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
kernel void loop2InternalR2( float4 r2, float4 jAtomicRadiiScaled,
float bornForce, float iAtomicRadii, out float4 de<>, out float4 bornSum<> ){
// ---------------------------------------------------------------------------------------
float4 r, rInverse, r2Inverse;
float4 l_ij, l_ij2;
float4 u_ij, u_ij2;
float4 t3;
float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs;
float4 iAtomicRadii4;
float4 diff_u2_l2;
const float bigValue = 1000000.0f;
const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue );
const float smallValue = 0.000001f;
const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
float4 mask;
// ---------------------------------------------------------------------------------------
iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii );
// ---------------------------------------------------------------------------------------
// not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0
// r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work
mask = r2 < smallValue4 ? zero4 : one4;
// r2 = r2 < smallValue4 ? bigValue4 : r2;
// ---------------------------------------------------------------------------------------
r = sqrt( r2 );
rInverse = rsqrt( r2 );
r2Inverse = rInverse*rInverse;
rPlusScaledJ = r + jAtomicRadiiScaled;
rMinusScaledJ = r - jAtomicRadiiScaled;
rMinusScaledJAbs = abs( rMinusScaledJ );
// ---------------------------------------------------------------------------------------
mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask;
l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs;
// ---------------------------------------------------------------------------------------
l_ij = 1.0f/l_ij;
l_ij2 = l_ij*l_ij;
u_ij = 1.0f/rPlusScaledJ;
u_ij2 = u_ij*u_ij;
diff_u2_l2 = u_ij2 - l_ij2;
scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled;
logRatio = log(u_ij/l_ij);
// terms associated w/ dL/dr & dU/dr are zero and not included
t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse;
de = mask*bornForce*t3*rInverse;
// de = float4( 1.0f, 1.0f, 1.0f, 1.0f );
// de = scaledRadiusJ2;
// de = mask*t3;
// de = mask*rPlusScaledJ;
// de = mask*r2;
// de = mask;
// de = mask*rInverse;
// de = mask*bornForce;
// Born radius term
// bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4;
bornSum += t3;
bornSum *= mask;
}
/* This is a copy of loop2Internal(), but w/ the calculation of bornSum removed */
kernel void loop2InternalNoSum( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled,
float bornForce, float iAtomicRadii, out float4 de<> ){
// ---------------------------------------------------------------------------------------
float4 r, rInverse, r2, r2Inverse;
float4 l_ij, l_ij2;
float4 u_ij, u_ij2;
float4 t3;
float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs;
float4 iAtomicRadii4;
const float bigValue = 1000000.0f;
const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue );
const float smallValue = 0.000001f;
const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
float4 mask;
// ---------------------------------------------------------------------------------------
iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii );
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
// ---------------------------------------------------------------------------------------
// not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0
// mask = r2 < smallValue4 ? zero4 : one4;
// r2 = r2 < smallValue4 ? bigValue4 : r2;
// ---------------------------------------------------------------------------------------
r = sqrt( r2 );
rInverse = rsqrt( r2 );
r2Inverse = rInverse*rInverse;
rPlusScaledJ = r + jAtomicRadiiScaled;
rMinusScaledJ = r - jAtomicRadiiScaled;
rMinusScaledJAbs = abs( rMinusScaledJ );
// ---------------------------------------------------------------------------------------
// mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask;
mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : one4;
l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs;
// ---------------------------------------------------------------------------------------
l_ij = 1.0f/l_ij;
l_ij2 = l_ij*l_ij;
u_ij = 1.0f/rPlusScaledJ;
u_ij2 = u_ij*u_ij;
scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled;
logRatio = log(u_ij/l_ij);
// terms associated w/ dL/dr & dU/dr are zero and not included
t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse;
de = mask*bornForce*t3*rInverse;
// de = rInverse;
// de = float4( 1.0f, 1.0f, 1.0f, 1.0f );
// de = bornForce;
}
// This kernel includes terms dL/dr & dU/dr from paper
// these values are zero, but included here to confirm that is the case
kernel void loop2InternalX( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled,
float bornForce, float iAtomicRadii, out float4 de<>, out float4 bornSum<> ){
// ---------------------------------------------------------------------------------------
float4 r, rInverse, r2, r2Inverse;
float4 l_ij, l_ij2, l_ij3;
float4 u_ij, u_ij2, u_ij3;
float4 t2,t3;
float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs;
float4 iAtomicRadii4;
float4 diff_u2_l2;
const float bigValue = 1000000.0f;
const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue );
const float smallValue = 0.000001f;
const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
float4 mask, t2Mask;
// ---------------------------------------------------------------------------------------
iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii );
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
// ---------------------------------------------------------------------------------------
// not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0
// r2 not masked -- assume for i = j, that 0 * infinity i 0; appears to work
mask = r2 < smallValue4 ? zero4 : one4;
// r2 = r2 < smallValue4 ? bigValue4 : r2;
// ---------------------------------------------------------------------------------------
r = sqrt( r2 );
rInverse = rsqrt( r2 );
r2Inverse = rInverse*rInverse;
rPlusScaledJ = r + jAtomicRadiiScaled;
rMinusScaledJ = r - jAtomicRadiiScaled;
rMinusScaledJAbs = abs( rMinusScaledJ );
// ---------------------------------------------------------------------------------------
mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask;
l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs;
// ---------------------------------------------------------------------------------------
l_ij = 1.0f/l_ij;
l_ij2 = l_ij*l_ij;
u_ij = 1.0f/rPlusScaledJ;
u_ij2 = u_ij*u_ij;
diff_u2_l2 = u_ij2 - l_ij2;
scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled;
logRatio = log(u_ij/l_ij);
// terms associated (t2) w/ dL/dr & dU/dr are zero; included here to test that true
// loop2Internal() should be used
t3 = -0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*diff_u2_l2 + 0.25f*logRatio*r2Inverse;
l_ij3 = l_ij2*l_ij;
// t1 in Tinker code
t2 = 0.5f*l_ij2 + 0.25f*scaledRadiusJ2*l_ij3/r - 0.25f*(l_ij/r + l_ij3*r);
t2Mask = iAtomicRadii4 < rMinusScaledJ ? t2 : zero4;
t3 += t2Mask;
// t2 in Tinker code
u_ij3 = u_ij2*u_ij;
t2 = -0.5f*u_ij2 - 0.25f*scaledRadiusJ2*u_ij3/r + 0.25f*(u_ij/r + u_ij3*r);
t3 += t2;
de = mask*bornForce*t3*rInverse;
// Born radius term
// bornSum = l_ij - u_ij + 0.25*r*diff_u2_l2 + 0.5f*rInverse*logRatio - (0.25*jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
bornSum = l_ij - u_ij + 0.5f*rInverse*logRatio + 0.25f*( r - jAtomicRadiiScaled*jAtomicRadiiScaled*rInverse)*diff_u2_l2;
t3 = ( iAtomicRadii4 < (jAtomicRadiiScaled - r) ) ? 2.0f*( (1.0f/iAtomicRadii) - l_ij) : zero4;
bornSum += t3;
bornSum *= mask;
}
/* This is a copy of loop2Internal(), but w/ the calculation of bornSum removed */
kernel void loop2InternalNoSumX( float3 d1, float3 d2, float3 d3, float3 d4, float4 jAtomicRadiiScaled,
float bornForce, float iAtomicRadii, out float4 de<> ){
// ---------------------------------------------------------------------------------------
float4 r, rInverse, r2, r2Inverse;
float4 l_ij, l_ij2, l_ij3;
float4 u_ij, u_ij2, u_ij3;
float4 t2, t3;
float4 logRatio, scaledRadiusJ2, rPlusScaledJ, rMinusScaledJ, rMinusScaledJAbs;
float4 iAtomicRadii4;
const float bigValue = 1000000.0f;
const float4 bigValue4 = float4( bigValue, bigValue, bigValue, bigValue );
const float smallValue = 0.000001f;
const float4 smallValue4 = float4( smallValue, smallValue, smallValue, smallValue );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
float4 mask, t2Mask;
// ---------------------------------------------------------------------------------------
iAtomicRadii4 = float4( iAtomicRadii, iAtomicRadii, iAtomicRadii, iAtomicRadii );
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
// ---------------------------------------------------------------------------------------
// not needed for force calculation since deltaX = deltaY = deltaZ = 0, if r = 0
// mask = r2 < smallValue4 ? zero4 : one4;
// r2 = r2 < smallValue4 ? bigValue4 : r2;
// ---------------------------------------------------------------------------------------
r = sqrt( r2 );
rInverse = rsqrt( r2 );
r2Inverse = rInverse*rInverse;
rPlusScaledJ = r + jAtomicRadiiScaled;
rMinusScaledJ = r - jAtomicRadiiScaled;
rMinusScaledJAbs = abs( rMinusScaledJ );
// ---------------------------------------------------------------------------------------
// mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : mask;
mask = iAtomicRadii4 >= rPlusScaledJ ? zero4 : one4;
l_ij = iAtomicRadii4 > rMinusScaledJAbs ? iAtomicRadii4 : rMinusScaledJAbs;
// ---------------------------------------------------------------------------------------
l_ij = 1.0f/l_ij;
l_ij2 = l_ij*l_ij;
u_ij = 1.0f/rPlusScaledJ;
u_ij2 = u_ij*u_ij;
scaledRadiusJ2 = jAtomicRadiiScaled*jAtomicRadiiScaled;
logRatio = log(u_ij/l_ij);
// terms associated w/ dL/dr & dU/dr are zero and not included
t3 = 0.125f*(1.0f + scaledRadiusJ2*r2Inverse)*(l_ij2 - u_ij2) + 0.25f*logRatio*r2Inverse;
l_ij3 = l_ij2*l_ij;
// t1 in Tinker code
t2 = 0.5f*l_ij2 + 0.25f*scaledRadiusJ2*l_ij3/r - 0.25f*(l_ij/r + l_ij3*r);
t2Mask = iAtomicRadii4 < rMinusScaledJ ? t2 : zero4;
t3 += t2Mask;
// t2 in Tinker code
u_ij3 = u_ij2*u_ij;
t2 = -0.5f*u_ij2 - 0.25f*scaledRadiusJ2*u_ij3/r + 0.25f*(u_ij/r + u_ij3*r);
t3 += t2;
de = mask*bornForce*t3*rInverse;
}
/* ---------------------------------------------------------------------------------------
Calculate Loop 2 OBC forces (Simbios)
numberOfAtoms: no. of atoms
roundedUpAtoms: rounded up number of atoms based on unroll
duplicationFactor: ?
strheight: atom stream height
streamWidth: atom stream width
fstreamWidth: force stream width (output -- i-unroll)
posq: atom positions and charge
atomicRadii: atomic radii
scaledAtomicRadii: scaled atomic radii
bornForceFactor: (Born radii)**2*bornForce*Obcforce
bornForce1: i-unroll first force component, including dBorn_r/dr in .w
bornForce2: i-unroll second force component, including dBorn_r/dr in .w
bornForce3: i-unroll third force component, including dBorn_r/dr in .w
bornForce4: i-unroll fourth force component, including dBorn_r/dr in .w
#endif
--------------------------------------------------------------------------------------- */
kernel void kObcLoop2( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor,
float streamWidth, float fstreamWidth, float3 posq[][],
float2 scaledAtomicRadii[][],
float bornForceFactor[][],
out float4 bornForce1<>, out float4 bornForce2<>,
out float4 bornForce3<>, out float4 bornForce4<> ){
// ---------------------------------------------------------------------------------------
// atomic radii: radius - dielectric offset
float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii;
float j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii;
float4 iAtomicRadii, jAtomicRadii;
// scaled atomic radii: scaleFactor*( radius - dielectric offset )
float i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii;
float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii;
float4 iScaledAtomicRadii, jScaledAtomicRadii;
// (Born radii**2)*BornForce*ObcChainForce
float i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor;
float j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor;
//float4 iBornForceFactor;
//float4 jBornForceFactor;
// i,j coordinates
float3 i1pos, i2pos, i3pos, i4pos;
float3 j1pos, j2pos, j3pos, j4pos;
// delta coordinates
float3 d1, d2, d3, d4;
// indices
float2 iAtom;
float forceIndex;
//This is forceIndex mod numberOfAtoms, the true i index
float4 de, bornSum;
float iAtomLinearIndex, jLinind;
float2 jAtom;
float jEnd, jStart, jBlock;
float whichRep;
float tmp;
// ---------------------------------------------------------------------------------------
// constants
const float I_Unroll = 4.0f;
// ---------------------------------------------------------------------------------------
// set linear index
iAtom = indexof( bornForce1 );
forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms );
// ---------------------------------------------------------------------------------------
// set gather coordinates
iAtom.x = fmod( iAtomLinearIndex, streamWidth );
iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth );
// ---------------------------------------------------------------------------------------
// etch i1 position, Born prefactor, atomic radius
i1pos = posq[ iAtom ];
i1BornForceFactor = bornForceFactor[ iAtom ];
i1AtomicRadii = scaledAtomicRadii[ iAtom ].y;
i1ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x;
bornForce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
//bornForce1.xyz = float3( i1BornForceFactor, i1ScaledAtomicRadii, i1AtomicRadii );
// etch i2 position, Born prefactor, atomic radius
iAtom.x += 1;
i2pos = posq[ iAtom ];
i2BornForceFactor = bornForceFactor[ iAtom ];
i2AtomicRadii = scaledAtomicRadii[ iAtom ].y;
i2ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x;
bornForce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
//bornForce2.xyz = float3( i2BornForceFactor, i2ScaledAtomicRadii, i2AtomicRadii );
// etch i3 position, Born prefactor, atomic radius
iAtom.x += 1;
i3pos = posq[ iAtom ];
i3BornForceFactor = bornForceFactor[ iAtom ];
i3AtomicRadii = scaledAtomicRadii[ iAtom ].y;
i3ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x;
bornForce3 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
//bornForce3.xyz = float3( i3BornForceFactor, i3ScaledAtomicRadii, i3AtomicRadii );
// etch i4 position, Born prefactor, atomic radius
iAtom.x += 1;
i4pos = posq[ iAtom ];
i4BornForceFactor = bornForceFactor[ iAtom ];
i4AtomicRadii = scaledAtomicRadii[ iAtom ].y;
i4ScaledAtomicRadii = scaledAtomicRadii[ iAtom ].x;
bornForce4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
//bornForce4.xyz = float3( i4BornForceFactor, i4ScaledAtomicRadii, i4AtomicRadii );
// create float4 for main vars
//iBornForceFactor = float4( i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor );
iAtomicRadii = float4( i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii );
iScaledAtomicRadii = float4( i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii );
// inner loop indices setup
//changed the following instruction for rounding issues on some ASICs
//whichRep = floor( forceIndex / roundedUpAtoms );
tmp = fmod(forceIndex, roundedUpAtoms);
whichRep = round((forceIndex - tmp)/roundedUpAtoms);
jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
jStart = whichRep*jBlock;
jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
jAtom.y = jStart;
jLinind = jAtom.y*streamWidth;
// ---------------------------------------------------------------------------------------
while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){
jAtom.x = 0.0f;
while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){
// ---------------------------------------------------------------------------------------
// gather required values
j1BornForceFactor = bornForceFactor[ jAtom ];
j1AtomicRadii = scaledAtomicRadii[ jAtom ].y;
j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x;
j1pos = posq[ jAtom ];
jAtom.x += 1.0f;
j2BornForceFactor = bornForceFactor[ jAtom ];
j2AtomicRadii = scaledAtomicRadii[ jAtom ].y;
j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x;
j2pos = posq[ jAtom ];
jAtom.x += 1.0f;
j3BornForceFactor = bornForceFactor[ jAtom ];
j3AtomicRadii = scaledAtomicRadii[ jAtom ].y;
j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x;
j3pos = posq[ jAtom ];
jAtom.x += 1.0f;
j4BornForceFactor = bornForceFactor[ jAtom ];
j4AtomicRadii = scaledAtomicRadii[ jAtom ].y;
j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ].x;
j4pos = posq[ jAtom ];
jAtom.x += 1.0f;
//jBornForceFactor = float4( j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor );
jAtomicRadii = float4( j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii );
jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii );
// ---------------------------------------------------------------------------------------
// First set of 4 -- i == 1
d1 = j1pos - i1pos;
d2 = j2pos - i1pos;
d3 = j3pos - i1pos;
d4 = j4pos - i1pos;
// i = 1
loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de, bornSum );
bornForce1.xyz += de.x*d1;
bornForce1.xyz += de.y*d2;
bornForce1.xyz += de.z*d3;
bornForce1.xyz += de.w*d4;
bornForce1.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 2
d1 = j1pos - i2pos;
d2 = j2pos - i2pos;
d3 = j3pos - i2pos;
d4 = j4pos - i2pos;
loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de, bornSum );
bornForce2.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce2.xyz += de.z*d3;
bornForce2.xyz += de.w*d4;
bornForce2.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 3
d1 = j1pos - i3pos;
d2 = j2pos - i3pos;
d3 = j3pos - i3pos;
d4 = j4pos - i3pos;
loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i3BornForceFactor, i3AtomicRadii, de, bornSum );
bornForce3.xyz += de.x*d1;
bornForce3.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce3.xyz += de.w*d4;
bornForce3.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 4
d1 = j1pos - i4pos;
d2 = j2pos - i4pos;
d3 = j3pos - i4pos;
d4 = j4pos - i4pos;
loop2Internal( d1, d2, d3, d4, jScaledAtomicRadii, i4BornForceFactor, i4AtomicRadii, de, bornSum );
bornForce4.xyz += de.x*d1;
bornForce4.xyz += de.y*d2;
bornForce4.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
bornForce4.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// j = 1
d1 = j1pos - i1pos;
d2 = j1pos - i2pos;
d3 = j1pos - i3pos;
d4 = j1pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 2
d1 = j2pos - i1pos;
d2 = j2pos - i2pos;
d3 = j2pos - i3pos;
d4 = j2pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 3
d1 = j3pos - i1pos;
d2 = j3pos - i2pos;
d3 = j3pos - i3pos;
d4 = j3pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 4
d1 = j4pos - i1pos;
d2 = j4pos - i2pos;
d3 = j4pos - i3pos;
d4 = j4pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// increment linear index by 4
jLinind += 4.0f;
}
jAtom.y += 1.0f;
}
}
/* ---------------------------------------------------------------------------------------
Calculate Born radii and Obc chain derivative term
numberOfAtoms: no. of atoms
streamWidth: atom stream width
forces input forces
atomicRadii atomic radii
bornRadii output Born radii
obcChain output Obc chain derivative
--------------------------------------------------------------------------------------- */
kernel void kObcBornRadii( float numberOfAtoms, float streamWidth, float4 forces<>,
float atomicRadii<>, out float bornRadii<>, out float obcChain<> ){
// ---------------------------------------------------------------------------------------
float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
float2 iAtom;
float expPlus, expMinus;
// ---------------------------------------------------------------------------------------
// constants -- OBC Type II
const float alphaObc = 1.0f;
const float betaObc = 0.8f;
const float gammaObc = 4.85f;
const float dielectricOffset = 0.009f;
// ---------------------------------------------------------------------------------------
atomicRadiiOffset = atomicRadii - dielectricOffset;
bornSum = forces.w;
bornSum *= 0.5f*atomicRadiiOffset;
sum2 = bornSum*bornSum;
sum3 = bornSum*sum2;
// Tanh does not exist?
// calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
// tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 );
tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3;
expPlus = exp( tanhSum );
expMinus = 1.0f/expPlus;
tanhSum = ( expPlus - expMinus )/( expPlus + expMinus );
bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );
obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;
}
kernel float4 scalar_force_CDLJMerge( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2 ) {
float4 invr, invrsig2, invrsig6;
float4 f;
invr = rsqrt( r2 );
invrsig2 = invr * sig;
invrsig2 = invrsig2 * invrsig2;
invrsig6 = invrsig2 * invrsig2 * invrsig2;
f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6;
f += epsfac*qq*invr;
f *= invr*invr*2.39f;
return f;
}
/* ---------------------------------------------------------------------------------------
Calculate Loop 2 OBC forces (Simbios)
numberOfAtoms: no. of atoms
roundedUpAtoms: rounded up number of atoms based on unroll
duplicationFactor: ?
strheight: atom stream height
streamWidth: atom stream width
fstreamWidth: force stream width (output -- i-unroll)
posq: atom positions and charge
atomicRadii: atomic radii
scaledAtomicRadii: scaled atomic radii
bornForceFactor: (Born radii)**2*bornForce*Obcforce
bornForce1: i-unroll first force component, including dBorn_r/dr in .w
bornForce2: i-unroll second force component, including dBorn_r/dr in .w
bornForce3: i-unroll third force component, including dBorn_r/dr in .w
bornForce4: i-unroll fourth force component, including dBorn_r/dr in .w
#endif
--------------------------------------------------------------------------------------- */
kernel void kObcLoop2Cdlj4( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor,
float streamWidth, float fstreamWidth,
float jstreamWidth, float epsfac, float4 posq[][],
float atomicRadii[][], float scaledAtomicRadii[][],
float bornForceFactor[][], float4 isigeps[][],
float4 sigma[][], float4 epsilon[][], float excl[][],
out float4 bornForce1<>, out float4 bornForce2<>,
out float4 bornForce3<>, out float4 bornForce4<> ){
// ---------------------------------------------------------------------------------------
// atomic radii: radius - dielectric offset
float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii;
float j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii;
float4 iAtomicRadii, jAtomicRadii;
// scaled atomic radii: scaleFactor*( radius - dielectric offset )
float i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii;
float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii;
float4 iScaledAtomicRadii, jScaledAtomicRadii;
// (Born radii**2)*BornForce*ObcChainForce
float i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor;
float j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor;
//float4 iBornForceFactor;
// i,j coordinates
float3 i1pos, i2pos, i3pos, i4pos;
float3 j1pos, j2pos, j3pos, j4pos;
// delta coordinates
float3 d1, d2, d3, d4;
// indices
float2 iAtom;
float forceIndex;
float i1q, i2q, i3q, i4q;
float i1sig, i2sig, i3sig, i4sig;
float i1eps, i2eps, i3eps, i4eps;
float j1q, j2q, j3q, j4q;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2, r2Masked;
float3 pad;
float4 exclmask;
float2 iatom;
float2 jstrindex;
float linind;
float4 jsig, jeps;
float4 jQ,qq;
//This is forceIndex mod numberOfAtoms, the true i index
float4 de, fs, bornSum;
float iAtomLinearIndex, jLinind;
float2 jAtom;
float jEnd, jStart, jBlock;
float whichRep;
// ---------------------------------------------------------------------------------------
// constants
const float I_Unroll = 4.0f;
const float4 exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
// ---------------------------------------------------------------------------------------
// set linear index
iAtom = indexof( bornForce1 );
forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms );
// ---------------------------------------------------------------------------------------
// set gather coordinates
iAtom.x = fmod( iAtomLinearIndex, streamWidth );
iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth );
// ---------------------------------------------------------------------------------------
// fetch i1 position, Born prefactor, atomic radius
i1pos = posq[ iAtom ].xyz;
i1q = posq[ iAtom ].w;
i1BornForceFactor = bornForceFactor[ iAtom ];
i1AtomicRadii = atomicRadii[ iAtom ];
i1ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i1sig = jstrindex.x;
i1eps = jstrindex.y;
bornForce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i2 position, Born prefactor, atomic radius
iAtom.x += 1;
i2pos = posq[ iAtom ].xyz;
i2q = posq[ iAtom ].w;
i2BornForceFactor = bornForceFactor[ iAtom ];
i2AtomicRadii = atomicRadii[ iAtom ];
i2ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i2sig = jstrindex.x;
i2eps = jstrindex.y;
bornForce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i3 position, Born prefactor, atomic radius
iAtom.x += 1;
i3pos = posq[ iAtom ].xyz;
i3q = posq[ iAtom ].w;
i3BornForceFactor = bornForceFactor[ iAtom ];
i3AtomicRadii = atomicRadii[ iAtom ];
i3ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i3sig = jstrindex.x;
i3eps = jstrindex.y;
bornForce3 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i4 position, Born prefactor, atomic radius
iAtom.x += 1;
i4pos = posq[ iAtom ].xyz;
i4q = posq[ iAtom ].w;
i4BornForceFactor = bornForceFactor[ iAtom ];
i4AtomicRadii = atomicRadii[ iAtom ];
i4ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i4sig = jstrindex.x;
i4eps = jstrindex.y;
bornForce4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// create float4 for main vars
//iBornForceFactor = float4( i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor );
iAtomicRadii = float4( i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii );
iScaledAtomicRadii = float4( i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii );
// inner loop indices setup
whichRep = floor( forceIndex / roundedUpAtoms );
jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
jStart = whichRep*jBlock;
jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
jAtom.y = jStart;
jLinind = jAtom.y*streamWidth;
exclind.x = iAtomLinearIndex;
exclind.y = jStart*streamWidth/4.0f;
exclind.y = floor( exclind.y + 0.25f );
// ---------------------------------------------------------------------------------------
while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){
jAtom.x = 0.0f;
while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){
// ---------------------------------------------------------------------------------------
// gather required values
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
linind = (jAtom.x + jAtom.y*streamWidth)/4.0f;
linind = floor( linind + 0.25f );
jstrindex.y = round( (linind - fmod(linind, jstreamWidth))/jstreamWidth);
jstrindex.x = linind - jstrindex.y*jstreamWidth;
jstrindex.x = floor( jstrindex.x + 0.25f );
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
j1BornForceFactor = bornForceFactor[ jAtom ];
j1AtomicRadii = atomicRadii[ jAtom ];
j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j1pos = posq[ jAtom ].xyz;
j1q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j2BornForceFactor = bornForceFactor[ jAtom ];
j2AtomicRadii = atomicRadii[ jAtom ];
j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j2pos = posq[ jAtom ].xyz;
j2q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j3BornForceFactor = bornForceFactor[ jAtom ];
j3AtomicRadii = atomicRadii[ jAtom ];
j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j3pos = posq[ jAtom ].xyz;
j3q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j4BornForceFactor = bornForceFactor[ jAtom ];
j4AtomicRadii = atomicRadii[ jAtom ];
j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j4pos = posq[ jAtom ].xyz;
j4q = posq[ jAtom ].w;
jAtom.x += 1.0f;
//jBornForceFactor = float4( j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor );
jAtomicRadii = float4( j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii );
jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii );
jQ = float4( j1q, j2q, j3q, j4q );
// ---------------------------------------------------------------------------------------
// First set of 4 -- i == 1
sig = i1sig + jsig;
eps = i1eps * jeps;
qq = i1q*jQ;
d1 = j1pos - i1pos;
d2 = j2pos - i1pos;
d3 = j3pos - i1pos;
d4 = j4pos - i1pos;
// i = 1
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de, bornSum );
de += fs;
//de = fs;
bornForce1.xyz += de.x*d1;
bornForce1.xyz += de.y*d2;
bornForce1.xyz += de.z*d3;
bornForce1.xyz += de.w*d4;
bornForce1.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 2
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = i2sig + jsig;
eps = i2eps * jeps;
qq = i2q*jQ;
d1 = j1pos - i2pos;
d2 = j2pos - i2pos;
d3 = j3pos - i2pos;
d4 = j4pos - i2pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de, bornSum );
de += fs;
//de = fs;
bornForce2.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce2.xyz += de.z*d3;
bornForce2.xyz += de.w*d4;
bornForce2.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 3
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = i3sig + jsig;
eps = i3eps * jeps;
qq = i3q*jQ;
d1 = j1pos - i3pos;
d2 = j2pos - i3pos;
d3 = j3pos - i3pos;
d4 = j4pos - i3pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i3BornForceFactor, i3AtomicRadii, de, bornSum );
de += fs;
//de = fs;
bornForce3.xyz += de.x*d1;
bornForce3.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce3.xyz += de.w*d4;
bornForce3.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// i = 4
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
sig = i4sig + jsig;
eps = i4eps * jeps;
qq = i4q*jQ;
d1 = j1pos - i4pos;
d2 = j2pos - i4pos;
d3 = j3pos - i4pos;
d4 = j4pos - i4pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i4BornForceFactor, i4AtomicRadii, de, bornSum );
de += fs;
//de = fs;
bornForce4.xyz += de.x*d1;
bornForce4.xyz += de.y*d2;
bornForce4.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
bornForce4.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
// ---------------------------------------------------------------------------------------
// j = 1
d1 = j1pos - i1pos;
d2 = j1pos - i2pos;
d3 = j1pos - i3pos;
d4 = j1pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 2
d1 = j2pos - i1pos;
d2 = j2pos - i2pos;
d3 = j2pos - i3pos;
d4 = j2pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 3
d1 = j3pos - i1pos;
d2 = j3pos - i2pos;
d3 = j3pos - i3pos;
d4 = j3pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 4
d1 = j4pos - i1pos;
d2 = j4pos - i2pos;
d3 = j4pos - i3pos;
d4 = j4pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
//exclind.x -= 3.0f;
exclind.x = floor( exclind.x - 2.9f );
//exclind.y += 1.0f;
exclind.y = floor( exclind.y + 1.25f );
// increment linear index by 4
jLinind += 4.0f;
}
jAtom.y += 1.0f;
}
}
/* ---------------------------------------------------------------------------------------
Calculate Loop 2 OBC forces (Simbios)
numberOfAtoms: no. of atoms
roundedUpAtoms: rounded up number of atoms based on unroll
duplicationFactor: ?
strheight: atom stream height
streamWidth: atom stream width
fstreamWidth: force stream width (output -- i-unroll)
posq: atom positions and charge
atomicRadii: atomic radii
scaledAtomicRadii: scaled atomic radii
bornForceFactor: (Born radii)**2*bornForce*Obcforce
bornForce1: i-unroll first force component, including dBorn_r/dr in .w
bornForce2: i-unroll second force component, including dBorn_r/dr in .w
bornForce3: i-unroll third force component, including dBorn_r/dr in .w
bornForce4: i-unroll fourth force component, including dBorn_r/dr in .w
#endif
--------------------------------------------------------------------------------------- */
kernel void kObcLoop2Cdlj4X( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor,
float streamWidth, float fstreamWidth,
float jstreamWidth, float epsfac, float4 posq[][],
float atomicRadii[][], float scaledAtomicRadii[][],
float bornForceFactor[][], float4 isigeps[][],
float4 sigma[][], float4 epsilon[][], float excl[][],
out float4 bornForce1<>, out float4 bornForce2<>,
out float4 bornForce3<>, out float4 bornForce4<> ){
// ---------------------------------------------------------------------------------------
// atomic radii: radius - dielectric offset
float i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii;
float j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii;
float4 iAtomicRadii, jAtomicRadii;
// scaled atomic radii: scaleFactor*( radius - dielectric offset )
float i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii;
float j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii;
float4 iScaledAtomicRadii, jScaledAtomicRadii;
// (Born radii**2)*BornForce*ObcChainForce
float i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor;
float j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor;
//float4 iBornForceFactor;
//float4 jBornForceFactor;
// i,j coordinates
float3 i1pos, i2pos, i3pos, i4pos;
float3 j1pos, j2pos, j3pos, j4pos;
// delta coordinates
float3 d1, d2, d3, d4;
// indices
float2 iAtom;
float forceIndex;
float i1q, i2q, i3q, i4q;
float i1sig, i2sig, i3sig, i4sig;
float i1eps, i2eps, i3eps, i4eps;
float j1q, j2q, j3q, j4q;
float2 exclind;
float4 sig, eps;
float exclusions;
float4 r2, r2Masked;
float3 pad;
float4 exclmask;
float2 iatom;
float2 jstrindex;
float linind;
float4 jsig, jeps;
float4 jQ,qq;
//This is forceIndex mod numberOfAtoms, the true i index
float4 de, fs, bornSum;
float iAtomLinearIndex, jLinind;
float2 jAtom;
float jEnd, jStart, jBlock;
float whichRep;
float sum;
// ---------------------------------------------------------------------------------------
// constants
const float4 one4 = float4( 1.0f, 1.0f, 1.0f, 1.0f );
const float4 zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
const float4 frac4 = float4( 0.2f, 0.2f, 0.2f, 0.2f );
const float I_Unroll = 4.0f;
const float4 exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
// ---------------------------------------------------------------------------------------
// set linear index
iAtom = indexof( bornForce1 );
forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms );
// ---------------------------------------------------------------------------------------
// set gather coordinates
iAtom.x = fmod( iAtomLinearIndex, streamWidth );
iAtom.y = round( (iAtomLinearIndex - iAtom.x)/streamWidth );
// ---------------------------------------------------------------------------------------
// fetch i1 position, Born prefactor, atomic radius
i1pos = posq[ iAtom ].xyz;
i1q = posq[ iAtom ].w;
i1BornForceFactor = bornForceFactor[ iAtom ];
i1AtomicRadii = atomicRadii[ iAtom ];
i1ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i1sig = jstrindex.x;
i1eps = jstrindex.y;
bornForce1 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i2 position, Born prefactor, atomic radius
iAtom.x += 1;
i2pos = posq[ iAtom ].xyz;
i2q = posq[ iAtom ].w;
i2BornForceFactor = bornForceFactor[ iAtom ];
i2AtomicRadii = atomicRadii[ iAtom ];
i2ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i2sig = jstrindex.x;
i2eps = jstrindex.y;
bornForce2 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i3 position, Born prefactor, atomic radius
iAtom.x += 1;
i3pos = posq[ iAtom ].xyz;
i3q = posq[ iAtom ].w;
i3BornForceFactor = bornForceFactor[ iAtom ];
i3AtomicRadii = atomicRadii[ iAtom ];
i3ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i3sig = jstrindex.x;
i3eps = jstrindex.y;
bornForce3 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// fetch i4 position, Born prefactor, atomic radius
iAtom.x += 1;
i4pos = posq[ iAtom ].xyz;
i4q = posq[ iAtom ].w;
i4BornForceFactor = bornForceFactor[ iAtom ];
i4AtomicRadii = atomicRadii[ iAtom ];
i4ScaledAtomicRadii = scaledAtomicRadii[ iAtom ];
jstrindex = isigeps[ iAtom ];
i4sig = jstrindex.x;
i4eps = jstrindex.y;
bornForce4 = float4( 0.0f, 0.0f, 0.0f, 0.0f );
// create float4 for main vars
//iBornForceFactor = float4( i1BornForceFactor, i2BornForceFactor, i3BornForceFactor, i4BornForceFactor );
iAtomicRadii = float4( i1AtomicRadii, i2AtomicRadii, i3AtomicRadii, i4AtomicRadii );
iScaledAtomicRadii = float4( i1ScaledAtomicRadii, i2ScaledAtomicRadii, i3ScaledAtomicRadii, i4ScaledAtomicRadii );
// inner loop indices setup
whichRep = floor( forceIndex / roundedUpAtoms );
jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
jStart = whichRep*jBlock;
jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
jAtom.y = jStart;
jLinind = jAtom.y*streamWidth;
exclind.x = iAtomLinearIndex;
exclind.y = jStart*streamWidth/4.0f;
exclind.y = floor( exclind.y + 0.25f );
// ---------------------------------------------------------------------------------------
while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){
jAtom.x = 0.0f;
while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ){
// ---------------------------------------------------------------------------------------
// gather required values
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
linind = (jAtom.x + jAtom.y*streamWidth)/4.0f;
linind = floor( linind + 0.25f );
jstrindex.y = round( (linind - fmod(linind, jstreamWidth))/jstreamWidth);
jstrindex.x = linind - jstrindex.y*jstreamWidth;
jstrindex.x = floor( jstrindex.x + 0.25f );
jsig = sigma[ jstrindex ];
jeps = epsilon[ jstrindex ];
j1BornForceFactor = bornForceFactor[ jAtom ];
j1AtomicRadii = atomicRadii[ jAtom ];
j1ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j1pos = posq[ jAtom ].xyz;
j1q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j2BornForceFactor = bornForceFactor[ jAtom ];
j2AtomicRadii = atomicRadii[ jAtom ];
j2ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j2pos = posq[ jAtom ].xyz;
j2q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j3BornForceFactor = bornForceFactor[ jAtom ];
j3AtomicRadii = atomicRadii[ jAtom ];
j3ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j3pos = posq[ jAtom ].xyz;
j3q = posq[ jAtom ].w;
jAtom.x += 1.0f;
j4BornForceFactor = bornForceFactor[ jAtom ];
j4AtomicRadii = atomicRadii[ jAtom ];
j4ScaledAtomicRadii = scaledAtomicRadii[ jAtom ];
j4pos = posq[ jAtom ].xyz;
j4q = posq[ jAtom ].w;
jAtom.x += 1.0f;
//jBornForceFactor = float4( j1BornForceFactor, j2BornForceFactor, j3BornForceFactor, j4BornForceFactor );
jAtomicRadii = float4( j1AtomicRadii, j2AtomicRadii, j3AtomicRadii, j4AtomicRadii );
jScaledAtomicRadii = float4( j1ScaledAtomicRadii, j2ScaledAtomicRadii, j3ScaledAtomicRadii, j4ScaledAtomicRadii );
jQ = float4( j1q, j2q, j3q, j4q );
// ---------------------------------------------------------------------------------------
// First set of 4 -- i == 1
/*
sig = i1sig + jsig;
eps = i1eps * jeps;
qq = i1q*jQ;
d1 = j1pos - i1pos;
d2 = j2pos - i1pos;
d3 = j3pos - i1pos;
d4 = j4pos - i1pos;
// i = 1
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i1BornForceFactor, i1AtomicRadii, de, bornSum );
//de += fs;
de = fs;
bornForce1.xyz += de.x*d1;
bornForce1.xyz += de.y*d2;
bornForce1.xyz += de.z*d3;
bornForce1.xyz += de.w*d4;
bornForce1.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
*/
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x ) + floor( exclmask.y ) + floor( exclmask.z ) + floor( exclmask.w );
if( sum > 0.25f ){
//if( exclind.y < 4.9f && exclind.y > 3.1f ){
//if( exclind.y < 0.9f ){
//if( exclind.y < 1.9f && exclind.y > 0.1f ){
//if( exclind.y < 2.9f && exclind.y > 1.1f ){
//if( exclind.y < 5.9f && exclind.y > 4.1f ){
bornForce1.x += 1.0f;
bornForce1.y += exclusions;
bornForce1.z += sum;
bornForce1.w += exclmask.w;
//bornForce1 = exclmask;
}
// ---------------------------------------------------------------------------------------
// i = 2
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
//if( exclind.y < 5.9f && exclind.y > 4.1f ){
bornForce2.x += 1.0f;
bornForce2.y += exclusions;
bornForce2.z += sum;
bornForce2.w += exclind.y;
}
/*
sig = i2sig + jsig;
eps = i2eps * jeps;
qq = i2q*jQ;
d1 = j1pos - i2pos;
d2 = j2pos - i2pos;
d3 = j3pos - i2pos;
d4 = j4pos - i2pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i2BornForceFactor, i2AtomicRadii, de, bornSum );
//de += fs;
de = fs;
bornForce2.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce2.xyz += de.z*d3;
bornForce2.xyz += de.w*d4;
bornForce2.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
*/
// ---------------------------------------------------------------------------------------
// i = 3
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
//if( exclind.y < 5.9f && exclind.y > 4.1f ){
bornForce3.x += 1.0f;
bornForce3.y += exclusions;
bornForce3.z += sum;
bornForce3.w += exclind.y;
}
/*
sig = i3sig + jsig;
eps = i3eps * jeps;
qq = i3q*jQ;
d1 = j1pos - i3pos;
d2 = j2pos - i3pos;
d3 = j3pos - i3pos;
d4 = j4pos - i3pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i3BornForceFactor, i3AtomicRadii, de, bornSum );
//de += fs;
de = fs;
bornForce3.xyz += de.x*d1;
bornForce3.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce3.xyz += de.w*d4;
bornForce3.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
*/
// ---------------------------------------------------------------------------------------
// i = 4
exclind.x = floor( exclind.x + 1.2f );
exclusions = excl[ exclind ];
exclmask = fmod( exclusions, exclconst );
exclmask = exclmask > frac4 ? one4 : zero4;
sum = floor( exclmask.x + exclmask.y + exclmask.z + exclmask.w + 0.25f );
if( sum > 0.25f ){
//if( exclind.y < 5.9f && exclind.y > 4.1f ){
bornForce4.x += 1.0f;
bornForce4.y += exclusions;
bornForce4.z += sum;
bornForce4.w += exclind.y;
}
/*
sig = i4sig + jsig;
eps = i4eps * jeps;
qq = i4q*jQ;
d1 = j1pos - i4pos;
d2 = j2pos - i4pos;
d3 = j3pos - i4pos;
d4 = j4pos - i4pos;
r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );
r2Masked = r2 + exclmask*10000.0f;
fs = scalar_force_CDLJMerge( qq, epsfac, sig, eps, r2Masked );
loop2InternalR2( r2, jScaledAtomicRadii, i4BornForceFactor, i4AtomicRadii, de, bornSum );
//de += fs;
de = fs;
bornForce4.xyz += de.x*d1;
bornForce4.xyz += de.y*d2;
bornForce4.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
bornForce4.w += bornSum.x + bornSum.y + bornSum.z + bornSum.w;
*/
// ---------------------------------------------------------------------------------------
/*
// j = 1
d1 = j1pos - i1pos;
d2 = j1pos - i2pos;
d3 = j1pos - i3pos;
d4 = j1pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j1BornForceFactor, j1AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 2
d1 = j2pos - i1pos;
d2 = j2pos - i2pos;
d3 = j2pos - i3pos;
d4 = j2pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j2BornForceFactor, j2AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 3
d1 = j3pos - i1pos;
d2 = j3pos - i2pos;
d3 = j3pos - i3pos;
d4 = j3pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j3BornForceFactor, j3AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
// ---------------------------------------------------------------------------------------
// j = 4
d1 = j4pos - i1pos;
d2 = j4pos - i2pos;
d3 = j4pos - i3pos;
d4 = j4pos - i4pos;
// loop2Internal( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de, bornSum );
loop2InternalNoSum( d1, d2, d3, d4, iScaledAtomicRadii, j4BornForceFactor, j4AtomicRadii, de );
bornForce1.xyz += de.x*d1;
bornForce2.xyz += de.y*d2;
bornForce3.xyz += de.z*d3;
bornForce4.xyz += de.w*d4;
*/
// ---------------------------------------------------------------------------------------
//exclind.x -= 3.0f;
exclind.x = floor( exclind.x - 2.9f );
//exclind.y += 1.0f;
exclind.y = floor( exclind.y + 1.25f );
// increment linear index by 4
jLinind += 4.0f;
}
jAtom.y += 1.0f;
}
}
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Harmonic angles kernel
//Input is a stream of triplets i, j, k
//parms is float2( theta0, kA )
//Output is three streams of forces fi, fj, fk
//Again, this is kept simple for now, can be optimized
//later as necessary
kernel void kangles_harmonic(
float xstrwidth,
float3 atoms<>,
float2 parms<>,
float4 posq[][],
out float3 fi<>,
out float3 fj<>,
out float3 fk<>
) {
float theta;
float dx, dx2, fs;
float st, dvdt, cik, sth, nrkj2, nrij2, cii, ckk, costheta, sintheta;
float rij2, rkj2;
float2 idx;
float2 ai, aj, ak;
float3 xi, xj, xk, rij, rkj;
ai.y = floor( atoms.x / xstrwidth );
ai.x = atoms.x - ai.y * xstrwidth;
aj.y = floor( atoms.y / xstrwidth );
aj.x = atoms.y - aj.y * xstrwidth;
ak.y = floor( atoms.z / xstrwidth );
ak.x = atoms.z - ak.y * xstrwidth;
rij = posq[ ai ].xyz - posq[ aj ].xyz; //3
rkj = posq[ ak ].xyz - posq[ aj ].xyz; //3
rij2 = dot( rij, rij ); //5
rkj2 = dot( rkj, rkj ); //5
costheta = dot( rij, rkj ) / sqrt( rij2 * rkj2 ); //8
costheta = clamp( costheta, -1.0, 1.0 );
theta = acos( costheta ); //1 flop, ouch
sintheta = sqrt( 1 - costheta * costheta ); //3
dx = theta - parms.x; //1
dx2 = dx * dx; //1
/*scalar force = dv/dtheta*/
fs = -parms.y * dx; //1
st = fs / sintheta; //1
st = clamp( st, -1000000.0, 1000000.0 ); //Does this work on the gpu for st=inf?
sth = st * costheta; //1
nrkj2 = dot( rkj, rkj ); //5
nrij2 = dot( rij, rij ); //5
cik = st * rsqrt( nrkj2 * nrij2 ); //3
cii = sth / nrij2; //1
ckk = sth / nrkj2; //1
fi = -( cik * rkj - cii * rij ); //7
fk = -( cik * rij - ckk * rkj ); //7
fj = -fi - fk; //3
//Total flops: 64
}
#ifndef __KCOMMON_H__
#define __KCOMMON_H__
void kgetxyz (::brook::stream instr,
::brook::stream outstr);
void kzerof3 (::brook::stream outstr);
void kzerof4 (::brook::stream outstr);
void kzerof4 (::brook::stream outstr);
void ksetf4 (const float4 val, ::brook::stream outstr);
void kadd3( ::brook::stream instr, ::brook::stream outstr );
void ksetStr3( ::brook::stream instr, ::brook::stream outstr );
#endif // __KCOMMON_H__
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Kernel to set the xyz components of a float4 stream
//Used for changing the coordinates without changing
//the charges in strPosQ
kernel void ksetxyz( float3 instr<>, float4 before<>, out float4 after<> ) {
after.xyz = instr;
after.w = before.w;
}
//Inverse of above
kernel void kgetxyz( float4 instr<>, out float3 outstr<> ) {
outstr = instr.xyz;
}
//Zeroes out a stream
kernel void kzerof3( out float3 outstr<> ) {
outstr = float3( 0.0, 0.0, 0.0 );
}
//Zeros out a stream
kernel void kzerof4( out float4 outstr<> ) {
outstr = float4( 0.0, 0.0, 0.0, 0.0 );
}
kernel void ksetf4( float4 val, out float4 outstr<> ) {
outstr = val;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment