Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
cb130f92
Commit
cb130f92
authored
Sep 25, 2008
by
Mark Friedrichs
Browse files
Mods
parent
cc8b4de0
Changes
28
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
1559 additions
and
424 deletions
+1559
-424
platforms/brook/src/gpu/kgbsa1.br
platforms/brook/src/gpu/kgbsa1.br
+388
-0
platforms/brook/src/gpu/kmerge.br
platforms/brook/src/gpu/kmerge.br
+933
-0
platforms/brook/src/gpu/kmerge_partial_forces.br
platforms/brook/src/gpu/kmerge_partial_forces.br
+50
-0
platforms/brook/src/gpu/knlist.br
platforms/brook/src/gpu/knlist.br
+0
-237
platforms/brook/src/gpu/kpdihs.br
platforms/brook/src/gpu/kpdihs.br
+0
-87
platforms/brook/src/gpu/krbdihs.br
platforms/brook/src/gpu/krbdihs.br
+0
-100
platforms/brook/src/gpu/kshakeh.h
platforms/brook/src/gpu/kshakeh.h
+83
-0
platforms/brook/src/gpu/kupdatesd.h
platforms/brook/src/gpu/kupdatesd.h
+105
-0
No files found.
platforms/brook/src/gpu/kgbsa1.br
0 → 100644
View file @
cb130f92
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
*
* This kernel was developed in collaboration with
*
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
kernel void loop1Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jBornR,
                           float4 jQ, float iBornR, float iQ, out float4 dGpol_dr<>,
                           out float4 dGpol_dalpha2_ij<> ){

   // Pair terms for one i atom against four j atoms at once; lane k of every
   // float4 belongs to the j atom whose separation vector is d<k+1>.
   //
   //   d1..d4           : separation vectors to the four j atoms
   //   jBornR, jQ       : per-lane Born radii and charges of the j atoms
   //   iBornR, iQ       : Born radius and (already scaled) charge of the i atom
   //   dGpol_dr         : per-lane scalar the caller multiplies by d<k> to get a force
   //   dGpol_dalpha2_ij : per-lane Born-radius derivative term (already times jBornR)

   float4 distSq, bornProduct, scaledDistSq, gaussian, fGbSq, fGb, pairEnergy;

   // squared separations, one per lane
   distSq       = float4( dot( d1, d1 ), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );

   // alpha_i*alpha_j and r^2/(4*alpha_i*alpha_j)
   bornProduct  = jBornR*iBornR;
   scaledDistSq = distSq/( 4.0f*bornProduct );
   gaussian     = exp( -scaledDistSq );

   // r^2 + alpha_i*alpha_j*exp(-D) and its square root
   fGbSq        = distSq + bornProduct*gaussian;
   fGb          = sqrt( fGbSq );

   // qi*qj/sqrt(...); the division is done before scaling by iQ, as before
   pairEnergy   = ( jQ/fGb )*iQ;

   dGpol_dr         = -pairEnergy*( 1.0f - 0.25f*gaussian )/fGbSq;
   dGpol_dalpha2_ij = -0.5f*pairEnergy*gaussian*( 1.0f + scaledDistSq )*jBornR/fGbSq;
}
/* ---------------------------------------------------------------------------------------
Calculate nonpolar ACE term (Simbios)
bornRadius: Born radius
vdwRadius: Vdw radius
duplicationFactor: duplication factor
aceForce: ACE term
--------------------------------------------------------------------------------------- */
kernel void kAceNonPolarLoop1( float iBornRadius, float iVdwRadius, float duplicationFactor,
                               out float aceForce<> ){

   // Nonpolar (ACE) contribution for a single atom:
   //
   //   aceForce = (rVdw + probe)^2 * (rVdw/rBorn)^6 * PI_24_aI / (duplicationFactor*rBorn)
   //
   // The result is divided by duplicationFactor; presumably so that summing the
   // per-replica partial results counts the term once -- TODO confirm against caller.

   // solvent probe radius
   const float probeRadius = 0.14f;

   // -(PI*4*6*0.0054*1000), 0.0054 = asolv from Tinker
   // (an earlier value, -0.3694512961, was used before this one)
   const float PI_24_aI    = -407.1504079f;

   float radiusSum;
   float surfaceTerm;
   float radiusRatio;
   float ratioCubed;
   float ratioSixth;

   // (rVdw + probe)^2
   radiusSum   = iVdwRadius + probeRadius;
   surfaceTerm = radiusSum*radiusSum;

   // (rVdw/rBorn)^6, built as the square of the cube
   radiusRatio = iVdwRadius/iBornRadius;
   ratioCubed  = radiusRatio*radiusRatio*radiusRatio;
   ratioSixth  = ratioCubed*ratioCubed;

   aceForce = surfaceTerm*ratioSixth*PI_24_aI/(duplicationFactor*iBornRadius);
}
/* ---------------------------------------------------------------------------------------
Calculate first loop force terms (Simbios)
numberOfAtoms: no. of atoms
roundedUpAtoms: rounded up number of atoms -- accounts for unrolling
duplicationFactor: number of threads for inner loop
streamWidth: atom stream width
fstreamWidth: force stream width (output -- i-unroll)
soluteDielectric: solute dielectric
solventDielectric: solvent dielectric
includeAce: include ACE term
posq: atom positions and charge
bornRadii: Born radii
nonpolarForce: nonpolar force (0 if nonpolar not included, else
ACE value)
bornForce1: i-unroll first force component, including dBornR/dr in .w
bornForce2: i-unroll second force component, including dBornR/dr in .w
bornForce3: i-unroll third force component, including dBornR/dr in .w
bornForce4: i-unroll fourth force component, including dBornR/dr in .w
--------------------------------------------------------------------------------------- */
// First OBC loop: each output element handles four consecutive i atoms (i-unroll = 4)
// and accumulates, over a block of j atoms, the pair term dGpol_dr*d into .xyz and
// the Born-radius derivative term dGpol_dalpha2_ij into .w of bornForce1..4.
// The .w components start from the ACE nonpolar term when includeAce > 0.5, else 0.
// Positions are gathered from posq; charges come from atomicRadii.y and atomic
// radii from atomicRadii.x.
kernel void kObcLoop1( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor,
float streamWidth, float fstreamWidth, float soluteDielectric,
float solventDielectric, float includeAce,
float3 posq[][], float bornRadii[][], float2 atomicRadii[][],
out float4 bornForce1<>, out float4 bornForce2<>,
out float4 bornForce3<>, out float4 bornForce4<> ){
// ---------------------------------------------------------------------------------------
// Born radii
float i1BornR, i2BornR, i3BornR, i4BornR;
float j1BornR, j2BornR, j3BornR, j4BornR;
float4 jBornR;
// atomic radii
float i1AtomicR, i2AtomicR, i3AtomicR, i4AtomicR;
// i,j coordinates
float3 i1Pos, i2Pos, i3Pos, i4Pos;
float3 j1Pos, j2Pos, j3Pos, j4Pos;
// NOTE(review): j1PosQ..j4PosQ are declared but never used in this kernel
float4 j1PosQ, j2PosQ, j3PosQ, j4PosQ;
// i, j partial charges
float i1Q, i2Q, i3Q, i4Q;
float j1Q, j2Q, j3Q, j4Q;
float4 jQ;
float aceForce;
// delta coordinates
float3 d1, d2, d3, d4;
// intermediate terms
float4 dGpol_dr, dGpol_dalpha2_ij;
// indices
float2 iAtom;
float forceIndex;
// This is forceIndex mod numberOfAtoms, the true i index
float iAtomLinearIndex, jLinind;
float2 jAtom;
float jEnd, jStart, jBlock;
float whichRep;
float tmp;
// ---------------------------------------------------------------------------------------
// electricConstant = -166.02691;
// preFactor = 2.0f*electricConstant*( (1.0f/soluteDielectric) - (1.0f/solventDielectric) )
float preFactor = -332.05382f;
const float I_Unroll = 4.0f;
const float3 zero3 = float3( 0.0f, 0.0f, 0.0f );
// ---------------------------------------------------------------------------------------
preFactor *= ( (1.0f/soluteDielectric) - (1.0f/solventDielectric) );
// linear index of the first of the four i atoms covered by this output element;
// it may exceed roundedUpAtoms, in which case the quotient selects the j block below
iAtom = indexof( bornForce1 );
forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms );
// ---------------------------------------------------------------------------------------
// set gather index
// (row computed as (index - index mod width)/width and rounded, avoiding a bare floor of a quotient)
iAtom.x = fmod( iAtomLinearIndex, streamWidth );
iAtom.y = round( (iAtomLinearIndex - fmod(iAtomLinearIndex, streamWidth ))/streamWidth );
// ---------------------------------------------------------------------------------------
// fetch i1 position and partial charge
// NOTE(review): posq is declared float3 but is assigned to the float4 jQ here;
// only .xyz is read afterwards (charge is taken from atomicRadii.y) -- confirm
// the compiler accepts this as intended
jQ = posq[ iAtom ];
i1Pos = jQ.xyz;
i1Q = atomicRadii[ iAtom ].y;
i1Q *= preFactor;
i1BornR = bornRadii[ iAtom ];
i1AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i1BornR, i1AtomicR, duplicationFactor, aceForce );
bornForce1.xyz = zero3;
bornForce1.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// fetch i2 position and partial charge
// (stepping only .x assumes the four i atoms share a row, i.e. streamWidth is a
// multiple of 4 -- TODO confirm)
iAtom.x += 1;
jQ = posq[ iAtom ];
i2Pos = jQ.xyz;
i2Q = atomicRadii[ iAtom ].y;
i2Q *= preFactor;
i2BornR = bornRadii[ iAtom ];
i2AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i2BornR, i2AtomicR, duplicationFactor, aceForce );
bornForce2.xyz = zero3;
bornForce2.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// fetch i3 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i3Pos = jQ.xyz;
i3Q = atomicRadii[ iAtom ].y;
i3Q *= preFactor;
i3BornR = bornRadii[ iAtom ];
i3AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i3BornR, i3AtomicR, duplicationFactor, aceForce );
bornForce3.xyz = zero3;
bornForce3.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// fetch i4 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i4Pos = jQ.xyz;
i4Q = atomicRadii[ iAtom ].y;
i4Q *= preFactor;
i4BornR = bornRadii[ iAtom ];
i4AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i4BornR, i4AtomicR, duplicationFactor, aceForce );
bornForce4.xyz = zero3;
bornForce4.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// inner loop setup
// if dupFac == 4, I_UnRoll =2, then breaking inner loop into two segments
// to increase number of threads in flight
// forceStreamSz = N*RepFac/I_UnRoll
// forceIndex = I_UnRoll*( a.x + a.y*forceStreamSz )
// whichRep = 0 or 1
// jBlock = 1 + floor[ N/(duplicationFactor*streamWidth) ]
//changed the following instruction for rounding issues on some ASICs
//whichRep = floor( forceIndex / roundedUpAtoms );
tmp = fmod(forceIndex, roundedUpAtoms);
whichRep = round((forceIndex - tmp)/roundedUpAtoms);
// each replica handles jBlock rows of the j stream, starting at row whichRep*jBlock
jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
jStart = whichRep*jBlock;
// the last replica gets an effectively unbounded end row; its loop is stopped by
// the atom-count test in the loop conditions instead
jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
jAtom.y = jStart;
jLinind = jAtom.y*streamWidth;
// ---------------------------------------------------------------------------------------
// loop over j rows, then over j atoms four at a time; both loops stop once
// jLinind reaches numberOfAtoms
while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){
jAtom.x = 0.0f;
while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ) {
// ---------------------------------------------------------------------------------------
// gather required values
// (four consecutive j atoms are read even if fewer than four remain; presumably
// padded entries contribute nothing -- TODO confirm)
j1Pos = posq[ jAtom ];
j1Q = atomicRadii[ jAtom ].y;
j1BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j2Pos = posq[ jAtom ];
j2Q = atomicRadii[ jAtom ].y;
j2BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j3Pos = posq[ jAtom ];
j3Q = atomicRadii[ jAtom ].y;
j3BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j4Pos = posq[ jAtom ];
j4Q = atomicRadii[ jAtom ].y;
j4BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
// pack the four j atoms into float4 lanes for loop1Internal
jBornR = float4( j1BornR, j2BornR, j3BornR, j4BornR );
jQ = float4( j1Q, j2Q, j3Q, j4Q );
// ---------------------------------------------------------------------------------------
// i == 1
d1 = i1Pos - j1Pos;
d2 = i1Pos - j2Pos;
d3 = i1Pos - j3Pos;
d4 = i1Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i1BornR, i1Q, dGpol_dr, dGpol_dalpha2_ij );
bornForce1.xyz += dGpol_dr.x*d1;
bornForce1.xyz += dGpol_dr.y*d2;
bornForce1.xyz += dGpol_dr.z*d3;
bornForce1.xyz += dGpol_dr.w*d4;
bornForce1.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// i == 2
d1 = i2Pos - j1Pos;
d2 = i2Pos - j2Pos;
d3 = i2Pos - j3Pos;
d4 = i2Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i2BornR, i2Q, dGpol_dr, dGpol_dalpha2_ij );
bornForce2.xyz += dGpol_dr.x*d1;
bornForce2.xyz += dGpol_dr.y*d2;
bornForce2.xyz += dGpol_dr.z*d3;
bornForce2.xyz += dGpol_dr.w*d4;
bornForce2.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// i == 3
d1 = i3Pos - j1Pos;
d2 = i3Pos - j2Pos;
d3 = i3Pos - j3Pos;
d4 = i3Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i3BornR, i3Q, dGpol_dr, dGpol_dalpha2_ij );
bornForce3.xyz += dGpol_dr.x*d1;
bornForce3.xyz += dGpol_dr.y*d2;
bornForce3.xyz += dGpol_dr.z*d3;
bornForce3.xyz += dGpol_dr.w*d4;
bornForce3.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// i == 4
d1 = i4Pos - j1Pos;
d2 = i4Pos - j2Pos;
d3 = i4Pos - j3Pos;
d4 = i4Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i4BornR, i4Q, dGpol_dr, dGpol_dalpha2_ij );
bornForce4.xyz += dGpol_dr.x*d1;
bornForce4.xyz += dGpol_dr.y*d2;
bornForce4.xyz += dGpol_dr.z*d3;
bornForce4.xyz += dGpol_dr.w*d4;
bornForce4.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// four j atoms consumed this iteration
jLinind += 4.0f;
}
jAtom.y += 1.0f;
}
}
platforms/brook/src/gpu/kmerge.br
0 → 100644
View file @
cb130f92
This diff is collapsed.
Click to expand it.
platforms/brook/src/gpu/kmerge_partial_forces.br
View file @
cb130f92
...
@@ -99,3 +99,53 @@ kernel void kMergeFloat3_4(
...
@@ -99,3 +99,53 @@ kernel void kMergeFloat3_4(
}
}
}
}
// Sums, for each atom, the partial forces produced by `repfac` replicas of an
// i-unrolled force kernel. The four partial streams hold unroll lanes 0..3; the
// lane is picked with selects rather than if/else. `natoms` is not read.
kernel void kMergeFloat3_4_nobranch(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float3 pstream1[][],
      float3 pstream2[][],
      float3 pstream3[][],
      float3 pstream4[][],
      out float3 outstream<> )
{
   float  linearAtom, partialIndex, slot, lane;
   float2 gatherAt;
   float  rep;
   float3 lane0, lane1, lane2, lane3;
   float3 picked;

   // linear atom index of this output element
   gatherAt     = indexof( outstream );
   linearAtom   = gatherAt.x + gatherAt.y*atomStreamWidth;
   partialIndex = linearAtom;

   outstream    = float3( 0.0f, 0.0f, 0.0f );

   for( rep = 0.0f; rep < repfac; rep += 1.0f ){

      // slot = partialIndex div iUnroll, lane = partialIndex mod iUnroll
      slot = round( (partialIndex - fmod( partialIndex, iUnroll))/iUnroll );
      lane = partialIndex - iUnroll*slot;

      // 2D position of the slot in the partial-force streams
      gatherAt.y = round( (slot - fmod( slot, pStreamWidth ))/pStreamWidth );
      gatherAt.x = slot - gatherAt.y*pStreamWidth;

      lane0 = pstream1[ gatherAt ];
      lane1 = pstream2[ gatherAt ];
      lane2 = pstream3[ gatherAt ];
      lane3 = pstream4[ gatherAt ];

      // cascade of selects: lane 0 -> lane0, 1 -> lane1, 2 -> lane2, 3 -> lane3
      picked = lane < 0.5f ? lane0  : lane1;
      picked = lane < 1.5f ? picked : lane2;
      picked = lane < 2.5f ? picked : lane3;

      outstream += picked;

      // the same atom in the next replica sits roundNatoms further on
      partialIndex += roundNatoms;
   }
}
platforms/brook/src/gpu/knlist.br
deleted
100644 → 0
View file @
cc8b4de0
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/* Order N^2 neighbor searching.
*
* This only works for force fields that don't have charge groups.
* If you insist on charge groups, you'll have to pass in appropriate masks here.
*
* This is a simplified kernel, for testing the O(N) speeds.
*
* This does a complete N^2 search without considering groups of
* atoms. Most likely this will prove to be inefficient for
* the O(N) kernel. Lets find out.
*
*
* Each component of the curpass textures is an atom index. The w component
* of curpass3 is a count indicating how many j particles we have
* scanned for this particular i atom.
*
* */
// One pass of the brute-force neighbor search for the i atom at wpos.
// Scans j atoms in linear order starting just after prevpass3.w, skipping
// excluded pairs, and records the linear index of every j with r^2 < cutoff2
// into the 16 components curpass0.x .. curpass3.w. Unused slots stay at -1.
// When the 16th slot (curpass3.w) is filled the scan stops, so that value
// doubles as the resume point for the next pass; if it is still -1 the scan
// for this atom is complete and later passes (first < 0) do nothing.
kernel void knborsearch(
float first, //Positive means constructing the first 16.
iter float2 wpos<>, //pixel position of output
float AtomStrHeight,
float AtomStrWidth,
float cutoff2, //square of the cutoff
float natoms, //number of atoms
float excl[][], //exclusions in 1x1 format, 0 means not excluded, 1 means excluded.
float4 posq[][], //atom positions/charges
float4 prevpass3<>, //Last output texture of previous pass
out float4 curpass0<>, //First output of current pass
out float4 curpass1<>,
out float4 curpass2<>,
out float4 curpass3<> //Last output of current pass, used in next pass
){
/*For this kernel, wpos == iatom*/
float2 iind;
float2 jind;
float3 ipos, jpos, dr;
float r2;
float listptr; //Where in the 16-chunk are we now.
float jlinind;
float breakflag; //positive means keep looping, negative means stop
float4 exclconst;
float2 exclind;
float exclusions;
// NOTE(review): exclconst is assigned here but never read in this kernel
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
iind = wpos;
// exclusion table is gathered at (i linear index, j linear index)
exclind.x = iind.x + iind.y * AtomStrWidth;
//fetch i atom
ipos = posq[ iind ].xyz;
//Loop over j depending on prevpass
// resume one past the last j recorded by the previous pass
// (on the first pass this presumably relies on prevpass3.w being -1 -- TODO confirm)
jlinind = prevpass3.w + 1;
jind.y = floor( jlinind / AtomStrWidth );
jind.x = fmod( jlinind, AtomStrWidth );
exclind.y = jlinind;
//All outputs are initialized to -1 below
listptr = 0.0f;
breakflag = 1.0f;
//if we already finished, do nothing
if ( first < 0.0f && prevpass3.w < 0.0f )
breakflag = -1.0f;
//set to -1 to indicate no neighbor
//just to save a separate set of init calls
curpass0 = float4( -1.0f, -1.0f, -1.0f, -1.0f );
curpass1 = curpass0;
curpass2 = curpass0;
curpass3 = curpass0;
// natoms is not used as a bound: the scan covers the whole
// AtomStrHeight x AtomStrWidth stream
while ( jind.y < AtomStrHeight && breakflag > 0.0f ) {
while ( jind.x < AtomStrWidth && breakflag > 0.0f ) {
//First see if this pair is excluded
exclusions = excl[ exclind ];
if ( exclusions < 0.5f ) {
jpos = posq[ jind ].xyz;
dr = jpos - ipos;
r2 = dot( dr, dr );
//If it is inside the cutoff
if ( r2 < cutoff2 ) {
//Figure out where to put it
//We are allowed 4 nested conditionals
//We can play with the structuring of these
// listptr is a float counter, so each slot is tested against k + 0.5
if ( listptr < 0.5f )
curpass0.x = jlinind;
else if ( listptr < 1.5f )
curpass0.y = jlinind;
else if ( listptr < 2.5f )
curpass0.z = jlinind;
else if ( listptr < 3.5f )
curpass0.w = jlinind;
else if ( listptr < 4.5f )
curpass1.x = jlinind;
else if ( listptr < 5.5f )
curpass1.y = jlinind;
else if ( listptr < 6.5f )
curpass1.z = jlinind;
else if ( listptr < 7.5f )
curpass1.w = jlinind;
else if ( listptr < 8.5f )
curpass2.x = jlinind;
else if ( listptr < 9.5f )
curpass2.y = jlinind;
else if ( listptr < 10.5f )
curpass2.z = jlinind;
else if ( listptr < 11.5f )
curpass2.w = jlinind;
else if ( listptr < 12.5f )
curpass3.x = jlinind;
else if ( listptr < 13.5f )
curpass3.y = jlinind;
else if ( listptr < 14.5f ) {
curpass3.z = jlinind;
}
else if ( listptr < 15.5f ) {
//We're done for this pass
curpass3.w = jlinind;
breakflag = -1.0f;
}
listptr += 1.0f;
}
}
// advance to the next j atom; all three j indices move together
jlinind += 1.0f;
exclind.y += 1.0f;
jind.x += 1.0f;
}
// next row of the position stream
jind.x = 0.0f;
jind.y += 1.0f;
}
}
//Precomputes lennard jones sig and eps
//to save an indirect fetch (and a few flops) in the
//force kernel. The charge product is not done this way
//because charges have to be fetched anyway with the
//positions
// For the atom at wpos, combines its (sigma, epsilon) with those of the eight
// neighbors listed in nlist0 and nlist1: sigma values are added, epsilon values
// are multiplied. Neighbor entries are linear atom indices, converted to 2D
// gather positions with AtomStrWidth.
// NOTE(review): entries are not checked for a "no neighbor" marker before the
// gather -- confirm how callers treat those lanes.
kernel void knl_precompute_sigeps(
      float AtomStrWidth,
      iter float2 wpos<>,
      float2 sigeps[][], //x=sigma, y=epsilon
      float4 nlist0<>,
      float4 nlist1<>,
      out float4 sig0<>,
      out float4 eps0<>,
      out float4 sig1<>,
      out float4 eps1<>
      )
{
   float2 gatherAt;
   float4 rows, cols;
   float2 own, nbrA, nbrB, nbrC, nbrD;

   own = sigeps[ wpos ];

   // ---- first neighbor list ----
   // row = floor( index/width ), column = index - row*width, for all four lanes
   rows = floor( nlist0 / AtomStrWidth );
   cols = nlist0 - rows * AtomStrWidth;

   gatherAt.x = cols.x;
   gatherAt.y = rows.x;
   nbrA = sigeps[ gatherAt ];

   gatherAt.x = cols.y;
   gatherAt.y = rows.y;
   nbrB = sigeps[ gatherAt ];

   gatherAt.x = cols.z;
   gatherAt.y = rows.z;
   nbrC = sigeps[ gatherAt ];

   gatherAt.x = cols.w;
   gatherAt.y = rows.w;
   nbrD = sigeps[ gatherAt ];

   sig0 = float4( own.x + nbrA.x, own.x + nbrB.x, own.x + nbrC.x, own.x + nbrD.x );
   eps0 = float4( own.y * nbrA.y, own.y * nbrB.y, own.y * nbrC.y, own.y * nbrD.y );

   // ---- second neighbor list ----
   rows = floor( nlist1 / AtomStrWidth );
   cols = nlist1 - rows * AtomStrWidth;

   gatherAt.x = cols.x;
   gatherAt.y = rows.x;
   nbrA = sigeps[ gatherAt ];

   gatherAt.x = cols.y;
   gatherAt.y = rows.y;
   nbrB = sigeps[ gatherAt ];

   gatherAt.x = cols.z;
   gatherAt.y = rows.z;
   nbrC = sigeps[ gatherAt ];

   gatherAt.x = cols.w;
   gatherAt.y = rows.w;
   nbrD = sigeps[ gatherAt ];

   sig1 = float4( own.x + nbrA.x, own.x + nbrB.x, own.x + nbrC.x, own.x + nbrD.x );
   eps1 = float4( own.y * nbrA.y, own.y * nbrB.y, own.y * nbrC.y, own.y * nbrD.y );
}
platforms/brook/src/gpu/kpdihs.br
deleted
100644 → 0
View file @
cc8b4de0
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Proper (periodic) dihedrals, needed for Amber/OPLS ff's
//
//Input is a stream of quartets i, j, k, l and the output is
//four float3 streams fi, fj, fk, fl.
//If by any chance this kernel becomes the bottleneck, we will
//optimize, but for now, this is kept pretty simple.
//To keep things streaming, we have a stream of parameters (one float4:
//cp, phi0, mult, unused) for each dihedral.
// Forces for one periodic dihedral i-j-k-l per stream element.
//   xstrwidth : width of the xq stream, used to turn linear atom indices into 2D
//   xq        : gathered positions (.xyz)
//   atoms     : linear indices of atoms i, j, k, l in .x, .y, .z, .w
//   parms     : ( cp, phi0, mult, unused )
//   fi..fl    : forces on the four atoms
// The angle derivative used is ddphi = -cp*mult*sin( mult*phi - phi0 ).
// NOTE(review): msq, nsq and nrkj2 are used as divisors without a guard, so
// collinear atoms would produce non-finite forces -- confirm inputs exclude that.
kernel void kpdih(
      float xstrwidth, //stream width for x
      float4 xq[][], //particle coordinates and charges
      float4 atoms<>, //ijkl quartets
      float4 parms<>, //parms = ( cp, phi0, mult, 0.0 )
      out float3 fi<>, //output forces for i, j, k, l
      out float3 fj<>,
      out float3 fk<>,
      out float3 fl<>
      ) {
   float3 r_ij, r_kj, r_kl;
   float2 ai, aj, ak, al;
   float3 m, n;
   float sgnphi;
   float phi, ddphi, mdphi;
   float3 u, v, s;
   float nrkj, nrkj2, msq, nsq, cos_phi;

   //Convert from linear indices to 2D indices into x
   //If this kernel is compute bound, we can do this
   //conversion before-hand and feed in the 2D coordinates
   ai.y = floor( atoms.x / xstrwidth );
   ai.x = atoms.x - ai.y * xstrwidth;
   aj.y = floor( atoms.y / xstrwidth );
   aj.x = atoms.y - aj.y * xstrwidth;
   ak.y = floor( atoms.z / xstrwidth );
   ak.x = atoms.z - ak.y * xstrwidth;
   al.y = floor( atoms.w / xstrwidth );
   al.x = atoms.w - al.y * xstrwidth;

   //Bond vectors
   r_ij = xq[ai].xyz - xq[aj].xyz; //3
   r_kj = xq[ak].xyz - xq[aj].xyz; //3
   r_kl = xq[ak].xyz - xq[al].xyz; //3

   //Normals of the i-j-k and j-k-l planes
   m = cross( r_ij, r_kj ); //9
   n = cross( r_kj, r_kl ); //9
   msq = dot(m, m); //5
   nsq = dot(n, n); //5

   //Clamp so rounding cannot push the cosine outside acos' domain
   cos_phi = clamp( dot(m, n)/sqrt(msq*nsq), -1.0f, 1.0f ); //8
   sgnphi = sign( dot( r_ij, n ) ); //5
   phi = sgnphi * acos( cos_phi ); //2

   mdphi = parms.z * phi - parms.y; //2
   ddphi = - parms.x * parms.z * sin( mdphi ); //3

   nrkj2 = dot( r_kj, r_kj ); //5
   nrkj = sqrt( nrkj2 ); //1

   //End-atom forces act along the plane normals
   fi = -ddphi * nrkj / msq * m; //5
   fl = ddphi * nrkj / nsq * n; //5

   //Middle-atom forces; fi + fj + fk + fl sums to zero
   u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
   v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
   s = u - v; //3
   fj = s - fi; //3
   fk = -(s + fl); //3
   //Total : 100 flops
}
platforms/brook/src/gpu/krbdihs.br
deleted
100644 → 0
View file @
cc8b4de0
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Ryckaert-Bellemans dihedrals, needed for Amber/OPLS ff's
//
//Input is a stream of quartets i, j, k, l and the output is
//four float3 streams fi, fj, fk, fl.
//If by any chance this kernel becomes the bottleneck, we will
//optimize, but for now, this is kept pretty simple.
//To keep things streaming, we have a stream of 6 parameters(a float4 and float2)
//for each dihedral.
// Forces for one Ryckaert-Bellemans dihedral i-j-k-l per stream element.
//   xstrwidth      : width of the xq stream, used to turn linear atom indices into 2D
//   xq             : gathered positions (.xyz)
//   atoms          : linear indices of atoms i, j, k, l in .x, .y, .z, .w
//   parm03, parm45 : the six polynomial coefficients (parm03.x is not read here,
//                    since the constant term drops out of the derivative)
//   fi..fl         : forces on the four atoms
// NOTE(review): msq, nsq and nrkj2 are used as divisors without a guard, so
// collinear atoms would produce non-finite forces -- confirm inputs exclude that.
kernel void krbdih(
      float xstrwidth, //stream width for x
      float4 xq[][], //particle coordinates and charges
      float4 atoms<>, //ijkl quartets
      float4 parm03<>, //params 0-3
      float2 parm45<>, //params 4 and 5
      out float3 fi<>, //output forces for i, j, k, l
      out float3 fj<>,
      out float3 fk<>,
      out float3 fl<>
      ) {
   float3 r_ij, r_kj, r_kl;
   float2 ai, aj, ak, al;
   float3 m, n;
   float sgnphi;
   float ddphi;
   float3 u, v, s;
   float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi;

   //Convert from linear indices to 2D indices into x
   //If this kernel is compute bound, we can do this
   //conversion before-hand and feed in the 2D coordinates
   ai.y = floor( atoms.x / xstrwidth );
   ai.x = atoms.x - ai.y * xstrwidth;
   aj.y = floor( atoms.y / xstrwidth );
   aj.x = atoms.y - aj.y * xstrwidth;
   ak.y = floor( atoms.z / xstrwidth );
   ak.x = atoms.z - ak.y * xstrwidth;
   al.y = floor( atoms.w / xstrwidth );
   al.x = atoms.w - al.y * xstrwidth;

   //Bond vectors
   r_ij = xq[ai].xyz - xq[aj].xyz; //3
   r_kj = xq[ak].xyz - xq[aj].xyz; //3
   r_kl = xq[ak].xyz - xq[al].xyz; //3

   //Normals of the i-j-k and j-k-l planes
   m = cross( r_ij, r_kj ); //9
   n = cross( r_kj, r_kl ); //9
   msq = dot(m, m); //5
   nsq = dot(n, n); //5
   cos_phi = dot(m, n)/sqrt(msq*nsq); //8 (sqrt=1)

   //Switching to "polymer convention"
   //See gromacs code
   cos_phi = -cos_phi;
   sgnphi = sign( dot(r_ij, n) ); //5
   //Clamp keeps the sqrt argument non-negative under rounding
   sin_phi = -sgnphi*sqrt( clamp( 1.0f - cos_phi * cos_phi, 0.0f, 1.0f) ); //3

   //ddphi is basically sum_{i=1}^5 i parm_i cosphi^{i-1}
   //This might not be the best way to use the
   //4-way mads, but for now we'll let fxc figure it
   //out.
   //If we precompute some ratios of the parameters
   //we can use the 4-way mads better
   ddphi = 5.0f * parm45.y;
   ddphi = 4.0f * parm45.x + ddphi * cos_phi;
   ddphi = 3.0f * parm03.w + ddphi * cos_phi;
   ddphi = 2.0f * parm03.z + ddphi * cos_phi;
   ddphi = parm03.y + ddphi * cos_phi;
   ddphi = -ddphi * sin_phi; //13 flops total for ddphi

   nrkj2 = dot( r_kj, r_kj ); //5
   nrkj = sqrt( nrkj2 ); //1

   //End-atom forces act along the plane normals
   fi = -ddphi * nrkj / msq * m; //5
   fl = ddphi * nrkj / nsq * n; //5

   //Middle-atom forces; fi + fj + fk + fl sums to zero
   u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
   v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
   s = u - v; //3
   fj = s - fi; //3
   fk = -(s + fl); //3
   //Total flops: 109 per rb torsion.
}
platforms/brook/src/gpu/kshakeh.h
0 → 100644
View file @
cb130f92
/*
 * Entry point for the kshakeh_fix1 kernel: four scalar settings followed by
 * eight Brook streams. Presumably the brcc-generated wrapper for the .br
 * kernel of the same name; which streams are outputs is not visible in this
 * header -- see the kernel source.
 */
void kshakeh_fix1( const float nit,
                   const float strwidth,
                   const float invmH,
                   const float omega,
                   ::brook::stream atoms,
                   ::brook::stream posq,
                   ::brook::stream posqp,
                   ::brook::stream params,
                   ::brook::stream cposq0,
                   ::brook::stream cposq1,
                   ::brook::stream cposq2,
                   ::brook::stream cposq3 );
/*
 * Entry point for the kshakeh_fix2 kernel; same argument list as
 * kshakeh_fix1 (four scalars, eight Brook streams). Which streams are
 * outputs is not visible in this header -- see the kernel source.
 */
void kshakeh_fix2( const float nit,
                   const float strwidth,
                   const float invmH,
                   const float omega,
                   ::brook::stream atoms,
                   ::brook::stream posq,
                   ::brook::stream posqp,
                   ::brook::stream params,
                   ::brook::stream cposq0,
                   ::brook::stream cposq1,
                   ::brook::stream cposq2,
                   ::brook::stream cposq3 );
/*
 * Entry point for the kshakeh_update kernel: the stream width followed by
 * seven Brook streams. Which streams are outputs is not visible in this
 * header (oposq is presumably the result) -- see the kernel source.
 */
void kshakeh_update( const float strwidth,
                     ::brook::stream invmap,
                     ::brook::stream posq,
                     ::brook::stream cposq0,
                     ::brook::stream cposq1,
                     ::brook::stream cposq2,
                     ::brook::stream cposq3,
                     ::brook::stream oposq );
/*
 * Entry point for the kshakeh kernel: four scalar settings followed by eight
 * Brook streams (same argument list as kshakeh_fix1). Which streams are
 * outputs is not visible in this header -- see the kernel source.
 */
void kshakeh( const float nit,
              const float strwidth,
              const float invmH,
              const float omega,
              ::brook::stream atoms,
              ::brook::stream posq,
              ::brook::stream posqp,
              ::brook::stream params,
              ::brook::stream cposq0,
              ::brook::stream cposq1,
              ::brook::stream cposq2,
              ::brook::stream cposq3 );
/*
 * Entry point for the kshakeh_update1_fix1 kernel: two scalars (stream width
 * and sdpc1) followed by nine Brook streams. Which streams are outputs is not
 * visible in this header (oposq is presumably the result) -- see the kernel
 * source.
 */
void kshakeh_update1_fix1( const float strwidth,
                           const float sdpc1,
                           ::brook::stream invmap,
                           ::brook::stream posq,
                           ::brook::stream posqp,
                           ::brook::stream vPrime,
                           ::brook::stream cposq0,
                           ::brook::stream cposq1,
                           ::brook::stream cposq2,
                           ::brook::stream cposq3,
                           ::brook::stream oposq );
/*
 * Entry point for the kshakeh_update1_fix1Old kernel: the stream width
 * followed by seven Brook streams (same argument list as kshakeh_update).
 * Which streams are outputs is not visible in this header -- see the kernel
 * source.
 */
void kshakeh_update1_fix1Old( const float strwidth,
                              ::brook::stream invmap,
                              ::brook::stream posq,
                              ::brook::stream cposq0,
                              ::brook::stream cposq1,
                              ::brook::stream cposq2,
                              ::brook::stream cposq3,
                              ::brook::stream oposq );
/*
 * Entry point for the kshakeh_update2_fix1 kernel: the stream width followed
 * by eight Brook streams. Which streams are outputs is not visible in this
 * header (oposq is presumably the result) -- see the kernel source.
 */
void kshakeh_update2_fix1( const float strwidth,
                           ::brook::stream invmap,
                           ::brook::stream posq,
                           ::brook::stream posqp,
                           ::brook::stream cposq0,
                           ::brook::stream cposq1,
                           ::brook::stream cposq2,
                           ::brook::stream cposq3,
                           ::brook::stream oposq );
platforms/brook/src/gpu/kupdatesd.h
0 → 100644
View file @
cb130f92
/*
 * Entry point for the kupdate_sd1 kernel: seven scalar settings followed by
 * ten Brook streams. Which streams are outputs is not visible in this header
 * -- see the kernel source.
 */
void kupdate_sd1( const float xstrwidth,
                  const float gstrwidth,
                  const float goffset,
                  const float cem,
                  const float pc1,
                  const float pc2,
                  const float pc3,
                  ::brook::stream sdpc,
                  ::brook::stream fgauss,
                  ::brook::stream sd2X,
                  ::brook::stream posq,
                  ::brook::stream f,
                  ::brook::stream v,
                  ::brook::stream invmass,
                  ::brook::stream sd1V,
                  ::brook::stream vnew,
                  ::brook::stream posqp );
/*
 * Entry point for the kupdate_sd2 kernel: five scalar settings followed by
 * nine Brook streams. Which streams are outputs is not visible in this header
 * -- see the kernel source.
 */
void kupdate_sd2( const float xstrwidth,
                  const float gstrwidth,
                  const float goffset,
                  const float pc1,
                  const float pc2,
                  ::brook::stream sdpc,
                  ::brook::stream fgauss,
                  ::brook::stream sd1V,
                  ::brook::stream posq,
                  ::brook::stream posqp,
                  ::brook::stream vnew,
                  ::brook::stream sd2X,
                  ::brook::stream v,
                  ::brook::stream posqp2 );
/*
 * Entry point for the kpermute_vectors kernel: a stream width followed by
 * three Brook streams. Going by the names, gvout is presumably gvin reordered
 * by perm -- confirm against the kernel source.
 */
void kpermute_vectors( const float gstrwidth,
                       ::brook::stream perm,
                       ::brook::stream gvin,
                       ::brook::stream gvout );
/*
 * Entry point for the kupdate_sd2_fix1 kernel; same argument list as
 * kupdate_sd2 (five scalars, nine Brook streams). Which streams are outputs
 * is not visible in this header -- see the kernel source.
 */
void kupdate_sd2_fix1( const float xstrwidth,
                       const float gstrwidth,
                       const float goffset,
                       const float pc1,
                       const float pc2,
                       ::brook::stream sdpc,
                       ::brook::stream fgauss,
                       ::brook::stream sd1V,
                       ::brook::stream posq,
                       ::brook::stream posqp,
                       ::brook::stream vnew,
                       ::brook::stream sd2X,
                       ::brook::stream v,
                       ::brook::stream posqp2 );
/*
 * Entry point for the kupdate_sd1_fix1 kernel; same argument list as
 * kupdate_sd1 (seven scalars, ten Brook streams). Which streams are outputs
 * is not visible in this header -- see the kernel source.
 */
void kupdate_sd1_fix1( const float xstrwidth,
                       const float gstrwidth,
                       const float goffset,
                       const float cem,
                       const float pc1,
                       const float pc2,
                       const float pc3,
                       ::brook::stream sdpc,
                       ::brook::stream fgauss,
                       ::brook::stream sd2X,
                       ::brook::stream posq,
                       ::brook::stream f,
                       ::brook::stream v,
                       ::brook::stream invmass,
                       ::brook::stream sd1V,
                       ::brook::stream vnew,
                       ::brook::stream posqp );
/*
 * Entry point for the kupdate_sd2_fix1_FixedRV kernel; same argument list as
 * kupdate_sd2 (five scalars, nine Brook streams). Which streams are outputs
 * is not visible in this header -- see the kernel source.
 */
void kupdate_sd2_fix1_FixedRV( const float xstrwidth,
                               const float gstrwidth,
                               const float goffset,
                               const float pc1,
                               const float pc2,
                               ::brook::stream sdpc,
                               ::brook::stream fgauss,
                               ::brook::stream sd1V,
                               ::brook::stream posq,
                               ::brook::stream posqp,
                               ::brook::stream vnew,
                               ::brook::stream sd2X,
                               ::brook::stream v,
                               ::brook::stream posqp2 );
/*
 * Entry point for the kupdate_sd1_fix1_FixedRV kernel; same argument list as
 * kupdate_sd1 (seven scalars, ten Brook streams). Which streams are outputs
 * is not visible in this header -- see the kernel source.
 */
void kupdate_sd1_fix1_FixedRV( const float xstrwidth,
                               const float gstrwidth,
                               const float goffset,
                               const float cem,
                               const float pc1,
                               const float pc2,
                               const float pc3,
                               ::brook::stream sdpc,
                               ::brook::stream fgauss,
                               ::brook::stream sd2X,
                               ::brook::stream posq,
                               ::brook::stream f,
                               ::brook::stream v,
                               ::brook::stream invmass,
                               ::brook::stream sd1V,
                               ::brook::stream vnew,
                               ::brook::stream posqp );
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment