Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
openmm
Commits
cb130f92
Commit
cb130f92
authored
Sep 25, 2008
by
Mark Friedrichs
Browse files
Mods
parent
cc8b4de0
Changes
28
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
1559 additions
and
424 deletions
+1559
-424
platforms/brook/src/gpu/kgbsa1.br
platforms/brook/src/gpu/kgbsa1.br
+388
-0
platforms/brook/src/gpu/kmerge.br
platforms/brook/src/gpu/kmerge.br
+933
-0
platforms/brook/src/gpu/kmerge_partial_forces.br
platforms/brook/src/gpu/kmerge_partial_forces.br
+50
-0
platforms/brook/src/gpu/knlist.br
platforms/brook/src/gpu/knlist.br
+0
-237
platforms/brook/src/gpu/kpdihs.br
platforms/brook/src/gpu/kpdihs.br
+0
-87
platforms/brook/src/gpu/krbdihs.br
platforms/brook/src/gpu/krbdihs.br
+0
-100
platforms/brook/src/gpu/kshakeh.h
platforms/brook/src/gpu/kshakeh.h
+83
-0
platforms/brook/src/gpu/kupdatesd.h
platforms/brook/src/gpu/kupdatesd.h
+105
-0
No files found.
platforms/brook/src/gpu/kgbsa1.br
0 → 100644
View file @
cb130f92
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
*
* This kernel was developed in collaboration with
*
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
kernel void loop1Internal( float3 d1, float3 d2, float3 d3, float3 d4, float4 jBornR,
                           float4 jQ, float iBornR, float iQ, out float4 dGpol_dr<>,
                           out float4 dGpol_dalpha2_ij<> ){

   // Evaluates four i-j pair terms at once; component k of every float4
   // belongs to the pair whose separation vector is d<k+1>.
   //
   //    d1..d4:            separation vectors between atom i and the four j atoms
   //    jBornR, jQ:        per-j values packed into one float4 each
   //    iBornR, iQ:        values for atom i
   //    dGpol_dr:          (out) scalar factor the caller multiplies by d<k>
   //    dGpol_dalpha2_ij:  (out) term the caller sums into the .w accumulator

   float4 distSq, radiiProduct, scaledDistSq, decay, denomSq, pairTerm;

   // squared separations, one per j atom
   distSq           = float4( dot( d1, d1 ), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) );

   radiiProduct     = jBornR*iBornR;
   scaledDistSq     = distSq/( 4.0f*radiiProduct );
   decay            = exp( -scaledDistSq );
   denomSq          = distSq + radiiProduct*decay;

   // charge product over the square root of the denominator
   pairTerm         = ( jQ/sqrt( denomSq ) )*iQ;

   dGpol_dr         = -pairTerm*( 1.0f - 0.25f*decay )/denomSq;
   dGpol_dalpha2_ij = -0.5f*pairTerm*decay*( 1.0f + scaledDistSq )*jBornR/denomSq;
}
/* ---------------------------------------------------------------------------------------
Calculate nonpolar ACE term (Simbios)
bornRadius: Born radius
vdwRadius: Vdw radius
duplicationFactor: duplication factor
aceForce: ACE term
--------------------------------------------------------------------------------------- */
kernel void kAceNonPolarLoop1( float iBornRadius, float iVdwRadius, float duplicationFactor,
                               out float aceForce<> ){

   // Nonpolar ACE contribution for a single atom:
   //
   //    aceForce = (rVdw + probe)^2 * (rVdw/rBorn)^6 * PI_24_aI / (duplicationFactor*rBorn)

   // solvent probe radius
   const float probeRadius = 0.14f;

   // -(PI*4*6*0.0054*1000), where 0.0054 = asolv from Tinker
   // (an earlier, unscaled value of -0.3694512961 is no longer used)
   const float PI_24_aI    = -407.1504079f;

   float probedRadius, radiusRatio, ratioCubed;

   probedRadius = iVdwRadius + probeRadius;

   // (rVdw/rBorn)^6 is built as the square of the cube
   radiusRatio  = iVdwRadius/iBornRadius;
   ratioCubed   = radiusRatio*radiusRatio*radiusRatio;

   aceForce     = ( probedRadius*probedRadius )*( ratioCubed*ratioCubed )*PI_24_aI/( duplicationFactor*iBornRadius );
}
/* ---------------------------------------------------------------------------------------
Calculate first loop force terms (Simbios)
numberOfAtoms: no. of atoms
roundedUpAtoms: rounded up number of atoms -- accounts for unrolling
duplicationFactor: number of threads for inner loop
streamWidth: atom stream width
fstreamWidth: force stream width (output -- i-unroll)
soluteDielectric: solute dielectric
solventDielectric: solvent dielectric
includeAce: include ACE term
posq: atom positions (the kernel reads the charge from atomicRadii.y)
bornRadii: Born radii
atomicRadii: atomic radius in .x, partial charge in .y
bornForce1: output for the first i-unroll atom: force in .xyz; .w starts at the
ACE value (0 if includeAce is not set) and accumulates the dGpol_dalpha2_ij terms
bornForce2: same as bornForce1, for the second i-unroll atom
bornForce3: same as bornForce1, for the third i-unroll atom
bornForce4: same as bornForce1, for the fourth i-unroll atom
--------------------------------------------------------------------------------------- */
kernel void kObcLoop1( float numberOfAtoms, float roundedUpAtoms, float duplicationFactor,
float streamWidth, float fstreamWidth, float soluteDielectric,
float solventDielectric, float includeAce,
float3 posq[][], float bornRadii[][], float2 atomicRadii[][],
out float4 bornForce1<>, out float4 bornForce2<>,
out float4 bornForce3<>, out float4 bornForce4<> ){
// ---------------------------------------------------------------------------------------
// Each output element handles four consecutive i atoms (I_Unroll = 4), one per
// bornForceN output, and accumulates their interactions with one block of j atoms.
// The j atoms are processed four at a time through loop1Internal().
// For every i atom: .xyz accumulates dGpol_dr*d, .w accumulates dGpol_dalpha2_ij
// on top of the optional ACE seed value.
// ---------------------------------------------------------------------------------------
// Born radii
float i1BornR, i2BornR, i3BornR, i4BornR;
float j1BornR, j2BornR, j3BornR, j4BornR;
float4 jBornR;
// atomic radii
float i1AtomicR, i2AtomicR, i3AtomicR, i4AtomicR;
// i,j coordinates
float3 i1Pos, i2Pos, i3Pos, i4Pos;
float3 j1Pos, j2Pos, j3Pos, j4Pos;
// NOTE(review): j1PosQ..j4PosQ are declared but never referenced in this kernel
float4 j1PosQ, j2PosQ, j3PosQ, j4PosQ;
// i, j partial charges
float i1Q, i2Q, i3Q, i4Q;
float j1Q, j2Q, j3Q, j4Q;
float4 jQ;
float aceForce;
// delta coordinates
float3 d1, d2, d3, d4;
// intermediate terms
float4 dGpol_dr, dGpol_dalpha2_ij;
// indices
float2 iAtom;
float forceIndex;
// This is forceIndex mod roundedUpAtoms, the true i index
float iAtomLinearIndex, jLinind;
float2 jAtom;
float jEnd, jStart, jBlock;
float whichRep;
float tmp;
// ---------------------------------------------------------------------------------------
// electricConstant = -166.02691
// preFactor starts as 2.0*electricConstant and is scaled below by
// (1/soluteDielectric - 1/solventDielectric)
float preFactor = -332.05382f;
const float I_Unroll = 4.0f;
const float3 zero3 = float3( 0.0f, 0.0f, 0.0f );
// ---------------------------------------------------------------------------------------
preFactor *= ( (1.0f/soluteDielectric) - (1.0f/solventDielectric) );
// linear position of this output element, scaled by the i-unroll factor
iAtom = indexof( bornForce1 );
forceIndex = I_Unroll*( iAtom.x + iAtom.y*fstreamWidth );
iAtomLinearIndex = fmod( forceIndex, roundedUpAtoms );
// ---------------------------------------------------------------------------------------
// set gather index: convert the linear i index into (x, y) in the atom streams;
// the row is computed as round((a - fmod(a,w))/w) rather than floor(a/w)
iAtom.x = fmod( iAtomLinearIndex, streamWidth );
iAtom.y = round( (iAtomLinearIndex - fmod(iAtomLinearIndex, streamWidth ))/streamWidth );
// ---------------------------------------------------------------------------------------
// Fetch i1 position and partial charge
// (jQ is only used as a scratch variable here; it is overwritten in the j loop)
// NOTE(review): posq is declared float3 but is assigned to a float4 -- confirm intended
jQ = posq[ iAtom ];
i1Pos = jQ.xyz;
// charge comes from atomicRadii.y and is pre-multiplied by preFactor
i1Q = atomicRadii[ iAtom ].y;
i1Q *= preFactor;
i1BornR = bornRadii[ iAtom ];
i1AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i1BornR, i1AtomicR, duplicationFactor, aceForce );
// seed the accumulators: zero force, ACE term in .w only when includeAce is set
bornForce1.xyz = zero3;
bornForce1.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// Fetch i2 position and partial charge
// the next i atom is taken from the next column of the same row
// (assumes the four unrolled i atoms never straddle a row -- TODO confirm)
iAtom.x += 1;
jQ = posq[ iAtom ];
i2Pos = jQ.xyz;
i2Q = atomicRadii[ iAtom ].y;
i2Q *= preFactor;
i2BornR = bornRadii[ iAtom ];
i2AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i2BornR, i2AtomicR, duplicationFactor, aceForce );
bornForce2.xyz = zero3;
bornForce2.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// Fetch i3 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i3Pos = jQ.xyz;
i3Q = atomicRadii[ iAtom ].y;
i3Q *= preFactor;
i3BornR = bornRadii[ iAtom ];
i3AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i3BornR, i3AtomicR, duplicationFactor, aceForce );
bornForce3.xyz = zero3;
bornForce3.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// Fetch i4 position and partial charge
iAtom.x += 1;
jQ = posq[ iAtom ];
i4Pos = jQ.xyz;
i4Q = atomicRadii[ iAtom ].y;
i4Q *= preFactor;
i4BornR = bornRadii[ iAtom ];
i4AtomicR = atomicRadii[ iAtom ].x;
kAceNonPolarLoop1( i4BornR, i4AtomicR, duplicationFactor, aceForce );
bornForce4.xyz = zero3;
bornForce4.w = includeAce > 0.5f ? aceForce : 0.0f;
// ---------------------------------------------------------------------------------------
// inner loop setup
// if dupFac == 4, I_UnRoll =2, then breaking inner loop into two segments
// to increase number of threads in flight
// forceStreamSz = N*RepFac/I_UnRoll
// forceIndex = I_UnRoll*( a.x + a.y*forceStreamSz )
// whichRep = 0 or 1
// jBlock = 1 + floor[ N/(duplicationFactor*streamWidth) ]
//changed the following instruction for rounding issues on some ASICs
//whichRep = floor( forceIndex / roundedUpAtoms );
// NOTE(review): tmp repeats the value already held in iAtomLinearIndex
tmp = fmod(forceIndex, roundedUpAtoms);
whichRep = round((forceIndex - tmp)/roundedUpAtoms);
// number of j rows handled per replica, and the row range for this replica
jBlock = 1 + floor( numberOfAtoms/(duplicationFactor*streamWidth ) );
jStart = whichRep*jBlock;
// the last replica gets an effectively unbounded end row; its loop is stopped by
// the numberOfAtoms test instead
jEnd = ( whichRep > duplicationFactor - 1.5f ) ? 999999.0f : (jStart + jBlock);
jAtom.y = jStart;
// linear index of the first j atom in the current row
jLinind = jAtom.y*streamWidth;
// ---------------------------------------------------------------------------------------
// outer loop over j rows; both loops also stop once jLinind reaches numberOfAtoms
while ( jAtom.y < jEnd && ( numberOfAtoms - jLinind ) > 0.9f ){
jAtom.x = 0.0f;
while ( jAtom.x < streamWidth && ( numberOfAtoms - jLinind ) > 0.9f ) {
// ---------------------------------------------------------------------------------------
// gather required values for four consecutive j atoms;
// jAtom.x is advanced after each fetch, so it ends up 4 columns further on
j1Pos = posq[ jAtom ];
j1Q = atomicRadii[ jAtom ].y;
j1BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j2Pos = posq[ jAtom ];
j2Q = atomicRadii[ jAtom ].y;
j2BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j3Pos = posq[ jAtom ];
j3Q = atomicRadii[ jAtom ].y;
j3BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
j4Pos = posq[ jAtom ];
j4Q = atomicRadii[ jAtom ].y;
j4BornR = bornRadii[ jAtom ];
jAtom.x += 1.0f;
// pack the four j values so loop1Internal can process them component-wise
jBornR = float4( j1BornR, j2BornR, j3BornR, j4BornR );
jQ = float4( j1Q, j2Q, j3Q, j4Q );
// ---------------------------------------------------------------------------------------
// i == 1
d1 = i1Pos - j1Pos;
d2 = i1Pos - j2Pos;
d3 = i1Pos - j3Pos;
d4 = i1Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i1BornR, i1Q, dGpol_dr, dGpol_dalpha2_ij );
// force: scalar factor times separation vector, summed over the four j atoms
bornForce1.xyz += dGpol_dr.x*d1;
bornForce1.xyz += dGpol_dr.y*d2;
bornForce1.xyz += dGpol_dr.z*d3;
bornForce1.xyz += dGpol_dr.w*d4;
// .w: sum of the four dGpol_dalpha2_ij components
bornForce1.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// i == 2
d1 = i2Pos - j1Pos;
d2 = i2Pos - j2Pos;
d3 = i2Pos - j3Pos;
d4 = i2Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i2BornR, i2Q, dGpol_dr, dGpol_dalpha2_ij );
bornForce2.xyz += dGpol_dr.x*d1;
bornForce2.xyz += dGpol_dr.y*d2;
bornForce2.xyz += dGpol_dr.z*d3;
bornForce2.xyz += dGpol_dr.w*d4;
bornForce2.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// i == 3
d1 = i3Pos - j1Pos;
d2 = i3Pos - j2Pos;
d3 = i3Pos - j3Pos;
d4 = i3Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i3BornR, i3Q, dGpol_dr, dGpol_dalpha2_ij );
bornForce3.xyz += dGpol_dr.x*d1;
bornForce3.xyz += dGpol_dr.y*d2;
bornForce3.xyz += dGpol_dr.z*d3;
bornForce3.xyz += dGpol_dr.w*d4;
bornForce3.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// i == 4
d1 = i4Pos - j1Pos;
d2 = i4Pos - j2Pos;
d3 = i4Pos - j3Pos;
d4 = i4Pos - j4Pos;
loop1Internal( d1, d2, d3, d4, jBornR, jQ, i4BornR, i4Q, dGpol_dr, dGpol_dalpha2_ij );
bornForce4.xyz += dGpol_dr.x*d1;
bornForce4.xyz += dGpol_dr.y*d2;
bornForce4.xyz += dGpol_dr.z*d3;
bornForce4.xyz += dGpol_dr.w*d4;
bornForce4.w += dGpol_dalpha2_ij.x + dGpol_dalpha2_ij.y + dGpol_dalpha2_ij.z + dGpol_dalpha2_ij.w;
// ---------------------------------------------------------------------------------------
// four j atoms consumed in this iteration
jLinind += 4.0f;
}
// advance to the next j row
jAtom.y += 1.0f;
}
}
platforms/brook/src/gpu/kmerge.br
0 → 100644
View file @
cb130f92
/****************************************************************
//Linear index of i particle, divided by 2 because we unroll i by 2
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
*
* This kernel was developed in collaboration with
*
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/* After forces above, we have the forces for even numbered particles
* in one stream, odd numbered particles in another.
* In each stream, the forces are in several parts depending on how
* many times we replicated the input stream.
*
* To avoid an extra kernel to zero forces, this sets the forces
* rather than adding to it.
* */
kernel void kMergeFloat(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      iter float2 count<>,
      float pstream1[][],
      float pstream2[][],
      out float outstream<> )
{
   // Sums the repfac partial values belonging to one atom.
   //
   //    count:            iterator supplying the (x, y) position of the atom
   //    pstream1/2:       partial streams; pstream2 is read when the atom's
   //                      offset within its unroll group is > 0.5
   //    outstream:        (out) sum of the gathered partial values

   float  atomLinear, groupIndex, laneOffset, stride, rep, total;
   float2 gatherAt;

   // linear atom index from the iterator position
   atomLinear = count.x + count.y * atomStrWidth;

   // index of the unroll group and the atom's offset inside it
   groupIndex = floor( atomLinear / iUnroll );
   laneOffset = atomLinear - groupIndex * iUnroll;

   // distance between successive replicas in the partial streams
   stride     = natoms/iUnroll;

   total = 0.0f;
   rep   = 0.0f;
   while ( rep < repfac ) {

      // 2-D gather position of the current replica
      gatherAt.y = floor( groupIndex / pstreamStrWidth );
      gatherAt.x = groupIndex - gatherAt.y * pstreamStrWidth;

      if ( laneOffset > 0.5f ) {
         total += pstream2[ gatherAt ];
      } else {
         total += pstream1[ gatherAt ];
      }

      groupIndex += stride;
      rep        += 1.0f;
   }

   outstream = total;
}
kernel void kMergeFloat4(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 pstream1[][],
      float4 pstream2[][],
      out float4 outstream<> )
{
   // float4 counterpart of kMergeFloat: sums the repfac partial values of one
   // atom, reading pstream2 when the atom's offset within its unroll group is
   // > 0.5 and pstream1 otherwise.  The atom position comes from indexof( outstream ).

   float  atomLinear, groupIndex, laneOffset, stride, rep;
   float2 here, gatherAt;
   float4 total;

   // linear atom index of this output element
   here       = indexof( outstream );
   atomLinear = here.x + ( here.y * atomStrWidth );

   // index of the unroll group and the atom's offset inside it
   groupIndex = floor( atomLinear / iUnroll );
   laneOffset = atomLinear - groupIndex * iUnroll;

   // distance between successive replicas in the partial streams
   stride     = natoms/iUnroll;

   total = float4( 0.0f, 0.0f, 0.0f, 0.0f );
   rep   = 0.0f;
   while ( rep < repfac ) {

      // 2-D gather position of the current replica
      gatherAt.y = floor( groupIndex / pstreamStrWidth );
      gatherAt.x = groupIndex - gatherAt.y * pstreamStrWidth;

      if ( laneOffset > 0.5f ) {
         total += pstream2[ gatherAt ];
      } else {
         total += pstream1[ gatherAt ];
      }

      groupIndex += stride;
      rep        += 1.0f;
   }

   outstream = total;
}
/* After forces above, we have the forces for even numbered particles
* in one stream, odd numbered particles in another.
* In each stream, the forces are in several parts depending on how
* many times we replicated the input stream.
*
* To avoid an extra kernel to zero forces, this sets the forces
* rather than adding to it.
* */
kernel void kMergeFloat4_4X(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      out float4 outstream<> )
{
   // NOTE(review): as written this kernel never reads pstream1..pstream4.  It
   // accumulates the index values ( linear index, group offset, gather x, gather y )
   // over the repfac replicas, which looks like a diagnostic of the index
   // arithmetic -- confirm before using it as a merge kernel.

   float  atomLinear, groupIndex, laneOffset, stride, rep;
   float2 here, gatherAt;
   float4 total;

   // linear atom index of this output element
   here       = indexof( outstream );
   atomLinear = here.x + ( here.y * atomStrWidth );

   // index of the unroll group and the atom's offset inside it
   groupIndex = floor( atomLinear / iUnroll );
   laneOffset = atomLinear - groupIndex * iUnroll;

   // distance between successive replicas in the partial streams
   stride     = natoms/iUnroll;

   total = float4( 0.0f, 0.0f, 0.0f, 0.0f );
   rep   = 0.0f;
   while ( rep < repfac ) {

      // row via round((a - fmod(a,w))/w) instead of floor(a/w)
      gatherAt.y = round( (groupIndex - fmod( groupIndex, pstreamStrWidth ))/pstreamStrWidth );
      gatherAt.x = groupIndex - gatherAt.y * pstreamStrWidth;

      total += float4( groupIndex, laneOffset, gatherAt.x, gatherAt.y );

      // disabled gather kept for reference:
      //    if ( laneOffset < 0.5f ) {
      //       total += pstream1[ gatherAt ];
      //    } else if( laneOffset < 1.5f ){
      //       total += pstream2[ gatherAt ];
      //    } else if( laneOffset < 2.5f ){
      //       total += pstream3[ gatherAt ];
      //    } else {
      //       total += pstream4[ gatherAt ];
      //    }

      groupIndex += stride;
      rep        += 1.0f;
   }

   outstream = total;
}
kernel void kMergeFloat4_4(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      out float4 outstream<> )
{
   // Sums the repfac partial values of one atom from four partial streams.
   // The stream is chosen by the atom's offset within its unroll group
   // (<0.5, <1.5, <2.5, otherwise); successive replicas lie roundNatoms apart
   // in the linear force index.

   float  cursor, groupIndex, laneOffset, rep;
   float2 gatherAt;
   float4 total;

   // linear atom index of this output element is the first force index
   gatherAt = indexof( outstream );
   cursor   = gatherAt.x + gatherAt.y*atomStreamWidth;

   total = float4( 0.0f, 0.0f, 0.0f, 0.0f );
   rep   = 0.0f;
   while( rep < repfac ){

      // unroll group and offset; round((a - fmod(a,b))/b) stands in for floor(a/b)
      groupIndex = round( (cursor - fmod( cursor, iUnroll))/iUnroll );
      laneOffset = cursor - iUnroll*groupIndex;

      // 2-D gather position of the group
      gatherAt.y = round( (groupIndex - fmod( groupIndex, pStreamWidth ))/pStreamWidth );
      gatherAt.x = groupIndex - gatherAt.y*pStreamWidth;

      if ( laneOffset < 0.5f ){
         total += pstream1[ gatherAt ];
      } else if( laneOffset < 1.5f ){
         total += pstream2[ gatherAt ];
      } else if( laneOffset < 2.5f ){
         total += pstream3[ gatherAt ];
      } else {
         total += pstream4[ gatherAt ];
      }

      cursor += roundNatoms;
      rep    += 1.0f;
   }

   outstream = total;
}
kernel void kPostObcLoop1(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      float obcChain<>,
      float bornRadii<>,
      out float4 outstream<>,
      out float bornRadii2Force<> )
{
   // Merges the repfac partial values of one atom from four partial streams
   // (same selection rule as kMergeFloat4_4) and additionally writes
   //
   //    bornRadii2Force = obcChain*bornRadii*bornRadii*(merged .w)
   //
   // The stream selection uses real branches; kPostObcLoop1_nobranch is the
   // variant that fetches all four streams and selects afterwards.

   float  cursor, groupIndex, laneOffset, rep;
   float2 gatherAt;
   float4 total;

   // linear atom index of this output element is the first force index
   gatherAt = indexof( outstream );
   cursor   = gatherAt.x + gatherAt.y*atomStreamWidth;

   total = float4( 0.0f, 0.0f, 0.0f, 0.0f );
   rep   = 0.0f;
   while( rep < repfac ){

      // unroll group and offset; round((a - fmod(a,b))/b) stands in for floor(a/b)
      groupIndex = round( (cursor - fmod( cursor, iUnroll))/iUnroll );
      laneOffset = cursor - iUnroll*groupIndex;

      // 2-D gather position of the group
      gatherAt.y = round( (groupIndex - fmod( groupIndex, pStreamWidth ))/pStreamWidth );
      gatherAt.x = groupIndex - gatherAt.y*pStreamWidth;

      if ( laneOffset < 0.5f ){
         total += pstream1[ gatherAt ];
      } else if( laneOffset < 1.5f ){
         total += pstream2[ gatherAt ];
      } else if( laneOffset < 2.5f ){
         total += pstream3[ gatherAt ];
      } else {
         total += pstream4[ gatherAt ];
      }

      cursor += roundNatoms;
      rep    += 1.0f;
   }

   outstream       = total;
   bornRadii2Force = obcChain*bornRadii*bornRadii*total.w;
}
// The inner loop by definition creates divergent paths. Chances are
// fair that we will take all sides of the branch anyway, so this
// version uses manual predication
kernel void kPostObcLoop1_nobranch(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      float obcChain<>,
      float bornRadii<>,
      out float4 outstream<>,
      out float bornRadii2Force<> )
{
   // Branch-free form of kPostObcLoop1: every replica fetches from all four
   // partial streams and the wanted value is picked with conditional
   // expressions.  Also writes
   //
   //    bornRadii2Force = obcChain*bornRadii*bornRadii*(merged .w)

   float  cursor, groupIndex, laneOffset, rep;
   float2 gatherAt;
   float4 fromS1, fromS2, fromS3, fromS4, picked, total;

   // linear atom index of this output element is the first force index
   gatherAt = indexof( outstream );
   cursor   = gatherAt.x + gatherAt.y*atomStreamWidth;

   total = float4( 0.0f, 0.0f, 0.0f, 0.0f );
   rep   = 0.0f;
   while( rep < repfac ){

      // unroll group and offset; round((a - fmod(a,b))/b) stands in for floor(a/b)
      groupIndex = round( (cursor - fmod( cursor, iUnroll))/iUnroll );
      laneOffset = cursor - iUnroll*groupIndex;

      // 2-D gather position of the group
      gatherAt.y = round( (groupIndex - fmod( groupIndex, pStreamWidth ))/pStreamWidth );
      gatherAt.x = groupIndex - gatherAt.y*pStreamWidth;

      // unconditional fetches
      fromS1 = pstream1[ gatherAt ];
      fromS2 = pstream2[ gatherAt ];
      fromS3 = pstream3[ gatherAt ];
      fromS4 = pstream4[ gatherAt ];

      // selection: offset <0.5 -> stream 1, <1.5 -> 2, <2.5 -> 3, else 4
      picked = laneOffset < 0.5f ? fromS1 : fromS2;
      picked = laneOffset < 1.5f ? picked : fromS3;
      picked = laneOffset < 2.5f ? picked : fromS4;

      total  += picked;
      cursor += roundNatoms;
      rep    += 1.0f;
   }

   outstream       = total;
   bornRadii2Force = obcChain*bornRadii*bornRadii*total.w;
}
kernel void kSetValue4( float value, out float4 outstream<> ){
   // write the scalar into all four components of the output element
   float4 fill;
   fill      = float4( value, value, value, value );
   outstream = fill;
}
kernel void kSetValue3( float value, out float3 outstream<> ){
   // write the scalar into all three components of the output element
   float3 fill;
   fill      = float3( value, value, value );
   outstream = fill;
}
kernel void kSetValue2( float value, out float2 outstream<> ){
   // write the scalar into both components of the output element
   float2 fill;
   fill      = float2( value, value );
   outstream = fill;
}
kernel void kSetValue1( float value, out float outstream<> ){
   // write the scalar into the output element
   float fill;
   fill      = value;
   outstream = fill;
}
kernel void kCheck( float natoms, float atomStrWidth, float pstreamStrWidth, float unroll, out float4 outstream<> )
{
   // Writes the index arithmetic for each output element:
   //    .x, .y : position of the element in the output stream
   //    .z     : unroll*( x + y*pstreamStrWidth )
   //    .w     : .z modulo natoms
   // atomStrWidth is accepted but not used.

   float2 here;
   float  scaledIndex, wrappedIndex;

   here         = indexof( outstream );
   scaledIndex  = unroll*(here.x + here.y*pstreamStrWidth);
   wrappedIndex = fmod( scaledIndex, natoms );

   outstream    = float4( here.x, here.y, scaledIndex, wrappedIndex );
}
/* After forces above, we have the forces for even numbered particles
* in one stream, odd numbered particles in another.
* In each stream, the forces are in several parts depending on how
* many times we replicated the input stream.
*
* To avoid an extra kernel to zero forces, this sets the forces
* rather than adding to it.
* */
kernel void kAddAndMergeFloat4(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 inStream<>,
      float4 pstream1[][],
      float4 pstream2[][],
      out float4 outstream<> )
{
   // Adds the repfac partial values of one atom to inStream.  The .w component
   // of inStream is discarded (reset to 0) before accumulation, so the output
   // .w holds only the merged partial values.  pstream2 is read when the atom's
   // offset within its unroll group is > 0.5, pstream1 otherwise.

   float  atomLinear, groupIndex, laneOffset, stride, rep;
   float2 here, gatherAt;
   float4 total;

   // linear atom index of this output element
   here       = indexof( outstream );
   atomLinear = here.x + here.y * atomStrWidth;

   // unroll group and offset; round((a - fmod(a,b))/b) stands in for floor(a/b)
   groupIndex = round( (atomLinear - fmod(atomLinear, iUnroll))/iUnroll );
   laneOffset = atomLinear - groupIndex * iUnroll;

   // distance between successive replicas in the partial streams
   stride     = natoms/iUnroll;

   // start from the incoming value with .w cleared
   total   = inStream;
   total.w = 0.0f;

   rep = 0.0f;
   while ( rep < repfac ) {

      // 2-D gather position of the current replica
      gatherAt.y = round( (groupIndex - fmod( groupIndex, pstreamStrWidth ))/pstreamStrWidth );
      gatherAt.x = groupIndex - gatherAt.y * pstreamStrWidth;

      if ( laneOffset > 0.5f ) {
         total += pstream2[ gatherAt ];
      } else {
         total += pstream1[ gatherAt ];
      }

      groupIndex += stride;
      rep        += 1.0f;
   }

   outstream = total;
}
/* After forces above, we have the forces for even numbered particles
* in one stream, odd numbered particles in another.
* In each stream, the forces are in several parts depending on how
* many times we replicated the input stream.
*
* To avoid an extra kernel to zero forces, this sets the forces
* rather than adding to it.
* */
/*
kernel void kAddAndMergeFloat4_4(
float repfac,
float atomStrWidth,
float pstreamStrWidth,
float natoms,
float iUnroll,
float4 inStream<>,
float4 pstream1[][],
float4 pstream2[][],
float4 pstream3[][],
float4 pstream4[][],
out float4 outstream<> )
{
float linind;
float2 pindex;
float odd;
float i;
float floor_linind_iUnroll;
linind = (indexof outstream).x + (indexof outstream).y * atomStrWidth;
//If odd or even, we pick from diferent streams.
//odd = linind - floor( linind / iUnroll ) * iUnroll;
//Now linear index is the index into partial_streams
//linind = floor( linind / iUnroll );
floor_linind_iUnroll = round( (linind - fmod(linind, iUnroll))/iUnroll );
odd = linind - floor_linind_iUnroll * iUnroll;//bixia modify
linind = floor_linind_iUnroll; //bixia modify
outstream = inStream;
outstream.w = 0.0f;
// outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );
//If we have predicated conditionals, we should
//keep the conditional inside the loop
for ( i = 0.0f; i < repfac; i+= 1.0f ) {
//pindex.y = floor( linind / pstreamStrWidth );
pindex.y = round( (linind - fmod( linind, pstreamStrWidth ))/pstreamStrWidth ); //bixia modify
pindex.x = linind - pindex.y * pstreamStrWidth;
if ( odd < 0.5f ) { //is odd
outstream += pstream1[ pindex ];
} else if( odd < 1.5f ){
outstream += pstream2[ pindex ];
} else if( odd < 2.5f ){
outstream += pstream3[ pindex ];
} else {
outstream += pstream4[ pindex ];
}
linind += natoms/iUnroll;
}
} */
kernel void kAddAndMergeFloat4_4(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float4 inStream<>,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      out float4 outstream<> ){

   // Adds the repfac partial values of one atom, gathered from four partial
   // streams, to inStream.  The .w component of inStream is discarded (reset
   // to 0) first, so the output .w holds only the merged partial values.
   // The stream is chosen by the atom's offset within its unroll group
   // (<0.5, <1.5, <2.5, otherwise); replicas lie roundNatoms apart.

   float  cursor, groupIndex, laneOffset, rep;
   float2 gatherAt;
   float4 total;

   // linear atom index of this output element is the first force index
   gatherAt = indexof( outstream );
   cursor   = gatherAt.x + gatherAt.y*atomStreamWidth;

   // start from the incoming value with .w cleared
   total   = inStream;
   total.w = 0.0f;

   // sum over the j-loop 'duplications' by gathering from the pstreams
   rep = 0.0f;
   while( rep < repfac ){

      // unroll group and offset; round((a - fmod(a,b))/b) stands in for floor(a/b)
      groupIndex = round( (cursor - fmod( cursor, iUnroll))/iUnroll );
      laneOffset = cursor - iUnroll*groupIndex;

      // 2-D gather position of the group
      gatherAt.y = round( (groupIndex - fmod( groupIndex, pStreamWidth ))/pStreamWidth );
      gatherAt.x = groupIndex - gatherAt.y*pStreamWidth;

      if( laneOffset < 0.5f ){
         total += pstream1[ gatherAt ];
      } else if( laneOffset < 1.5f ){
         total += pstream2[ gatherAt ];
      } else if( laneOffset < 2.5f ){
         total += pstream3[ gatherAt ];
      } else {
         total += pstream4[ gatherAt ];
      }

      cursor += roundNatoms;
      rep    += 1.0f;
   }

   outstream = total;
}
kernel void kPostObcLoop2(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float conversion,
      float mergeNonObcForces,
      float4 inObcForces<>,
      float3 nonObcForces<>,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      float atomicRadii<>,
      out float bornRadii<>,
      out float obcChain<>,
      out float3 outForces<> ){

   // ---------------------------------------------------------------------------------------

   // Merges the partial results of one atom and derives its Born radius and
   // OBC chain term from the merged .w value.
   //
   //    repfac:             number of partial results per atom
   //    atomStreamWidth:    width of the atom streams
   //    pStreamWidth:       width of the partial streams
   //    natoms:             number of atoms; elements at or beyond it get
   //                        bornRadii = obcChain = 0
   //    roundNatoms:        distance between successive partial results in the
   //                        linear force index
   //    iUnroll:            unroll factor used to build the partial streams
   //    conversion:         factor applied to the merged force
   //    mergeNonObcForces:  if > 0.1, nonObcForces is added to the output force
   //    inObcForces:        incoming forces; .w is discarded
   //    nonObcForces:       forces added when mergeNonObcForces is set
   //    pstream1..4:        partial streams, selected by offset in the unroll group
   //    atomicRadii:        atomic radius
   //    bornRadii:          (out) Born radius
   //    obcChain:           (out) OBC chain term
   //    outForces:          (out) conversion*merged force (+ nonObcForces)

   // ---------------------------------------------------------------------------------------

   float atomIndex, forceIndex, qIndex, qOff;
   float2 pindex;
   float i;
   float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
   float4 forces;
   float expPlus, expMinus;

   // ---------------------------------------------------------------------------------------

   // constants -- OBC Type II

   const float alphaObc         = 1.0f;
   const float betaObc          = 0.8f;
   const float gammaObc         = 4.85f;
   const float dielectricOffset = 0.009f;

   // bound on the tanh argument: at |x| = 40 the exp-based quotient below is
   // already exactly +/-1 in single precision, and exp( +/-40 ) is far from
   // overflow/underflow, so clamping changes no in-range result

   const float tanhArgLimit     = 40.0f;

   // ---------------------------------------------------------------------------------------

   // given atom index find force indices and streams

   pindex     = indexof( outForces );
   atomIndex  = pindex.x + pindex.y*atomStreamWidth;
   forceIndex = atomIndex;

   // add current forces in inObcForces to forces stored in pstreams
   // the .w entry is Born sum values; it will be used to calculate the
   // Born radii and obcChain term

   forces   = inObcForces;
   forces.w = 0.0f;

   // sum over j-loop 'duplications' by gathering from pstreams

   for( i = 0.0f; i < repfac; i += 1.0f ){

      // round((a - fmod(a,b))/b) stands in for floor(a/b)
      qIndex   = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
      qOff     = forceIndex - iUnroll*qIndex;

      pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
      pindex.x = qIndex - pindex.y*pStreamWidth;

      if( qOff < 0.5f ){
         forces += pstream1[ pindex ];
      } else if( qOff < 1.5f ){
         forces += pstream2[ pindex ];
      } else if( qOff < 2.5f ){
         forces += pstream3[ pindex ];
      } else {
         forces += pstream4[ pindex ];
      }
      forceIndex += roundNatoms;
   }

   // compute Born radii and ObcChain

   atomicRadiiOffset = atomicRadii - dielectricOffset;
   bornSum           = forces.w;
   bornSum          *= 0.5f*atomicRadiiOffset;
   sum2              = bornSum*bornSum;
   sum3              = bornSum*sum2;

   // tanh( x ) is evaluated as [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
   // with x = bornSum - betaObc*sum2 + gammaObc*sum3.
   // x is clamped first: without the clamp a large |x| makes exp(x) overflow
   // to infinity (or underflow to 0, so 1/exp(x) is infinite) and the quotient
   // becomes inf/inf = NaN instead of +/-1.

   tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3;
   tanhSum = tanhSum >  tanhArgLimit ?  tanhArgLimit : tanhSum;
   tanhSum = tanhSum < -tanhArgLimit ? -tanhArgLimit : tanhSum;

   expPlus  = exp( tanhSum );
   expMinus = 1.0f/expPlus;
   tanhSum  = ( expPlus - expMinus )/( expPlus + expMinus );

   bornRadii       = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );

   obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
   obcChain        = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;

   // elements past the last atom carry no Born radius / chain term

   if( atomIndex >= natoms ){
      bornRadii = 0.0f;
      obcChain  = 0.0f;
   }

   // add converted new forces to non-Obc forces

   outForces = conversion*forces.xyz;
   if( mergeNonObcForces > 0.1f ){
      outForces += nonObcForces;
   }
}
kernel void kPostObcLoop2_nobranch(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float conversion,
      float mergeNonObcForces,
      float4 inObcForces<>,
      float3 nonObcForces<>,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      float atomicRadii<>,
      out float bornRadii<>,
      out float obcChain<>,
      out float3 outForces<> ){

   // ---------------------------------------------------------------------------------------

   // Branch-free merge of the partial results of one atom, followed by the
   // Born radius and OBC chain term derived from the merged .w value.  Every
   // replica fetches from all four partial streams and the wanted value is
   // picked with conditional expressions.
   //
   //    repfac:             number of partial results per atom
   //    atomStreamWidth:    width of the atom streams
   //    pStreamWidth:       width of the partial streams
   //    natoms:             number of atoms; elements at or beyond it get
   //                        bornRadii = obcChain = 0
   //    roundNatoms:        distance between successive partial results in the
   //                        linear force index
   //    iUnroll:            unroll factor used to build the partial streams
   //    conversion:         factor applied to the merged force
   //    mergeNonObcForces:  if > 0.1, nonObcForces is added to the output force
   //    inObcForces:        incoming forces; .w is discarded
   //    nonObcForces:       forces added when mergeNonObcForces is set
   //    pstream1..4:        partial streams, selected by offset in the unroll group
   //    atomicRadii:        atomic radius
   //    bornRadii:          (out) Born radius
   //    obcChain:           (out) OBC chain term
   //    outForces:          (out) conversion*merged force (+ nonObcForces)

   // ---------------------------------------------------------------------------------------

   float atomIndex, forceIndex, qIndex, qOff;
   float2 pindex;
   float i;
   float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate;
   float4 o1,o2,o3,o4;
   float4 tmp;
   float4 forces;
   float expPlus, expMinus;

   // ---------------------------------------------------------------------------------------

   // constants -- OBC Type II

   const float alphaObc         = 1.0f;
   const float betaObc          = 0.8f;
   const float gammaObc         = 4.85f;
   const float dielectricOffset = 0.009f;

   // bound on the tanh argument: at |x| = 40 the exp-based quotient below is
   // already exactly +/-1 in single precision, and exp( +/-40 ) is far from
   // overflow/underflow, so clamping changes no in-range result

   const float tanhArgLimit     = 40.0f;

   // ---------------------------------------------------------------------------------------

   // given atom index find force indices and streams

   pindex     = indexof( outForces );
   atomIndex  = pindex.x + pindex.y*atomStreamWidth;
   forceIndex = atomIndex;

   // add current forces in inObcForces to forces stored in pstreams
   // the .w entry is Born sum values; it will be used to calculate the
   // Born radii and obcChain term

   forces   = inObcForces;
   forces.w = 0.0f;

   // sum over j-loop 'duplications' by gathering from pstreams

   for( i = 0.0f; i < repfac; i += 1.0f ){

      // round((a - fmod(a,b))/b) stands in for floor(a/b)
      qIndex   = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
      qOff     = forceIndex - iUnroll*qIndex;

      pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
      pindex.x = qIndex - pindex.y*pStreamWidth;

      // unconditional fetches, then select: <0.5 -> 1, <1.5 -> 2, <2.5 -> 3, else 4
      o1 = pstream1[ pindex ];
      o2 = pstream2[ pindex ];
      o3 = pstream3[ pindex ];
      o4 = pstream4[ pindex ];

      tmp = qOff < 0.5f ? o1 : o2;
      tmp = qOff < 1.5f ? tmp : o3;
      tmp = qOff < 2.5f ? tmp : o4;

      forces     += tmp;
      forceIndex += roundNatoms;
   }

   // compute Born radii and ObcChain

   atomicRadiiOffset = atomicRadii - dielectricOffset;
   bornSum           = forces.w;
   bornSum          *= 0.5f*atomicRadiiOffset;
   sum2              = bornSum*bornSum;
   sum3              = bornSum*sum2;

   // tanh( x ) is evaluated as [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ]
   // with x = bornSum - betaObc*sum2 + gammaObc*sum3.
   // x is clamped first: without the clamp a large |x| makes exp(x) overflow
   // to infinity (or underflow to 0, so 1/exp(x) is infinite) and the quotient
   // becomes inf/inf = NaN instead of +/-1.

   tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3;
   tanhSum = tanhSum >  tanhArgLimit ?  tanhArgLimit : tanhSum;
   tanhSum = tanhSum < -tanhArgLimit ? -tanhArgLimit : tanhSum;

   expPlus  = exp( tanhSum );
   expMinus = 1.0f/expPlus;
   tanhSum  = ( expPlus - expMinus )/( expPlus + expMinus );

   bornRadii       = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii );

   obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 );
   obcChain        = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii;

   // elements past the last atom carry no Born radius / chain term

   if( atomIndex >= natoms ){
      bornRadii = 0.0f;
      obcChain  = 0.0f;
   }

   // add converted new forces to non-Obc forces

   outForces = conversion*forces.xyz;
   if( mergeNonObcForces > 0.1f ){
      outForces += nonObcForces;
   }
}
/* Scale the .w component of an intermediate force stream by the squared
 * Born radius and a fixed prefactor.
 *
 *   intermediateForceIn   only the .w component is read
 *   bornRadii             Born radius for the same stream element
 *   bornRadii2Force       out: P4_ec * bornRadii * bornRadii * intermediateForceIn.w
 * */
kernel void kPreGbsaForce2(
      float4 intermediateForceIn<>,
      float  bornRadii<>,
      out float bornRadii2Force<> ){
   // ---------------------------------------------------------------------------------------
   // float P4 = 15.236f;
   // prefactor = P4/electricConstant
   const float prefactor = -0.09176f;
   float scaled;
   // ---------------------------------------------------------------------------------------
   // multiply strictly left to right: ((prefactor*r)*r)*w
   scaled          = prefactor*bornRadii;
   scaled          = scaled*bornRadii;
   bornRadii2Force = scaled*intermediateForceIn.w;
}
/* Scale the .w component of an intermediate force stream by the OBC chain
 * term and the squared Born radius.
 *
 *   intermediateForceIn   only the .w component is read
 *   obcChain              OBC chain term; only the first component is used
 *   bornRadii             Born radius for the same stream element
 *   bornRadii2Force       out: obcChain.x * bornRadii * bornRadii * intermediateForceIn.w
 *
 * The output is a scalar, so of the float4 obcChain only one component can
 * ever reach it. The .x component is selected explicitly instead of relying
 * on an implicit float4 -> float truncation of the product.
 *
 * NOTE(review): obcChain is declared float4 here although the result is a
 * scalar; if the stream bound to it holds one float per element, the
 * parameter should presumably be 'float obcChain<>' -- confirm against the
 * caller before changing the declared type.
 * */
kernel void kPreObcForce2(
      float4 intermediateForceIn<>,
      float4 obcChain<>,
      float  bornRadii<>,
      out float bornRadii2Force<> ){
   // ---------------------------------------------------------------------------------------
   bornRadii2Force = obcChain.x*bornRadii*bornRadii*intermediateForceIn.w;
}
/* Combine two force streams element by element:
 *    outForce = force1 + conversion*force2.xyz
 * The .w component of force2 is not read. */
kernel void kAddForces3_4( float conversion, float3 force1<>, float4 force2<>, out float3 outForce<> ){
   float3 scaledForce2;
   scaledForce2 = conversion*force2.xyz;
   outForce     = force1 + scaledForce2;
}
/* Element-wise copy of a float4 stream:
 * every element of inForce is written unchanged to outForce.
 * */
kernel void kCopyFloat4(
      float4 inForce<>,
      out float4 outForce<> ){
   // ---------------------------------------------------------------------------------------
   outForce = inForce;
}
/* Widen a float3 stream to a float4 stream:
 * x, y and z are copied from inForce and the .w component is set to zero.
 * */
kernel void kCopyFloat3To4(
      float3 inForce<>,
      out float4 outForce<> ){
   // ---------------------------------------------------------------------------------------
   outForce = float4( inForce.x, inForce.y, inForce.z, 0.0f );
}
/* ---------------------------------------------------------------------------------------
   Born radius from the fixed and the nonbonded Gpol contributions:

      bornRadius = electricConstant/( gpolFixed + P4_25*gpolNonBonded.w )

   Only the .w component of gpolNonBonded is read.

   NOTE(review): P4_25 is described as 0.25*P4. With P4 = 15.236 (the value
   quoted in the comment inside kPreGbsaForce2) that product is 3.809, whereas
   3.81575 equals 0.25*15.263 -- confirm which value of P4 is intended.
   --------------------------------------------------------------------------------------- */
kernel void kBornRadii( float4 gpolNonBonded<>, float gpolFixed<>, out float bornRadius<> ){
   // ---------------------------------------------------------------------------------------
   // constants
   const float electricConstant = -166.02691f;
   // 0.25*P4
   const float P4_25            = 3.81575f;
   float gpolTotal;
   // ---------------------------------------------------------------------------------------
   gpolTotal  = gpolFixed + P4_25*gpolNonBonded.w;
   bornRadius = electricConstant/gpolTotal;
   // ---------------------------------------------------------------------------------------
}
platforms/brook/src/gpu/kmerge_partial_forces.br
View file @
cb130f92
...
@@ -99,3 +99,53 @@ kernel void kMergeFloat3_4(
...
@@ -99,3 +99,53 @@ kernel void kMergeFloat3_4(
}
}
}
}
/* Sum 'repfac' partial float3 values per output element, gathering them from
 * four partial streams, without using if/else branches.
 *
 *   repfac           number of partial values accumulated per output element
 *   atomStreamWidth  width used to turn the 2D output position into a linear index
 *   pStreamWidth     width used to turn a linear index into a 2D pstream position
 *   natoms           not referenced in the body
 *   roundNatoms      amount added to the linear index after each partial value
 *   iUnroll          divisor splitting the linear index into (qIndex, qOff)
 *   pstream1..4      gather streams; qOff selects which one contributes
 *   outstream        out: sum of the selected partial values
 */
kernel void kMergeFloat3_4_nobranch(
float repfac,
float atomStreamWidth,
float pStreamWidth,
float natoms,
float roundNatoms,
float iUnroll,
float3 pstream1[][],
float3 pstream2[][],
float3 pstream3[][],
float3 pstream4[][],
out float3 outstream<> )
{
float atomIndex, forceIndex, qIndex, qOff;
float2 pindex;
float i;
float3 o1,o2,o3,o4;
float3 tmp;
// given atom index find force indices and streams
// linear index of this output element: x + y*atomStreamWidth
pindex = indexof( outstream );
atomIndex = pindex.x + pindex.y*atomStreamWidth;
forceIndex = atomIndex;
// the output is set, not added to, so no separate zeroing pass is needed
outstream = float3( 0.0f, 0.0f, 0.0f );
for( i = 0.0f; i < repfac; i += 1.0f ){
// qIndex = forceIndex with its remainder modulo iUnroll removed, divided by
// iUnroll; round() snaps the float quotient to a whole number
qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
// qOff = remainder of forceIndex with respect to iUnroll
qOff = forceIndex - iUnroll*qIndex;
// 2D position in the pstreams: row = qIndex with its remainder modulo
// pStreamWidth removed, divided by pStreamWidth; column = what is left
pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
pindex.x = qIndex - pindex.y*pStreamWidth;
// fetch from all four streams, then pick one with a chain of selects
o1 = pstream1[ pindex ];
o2 = pstream2[ pindex ];
o3 = pstream3[ pindex ];
o4 = pstream4[ pindex ];
// qOff < 0.5 -> o1, < 1.5 -> o2, < 2.5 -> o3, otherwise o4
tmp = qOff < 0.5f ? o1 : o2;
tmp = qOff < 1.5f ? tmp : o3;
tmp = qOff < 2.5f ? tmp : o4;
outstream += tmp;
// step to the next partial value for this output element
forceIndex += roundNatoms;
}
}
platforms/brook/src/gpu/knlist.br
deleted
100644 → 0
View file @
cc8b4de0
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
/* Order N^2 neighbor searching.
*
* This only works for force fields that don't have charge groups.
* If you insist on charge groups, you'll have to pass in appropriate masks here.
*
* This is a simplified kernel, for testing the O(N) speeds.
*
* This does a complete N^2 search without considering groups of
* atoms. Most likely this will prove to be inefficient for
* the O(N) kernel. Lets find out.
*
*
* Each component of the curpass textures is an atom index. The w component
* of curpass3 is a count indicating how many j particles we have
* scanned for this particular i atom.
*
* */
/* One pass of the neighbor search for the i atom at wpos: scans j atoms in
 * linear order, starting just after the index stored in prevpass3.w, and
 * records up to 16 j indices that are not excluded and lie within the cutoff.
 * Unused slots are left at -1. curpass3.w is written only when the 16th slot
 * is filled; otherwise it stays -1, which a later pass (first < 0) treats as
 * "already finished".
 */
kernel void knborsearch(
float first, //Positive means constructing the first 16.
iter float2 wpos<>, //pixel position of output
float AtomStrHeight,
float AtomStrWidth,
float cutoff2, //square of the cutoff
float natoms, //number of atoms (not referenced in the body)
float excl[][], //exclusions in 1x1 format, 0 means not excluded, 1 means excluded.
float4 posq[][], //atom positions/charges
float4 prevpass3<>, //Last output texture of previous pass
out float4 curpass0<>, //First output of current pass
out float4 curpass1<>,
out float4 curpass2<>,
out float4 curpass3<> //Last output of current pass, used in next pass
){
/*For this kernel, wpos == iatom*/
float2 iind;
float2 jind;
float3 ipos, jpos, dr;
float r2;
float listptr; //Where in the 16-chunk are we now.
float jlinind; //Linear index of the j atom currently being tested
float breakflag; //positive means keep looping, negative means stop
float4 exclconst;
float2 exclind;
float exclusions;
//exclconst is assigned here but not read anywhere else in this kernel
exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
iind = wpos;
//excl is indexed as [ x = linear i index, y = linear j index ]
exclind.x = iind.x + iind.y * AtomStrWidth;
//Fetch i atom
ipos = posq[ iind ].xyz;
//Loop over j depending on prevpass
//Resume one past the last j index recorded by the previous pass
jlinind = prevpass3.w + 1;
jind.y = floor( jlinind / AtomStrWidth );
jind.x = fmod( jlinind, AtomStrWidth );
exclind.y = jlinind;
//Start filling the 16-entry list at slot 0 with looping enabled
listptr = 0.0f;
breakflag = 1.0f;
//if we already finished, do nothing
if ( first < 0.0f && prevpass3.w < 0.0f )
breakflag = -1.0f;
//set to -1 to indicate no neighbor
//just to save a separate set of init calls
curpass0 = float4( -1.0f, -1.0f, -1.0f, -1.0f );
curpass1 = curpass0;
curpass2 = curpass0;
curpass3 = curpass0;
//Walk the position stream row by row; the first row starts at jind.x,
//every following row at column 0
while ( jind.y < AtomStrHeight && breakflag > 0.0f ) {
while ( jind.x < AtomStrWidth && breakflag > 0.0f ) {
//First see if this pair is excluded
exclusions = excl[ exclind ];
if ( exclusions < 0.5f ) {
jpos = posq[ jind ].xyz;
dr = jpos - ipos;
r2 = dot( dr, dr );
//If it is inside the cutoff
if ( r2 < cutoff2 ) {
//Figure out where to put it
//We are allowed 4 nested conditionals
//We can play with the structuring of these
//Slot listptr maps to curpass0.xyzw, curpass1.xyzw, curpass2.xyzw, curpass3.xyzw
if ( listptr < 0.5f )
curpass0.x = jlinind;
else if ( listptr < 1.5f )
curpass0.y = jlinind;
else if ( listptr < 2.5f )
curpass0.z = jlinind;
else if ( listptr < 3.5f )
curpass0.w = jlinind;
else if ( listptr < 4.5f )
curpass1.x = jlinind;
else if ( listptr < 5.5f )
curpass1.y = jlinind;
else if ( listptr < 6.5f )
curpass1.z = jlinind;
else if ( listptr < 7.5f )
curpass1.w = jlinind;
else if ( listptr < 8.5f )
curpass2.x = jlinind;
else if ( listptr < 9.5f )
curpass2.y = jlinind;
else if ( listptr < 10.5f )
curpass2.z = jlinind;
else if ( listptr < 11.5f )
curpass2.w = jlinind;
else if ( listptr < 12.5f )
curpass3.x = jlinind;
else if ( listptr < 13.5f )
curpass3.y = jlinind;
else if ( listptr < 14.5f ) {
curpass3.z = jlinind;
}
else if ( listptr < 15.5f ) {
//We're done for this pass
//The 16th slot also serves as the resume point for the next pass
curpass3.w = jlinind;
breakflag = -1.0f;
}
listptr += 1.0f;
}
}
//Advance to the next j atom in all three index forms
jlinind += 1.0f;
exclind.y += 1.0f;
jind.x += 1.0f;
}
jind.x = 0.0f;
jind.y += 1.0f;
}
}
//Precomputes the Lennard-Jones sigma sums and epsilon products for the
//neighbors listed in nlist0 and nlist1, to save an indirect fetch (and a
//few flops) in the force kernel. The charge product is not done this way
//because charges have to be fetched anyway with the positions.
//
//For each of the four linear j indices in a neighbor-list element:
//    sig = sigeps[i].x + sigeps[j].x
//    eps = sigeps[i].y * sigeps[j].y
//where i is the element at wpos and j is converted to a 2D gather index
//using AtomStrWidth.
kernel void knl_precompute_sigeps(
      float AtomStrWidth,
      iter float2 wpos<>,
      float2 sigeps[][],      //x=sigma, y=epsilon
      float4 nlist0<>,
      float4 nlist1<>,
      out float4 sig0<>,
      out float4 eps0<>,
      out float4 sig1<>,
      out float4 eps1<>
      )
{
   float2 at;
   float4 row, col;
   float2 self, nbrA, nbrB, nbrC, nbrD;

   self = sigeps[ wpos ];

   //1st nlist set: linear index -> ( column, row )
   row  = floor( nlist0 / AtomStrWidth );
   col  = nlist0 - row * AtomStrWidth;

   at   = float2( col.x, row.x );
   nbrA = sigeps[ at ];
   at   = float2( col.y, row.y );
   nbrB = sigeps[ at ];
   at   = float2( col.z, row.z );
   nbrC = sigeps[ at ];
   at   = float2( col.w, row.w );
   nbrD = sigeps[ at ];

   sig0 = float4( self.x + nbrA.x, self.x + nbrB.x, self.x + nbrC.x, self.x + nbrD.x );
   eps0 = float4( self.y * nbrA.y, self.y * nbrB.y, self.y * nbrC.y, self.y * nbrD.y );

   //2nd nlist set
   row  = floor( nlist1 / AtomStrWidth );
   col  = nlist1 - row * AtomStrWidth;

   at   = float2( col.x, row.x );
   nbrA = sigeps[ at ];
   at   = float2( col.y, row.y );
   nbrB = sigeps[ at ];
   at   = float2( col.z, row.z );
   nbrC = sigeps[ at ];
   at   = float2( col.w, row.w );
   nbrD = sigeps[ at ];

   sig1 = float4( self.x + nbrA.x, self.x + nbrB.x, self.x + nbrC.x, self.x + nbrD.x );
   eps1 = float4( self.y * nbrA.y, self.y * nbrB.y, self.y * nbrC.y, self.y * nbrD.y );
}
platforms/brook/src/gpu/kpdihs.br
deleted
100644 → 0
View file @
cc8b4de0
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Periodic dihedrals.
//
//Input is a stream of quartets i, j, k, l plus one float4 of parameters
//per dihedral, parms = ( cp, phi0, mult, 0.0 ). The output is four float3
//streams fi, fj, fk, fl.
//
//The angle derivative used for the forces is
//    ddphi = -cp * mult * sin( mult*phi - phi0 )
//
//If by any chance this kernel becomes the bottleneck, we will
//optimize, but for now, this is kept pretty simple.
kernel void kpdih(
      float xstrwidth,   //stream width for x
      float4 xq[][],     //particle coordinates and charges
      float4 atoms<>,    //ijkl quartets (linear atom indices)
      float4 parms<>,    //parms = ( cp, phi0, mult, 0.0 )
      out float3 fi<>,   //output forces for i, j, k, l
      out float3 fj<>,
      out float3 fk<>,
      out float3 fl<>
      ) {
   float3 r_ij, r_kj, r_kl;
   float2 ai, aj, ak, al;
   float3 m, n;
   float sgnphi;
   float phi, ddphi, mdphi;
   float3 u, v, s;
   float nrkj, nrkj2, msq, nsq, cos_phi;

   //Convert from linear indices to 2D indices into x
   //If this kernel is compute bound, we can do this
   //conversion before-hand and feed in the 2D coordinates
   ai.y = floor( atoms.x / xstrwidth );
   ai.x = atoms.x - ai.y * xstrwidth;
   aj.y = floor( atoms.y / xstrwidth );
   aj.x = atoms.y - aj.y * xstrwidth;
   ak.y = floor( atoms.z / xstrwidth );
   ak.x = atoms.z - ak.y * xstrwidth;
   al.y = floor( atoms.w / xstrwidth );
   al.x = atoms.w - al.y * xstrwidth;

   r_ij = xq[ai].xyz - xq[aj].xyz;                           //3
   r_kj = xq[ak].xyz - xq[aj].xyz;                           //3
   r_kl = xq[ak].xyz - xq[al].xyz;                           //3

   //m and n are the normals of the i-j-k and j-k-l planes
   m = cross( r_ij, r_kj );                                  //9
   n = cross( r_kj, r_kl );                                  //9
   msq = dot(m, m);                                          //5
   nsq = dot(n, n);                                          //5

   //clamp guards acos against round-off pushing |cos| above 1
   cos_phi = clamp( dot(m, n)/sqrt(msq*nsq), -1.0, 1.0 );    //8
   sgnphi = sign( dot( r_ij, n ) );                          //5
   phi = sgnphi * acos( cos_phi );                           //2

   mdphi = parms.z * phi - parms.y;                          //2
   ddphi = - parms.x * parms.z * sin( mdphi );               //3

   nrkj2 = dot( r_kj, r_kj );                                //5
   nrkj = sqrt( nrkj2 );                                     //1

   //forces on the end atoms act along the plane normals
   fi = -ddphi * nrkj / msq * m;                             //5
   fl = ddphi * nrkj / nsq * n;                              //5

   //forces on the two middle atoms follow from fi, fl and the projections onto r_kj
   u = dot( r_ij, r_kj ) / nrkj2 * fi;                       //9
   v = dot( r_kl, r_kj ) / nrkj2 * fl;                       //9
   s = u - v;                                                //3
   fj = s - fi;                                              //3
   fk = -(s + fl);                                           //3
   //Total : 100 flops
}
platforms/brook/src/gpu/krbdihs.br
deleted
100644 → 0
View file @
cc8b4de0
/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/
//Ryckaert-Bellemans dihedrals, needed for Amber/OPLS ff's
//
//Input is a stream of quartets i, j, k, l and the output is
//four float3 streams fi, fj, fk, fl.
//If by any chance this kernel becomes the bottleneck, we will
//optimize, but for now, this is kept pretty simple.
//To keep things streaming, we have a stream of 6 parameters (a float4 and float2)
//for each dihedral. parm03.x (parameter 0) is not read by this kernel.
kernel void krbdih(
      float xstrwidth,   //stream width for x
      float4 xq[][],     //particle coordinates and charges
      float4 atoms<>,    //ijkl quartets (linear atom indices)
      float4 parm03<>,   //params 0-3
      float2 parm45<>,   //params 4 and 5
      out float3 fi<>,   //output forces for i, j, k, l
      out float3 fj<>,
      out float3 fk<>,
      out float3 fl<>
      ) {
   float3 r_ij, r_kj, r_kl;
   float2 ai, aj, ak, al;
   float3 m, n;
   float sgnphi;
   float ddphi;
   float3 u, v, s;
   float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi;

   //Convert from linear indices to 2D indices into x
   //If this kernel is compute bound, we can do this
   //conversion before-hand and feed in the 2D coordinates
   ai.y = floor( atoms.x / xstrwidth );
   ai.x = atoms.x - ai.y * xstrwidth;
   aj.y = floor( atoms.y / xstrwidth );
   aj.x = atoms.y - aj.y * xstrwidth;
   ak.y = floor( atoms.z / xstrwidth );
   ak.x = atoms.z - ak.y * xstrwidth;
   al.y = floor( atoms.w / xstrwidth );
   al.x = atoms.w - al.y * xstrwidth;

   r_ij = xq[ai].xyz - xq[aj].xyz;                   //3
   r_kj = xq[ak].xyz - xq[aj].xyz;                   //3
   r_kl = xq[ak].xyz - xq[al].xyz;                   //3

   //m and n are the normals of the i-j-k and j-k-l planes
   m = cross( r_ij, r_kj );                          //9
   n = cross( r_kj, r_kl );                          //9
   msq = dot(m, m);                                  //5
   nsq = dot(n, n);                                  //5
   cos_phi = dot(m, n)/sqrt(msq*nsq);                //8 (sqrt=1)

   //Switching to "polymer convention"
   //See gromacs code
   cos_phi = -cos_phi;
   sgnphi = sign( dot(r_ij, n) );                    //5
   //clamp keeps the sqrt argument in [0,1] despite round-off in cos_phi
   sin_phi = -sgnphi*sqrt( clamp( 1.0 - cos_phi * cos_phi, 0.0, 1.0) ); //3

   //ddphi is basically sum_{i=1}^5 i parm_i cosphi^{i-1}
   //(evaluated below in nested form, highest power first)
   //This might not be the best way to use the
   //4-way mads, but for now we'll let fxc figure it
   //out.
   //If we precompute some ratios of the parameters
   //we can use the 4-way mads better
   ddphi = 5.0 * parm45.y;
   ddphi = 4.0 * parm45.x + ddphi * cos_phi;
   ddphi = 3.0 * parm03.w + ddphi * cos_phi;
   ddphi = 2.0 * parm03.z + ddphi * cos_phi;
   ddphi = parm03.y + ddphi * cos_phi;
   ddphi = -ddphi * sin_phi;                         //13 flops total for ddphi

   nrkj2 = dot( r_kj, r_kj );                        //5
   nrkj = sqrt( nrkj2 );                             //1

   //forces on the end atoms act along the plane normals
   fi = -ddphi * nrkj / msq * m;                     //5
   fl = ddphi * nrkj / nsq * n;                      //5

   //forces on the two middle atoms follow from fi, fl and the projections onto r_kj
   u = dot( r_ij, r_kj ) / nrkj2 * fi;               //9
   v = dot( r_kl, r_kj ) / nrkj2 * fl;               //9
   s = u - v;                                        //3
   fj = s - fi;                                      //3
   fk = -(s + fl);                                   //3
   //Total flops: 109 per rb torsion.
}
platforms/brook/src/gpu/kshakeh.h
0 → 100644
View file @
cb130f92
/* Host-side declaration of kshakeh_fix1: four scalar constants followed by
 * eight Brook stream handles. Which streams are read and which are written
 * is not visible from this declaration. */
void kshakeh_fix1( const float nit,
                   const float strwidth,
                   const float invmH,
                   const float omega,
                   ::brook::stream atoms,
                   ::brook::stream posq,
                   ::brook::stream posqp,
                   ::brook::stream params,
                   ::brook::stream cposq0,
                   ::brook::stream cposq1,
                   ::brook::stream cposq2,
                   ::brook::stream cposq3 );
/* Host-side declaration of kshakeh_fix2: four scalar constants followed by
 * eight Brook stream handles. Which streams are read and which are written
 * is not visible from this declaration. */
void kshakeh_fix2( const float nit,
                   const float strwidth,
                   const float invmH,
                   const float omega,
                   ::brook::stream atoms,
                   ::brook::stream posq,
                   ::brook::stream posqp,
                   ::brook::stream params,
                   ::brook::stream cposq0,
                   ::brook::stream cposq1,
                   ::brook::stream cposq2,
                   ::brook::stream cposq3 );
/* Host-side declaration of kshakeh_update: one scalar constant followed by
 * seven Brook stream handles. Which streams are read and which are written
 * is not visible from this declaration. */
void kshakeh_update( const float strwidth,
                     ::brook::stream invmap,
                     ::brook::stream posq,
                     ::brook::stream cposq0,
                     ::brook::stream cposq1,
                     ::brook::stream cposq2,
                     ::brook::stream cposq3,
                     ::brook::stream oposq );
/* Host-side declaration of kshakeh: four scalar constants followed by
 * eight Brook stream handles. Which streams are read and which are written
 * is not visible from this declaration. */
void kshakeh( const float nit,
              const float strwidth,
              const float invmH,
              const float omega,
              ::brook::stream atoms,
              ::brook::stream posq,
              ::brook::stream posqp,
              ::brook::stream params,
              ::brook::stream cposq0,
              ::brook::stream cposq1,
              ::brook::stream cposq2,
              ::brook::stream cposq3 );
/* Host-side declaration of kshakeh_update1_fix1: two scalar constants
 * followed by nine Brook stream handles. Which streams are read and which
 * are written is not visible from this declaration. */
void kshakeh_update1_fix1( const float strwidth,
                           const float sdpc1,
                           ::brook::stream invmap,
                           ::brook::stream posq,
                           ::brook::stream posqp,
                           ::brook::stream vPrime,
                           ::brook::stream cposq0,
                           ::brook::stream cposq1,
                           ::brook::stream cposq2,
                           ::brook::stream cposq3,
                           ::brook::stream oposq );
/* Host-side declaration of kshakeh_update1_fix1Old: one scalar constant
 * followed by seven Brook stream handles. Which streams are read and which
 * are written is not visible from this declaration. */
void kshakeh_update1_fix1Old( const float strwidth,
                              ::brook::stream invmap,
                              ::brook::stream posq,
                              ::brook::stream cposq0,
                              ::brook::stream cposq1,
                              ::brook::stream cposq2,
                              ::brook::stream cposq3,
                              ::brook::stream oposq );
/* Host-side declaration of kshakeh_update2_fix1: one scalar constant
 * followed by eight Brook stream handles. Which streams are read and which
 * are written is not visible from this declaration. */
void kshakeh_update2_fix1( const float strwidth,
                           ::brook::stream invmap,
                           ::brook::stream posq,
                           ::brook::stream posqp,
                           ::brook::stream cposq0,
                           ::brook::stream cposq1,
                           ::brook::stream cposq2,
                           ::brook::stream cposq3,
                           ::brook::stream oposq );
platforms/brook/src/gpu/kupdatesd.h
0 → 100644
View file @
cb130f92
/* Host-side declaration of kupdate_sd1: seven scalar constants followed by
 * ten Brook stream handles. Which streams are read and which are written
 * is not visible from this declaration. */
void kupdate_sd1( const float xstrwidth,
                  const float gstrwidth,
                  const float goffset,
                  const float cem,
                  const float pc1,
                  const float pc2,
                  const float pc3,
                  ::brook::stream sdpc,
                  ::brook::stream fgauss,
                  ::brook::stream sd2X,
                  ::brook::stream posq,
                  ::brook::stream f,
                  ::brook::stream v,
                  ::brook::stream invmass,
                  ::brook::stream sd1V,
                  ::brook::stream vnew,
                  ::brook::stream posqp );
/* Host-side declaration of kupdate_sd2: five scalar constants followed by
 * nine Brook stream handles. Which streams are read and which are written
 * is not visible from this declaration. */
void kupdate_sd2( const float xstrwidth,
                  const float gstrwidth,
                  const float goffset,
                  const float pc1,
                  const float pc2,
                  ::brook::stream sdpc,
                  ::brook::stream fgauss,
                  ::brook::stream sd1V,
                  ::brook::stream posq,
                  ::brook::stream posqp,
                  ::brook::stream vnew,
                  ::brook::stream sd2X,
                  ::brook::stream v,
                  ::brook::stream posqp2 );
/* Host-side declaration of kpermute_vectors: one scalar constant followed
 * by three Brook stream handles. Which streams are read and which are
 * written is not visible from this declaration. */
void kpermute_vectors( const float gstrwidth,
                       ::brook::stream perm,
                       ::brook::stream gvin,
                       ::brook::stream gvout );
/* Host-side declaration of kupdate_sd2_fix1: five scalar constants followed
 * by nine Brook stream handles. Which streams are read and which are
 * written is not visible from this declaration. */
void kupdate_sd2_fix1( const float xstrwidth,
                       const float gstrwidth,
                       const float goffset,
                       const float pc1,
                       const float pc2,
                       ::brook::stream sdpc,
                       ::brook::stream fgauss,
                       ::brook::stream sd1V,
                       ::brook::stream posq,
                       ::brook::stream posqp,
                       ::brook::stream vnew,
                       ::brook::stream sd2X,
                       ::brook::stream v,
                       ::brook::stream posqp2 );
/* Host-side declaration of kupdate_sd1_fix1: seven scalar constants
 * followed by ten Brook stream handles. Which streams are read and which
 * are written is not visible from this declaration. */
void kupdate_sd1_fix1( const float xstrwidth,
                       const float gstrwidth,
                       const float goffset,
                       const float cem,
                       const float pc1,
                       const float pc2,
                       const float pc3,
                       ::brook::stream sdpc,
                       ::brook::stream fgauss,
                       ::brook::stream sd2X,
                       ::brook::stream posq,
                       ::brook::stream f,
                       ::brook::stream v,
                       ::brook::stream invmass,
                       ::brook::stream sd1V,
                       ::brook::stream vnew,
                       ::brook::stream posqp );
/* Host-side declaration of kupdate_sd2_fix1_FixedRV: five scalar constants
 * followed by nine Brook stream handles. Which streams are read and which
 * are written is not visible from this declaration. */
void kupdate_sd2_fix1_FixedRV( const float xstrwidth,
                               const float gstrwidth,
                               const float goffset,
                               const float pc1,
                               const float pc2,
                               ::brook::stream sdpc,
                               ::brook::stream fgauss,
                               ::brook::stream sd1V,
                               ::brook::stream posq,
                               ::brook::stream posqp,
                               ::brook::stream vnew,
                               ::brook::stream sd2X,
                               ::brook::stream v,
                               ::brook::stream posqp2 );
/* Host-side declaration of kupdate_sd1_fix1_FixedRV: seven scalar constants
 * followed by ten Brook stream handles. Which streams are read and which
 * are written is not visible from this declaration. */
void kupdate_sd1_fix1_FixedRV( const float xstrwidth,
                               const float gstrwidth,
                               const float goffset,
                               const float cem,
                               const float pc1,
                               const float pc2,
                               const float pc3,
                               ::brook::stream sdpc,
                               ::brook::stream fgauss,
                               ::brook::stream sd2X,
                               ::brook::stream posq,
                               ::brook::stream f,
                               ::brook::stream v,
                               ::brook::stream invmass,
                               ::brook::stream sd1V,
                               ::brook::stream vnew,
                               ::brook::stream posqp );
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment