/**************************************************************** //Linear index of i particle, divided by 2 because we unroll i by 2 * This file is part of the gpu acceleration library for gromacs. * Author: Mark Friedrichs * * This kernel was developed in collaboration with * * Copyright (C) Pande Group, Stanford, 2006 *****************************************************************/ /* After forces above, we have the forces for even numbered particles * in one stream, odd numbered particles in another. * In each stream, the forces are in several parts depending on how * many times we replicated the input stream. * * To avoid an extra kernel to zero forces, this sets the forces * rather than adding to it. * */ kernel void kMergeFloat( float repfac, float atomStrWidth, float pstreamStrWidth, float natoms, float iUnroll, iter float2 count<>, float pstream1[][], float pstream2[][], out float outstream<> ) { float linind; float2 pindex; float odd; float i; //convert to linear atom index linind = count.x + count.y * atomStrWidth; //If odd or even, we pick from diferent streams. odd = linind - floor( linind / iUnroll ) * iUnroll; //Now linear index is the index into partial_streams linind = floor( linind / iUnroll ); outstream = 0.0f; //If we have predicated conditionals, we should //keep the conditional inside the loop for ( i = 0; i < repfac; i+=1.0f ) { pindex.y = floor( linind / pstreamStrWidth ); pindex.x = linind - pindex.y * pstreamStrWidth; if ( odd > 0.5f ) { //is odd outstream += pstream2[ pindex ]; } else { outstream += pstream1[ pindex ]; } linind += natoms/iUnroll; } } kernel void kMergeFloat4( float repfac, float atomStrWidth, float pstreamStrWidth, float natoms, float iUnroll, float4 pstream1[][], float4 pstream2[][], out float4 outstream<> ) { float linind; float2 pindex; float odd; float i; //convert to linear atom index linind = (indexof outstream).x + ( (indexof outstream).y * atomStrWidth ); //If odd or even, we pick from diferent streams. odd = linind - floor( linind / iUnroll ) * iUnroll; //Now linear index is the index into partial_streams linind = floor( linind / iUnroll ); outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f ); //If we have predicated conditionals, we should //keep the conditional inside the loop for ( i = 0.0f; i < repfac; i+= 1.0f ) { pindex.y = floor( linind / pstreamStrWidth ); pindex.x = linind - pindex.y * pstreamStrWidth; if ( odd > 0.5f ) { //is odd outstream += pstream2[ pindex ]; } else { outstream += pstream1[ pindex ]; } linind += natoms/iUnroll; } } /* After forces above, we have the forces for even numbered particles * in one stream, odd numbered particles in another. * In each stream, the forces are in several parts depending on how * many times we replicated the input stream. * * To avoid an extra kernel to zero forces, this sets the forces * rather than adding to it. * */ kernel void kMergeFloat4_4X( float repfac, float atomStrWidth, float pstreamStrWidth, float natoms, float iUnroll, float4 pstream1[][], float4 pstream2[][], float4 pstream3[][], float4 pstream4[][], out float4 outstream<> ) { float linind; float2 pindex; float odd; float i; //convert to linear atom index linind = (indexof outstream).x + ( (indexof outstream).y * atomStrWidth ); //If odd or even, we pick from diferent streams. odd = linind - floor( linind / iUnroll ) * iUnroll; //Now linear index is the index into partial_streams linind = floor( linind / iUnroll ); outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f ); //If we have predicated conditionals, we should //keep the conditional inside the loop for ( i = 0.0f; i < repfac; i+= 1.0f ) { //pindex.y = floor( linind / pstreamStrWidth ); //pindex.x = linind - pindex.y * pstreamStrWidth; pindex.y = round( (linind - fmod( linind, pstreamStrWidth ))/pstreamStrWidth ); //bixia modify pindex.x = linind - pindex.y * pstreamStrWidth; outstream += float4( linind, odd, pindex.x, pindex.y ); /* if ( odd < 0.5f ) { //is odd outstream += pstream1[ pindex ]; } else if( odd < 1.5f ){ outstream += pstream2[ pindex ]; } else if( odd < 2.5f ){ outstream += pstream3[ pindex ]; } else { outstream += pstream4[ pindex ]; } */ linind += natoms/iUnroll; } } kernel void kMergeFloat4_4( float repfac, float atomStreamWidth, float pStreamWidth, float natoms, float roundNatoms, float iUnroll, float4 pstream1[][], float4 pstream2[][], float4 pstream3[][], float4 pstream4[][], out float4 outstream<> ) { float atomIndex, forceIndex, qIndex, qOff; float2 pindex; float i; // given atom index find force indices and streams pindex = indexof( outstream ); atomIndex = pindex.x + pindex.y*atomStreamWidth; forceIndex = atomIndex; outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f ); for( i = 0.0f; i < repfac; i += 1.0f ){ // qIndex = floor( forceIndex/iUnroll ); qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll ); qOff = forceIndex - iUnroll*qIndex; // pindex.y = floor( qIndex/ pStreamWidth ); pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth ); // pindex.x = qIndex - pindex.y*pStreamWidth + qOff; pindex.x = qIndex - pindex.y*pStreamWidth; // outstream += float4( forceIndex, qIndex, pindex.x, pindex.y ); if ( qOff < 0.5f ){ outstream += pstream1[ pindex ]; } else if( qOff < 1.5f ){ outstream += pstream2[ pindex ]; } else if( qOff < 2.5f ){ outstream += pstream3[ pindex ]; } else { outstream += pstream4[ pindex ]; } forceIndex += roundNatoms; } } kernel void kPostObcLoop1( float repfac, float atomStreamWidth, float pStreamWidth, float natoms, float roundNatoms, float iUnroll, float4 pstream1[][], float4 pstream2[][], float4 pstream3[][], float4 pstream4[][], float obcChain<>, float bornRadii<>, out float4 outstream<>, out float bornRadii2Force<> ) { // --------------------------------------------------------------------------------------- float atomIndex, forceIndex, qIndex, qOff; float2 pindex; float i; float4 o1,o2,o3,o4; float4 tmp; float4 zero4; // given atom index find force indices and streams pindex = indexof( outstream ); atomIndex = pindex.x + pindex.y*atomStreamWidth; forceIndex = atomIndex; zero4 = float4( 0.0f, 0.0f, 0.0f, 0.0f ); outstream = zero4; for( i = 0.0f; i < repfac; i += 1.0f ){ // qIndex = floor( forceIndex/iUnroll ); qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll ); qOff = forceIndex - iUnroll*qIndex; // pindex.y = floor( qIndex/ pStreamWidth ); pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth ); // pindex.x = qIndex - pindex.y*pStreamWidth + qOff; pindex.x = qIndex - pindex.y*pStreamWidth; // outstream += float4( forceIndex, qIndex, pindex.x, pindex.y ); // This is going to cause really divergent code and we are // going to end up doing all the fetches anyway... if ( qOff < 0.5f ){ outstream += pstream1[ pindex ]; } else if( qOff < 1.5f ){ outstream += pstream2[ pindex ]; } else if( qOff < 2.5f ){ outstream += pstream3[ pindex ]; } else { outstream += pstream4[ pindex ]; } // o1 = pstream1[ pindex ]; // o2 = pstream2[ pindex ]; // o3 = pstream3[ pindex ]; // o4 = pstream4[ pindex ]; // tmp = qOff < 0.5f ? o1 : o2; // tmp = qOff < 1.5f ? tmp : o3; // tmp = qOff < 2.5f ? tmp : o4; // outstream += tmp; forceIndex += roundNatoms; } bornRadii2Force = obcChain*bornRadii*bornRadii*outstream.w; } // The inner loop by definition creates divergent paths. Chances are // fair that we will take all sides of the branch anyway, so this // verion uses manual predication kernel void kPostObcLoop1_nobranch( float repfac, float atomStreamWidth, float pStreamWidth, float natoms, float roundNatoms, float iUnroll, float4 pstream1[][], float4 pstream2[][], float4 pstream3[][], float4 pstream4[][], float obcChain<>, float bornRadii<>, out float4 outstream<>, out float bornRadii2Force<> ) { // --------------------------------------------------------------------------------------- float atomIndex, forceIndex, qIndex, qOff; float2 pindex; float i; float4 o1,o2,o3,o4; float4 tmp; // given atom index find force indices and streams pindex = indexof( outstream ); atomIndex = pindex.x + pindex.y*atomStreamWidth; forceIndex = atomIndex; outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f ); for( i = 0.0f; i < repfac; i += 1.0f ){ // qIndex = floor( forceIndex/iUnroll ); qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll ); qOff = forceIndex - iUnroll*qIndex; // pindex.y = floor( qIndex/ pStreamWidth ); pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth ); // pindex.x = qIndex - pindex.y*pStreamWidth + qOff; pindex.x = qIndex - pindex.y*pStreamWidth; o1 = pstream1[ pindex ]; o2 = pstream2[ pindex ]; o3 = pstream3[ pindex ]; o4 = pstream4[ pindex ]; tmp = qOff < 0.5f ? o1 : o2; tmp = qOff < 1.5f ? tmp : o3; tmp = qOff < 2.5f ? tmp : o4; outstream += tmp; forceIndex += roundNatoms; } bornRadii2Force = obcChain*bornRadii*bornRadii*outstream.w; } kernel void kSetValue4( float value, out float4 outstream<> ){ outstream = float4( value, value, value, value ); } kernel void kSetValue3( float value, out float3 outstream<> ){ outstream = float3( value, value, value ); } kernel void kSetValue2( float value, out float2 outstream<> ){ outstream = float2( value, value ); } kernel void kSetValue1( float value, out float outstream<> ){ outstream = value; } kernel void kCheck( float natoms, float atomStrWidth, float pstreamStrWidth, float unroll, out float4 outstream<> ) { float linind, forceIndex, atomIndex; float2 pindex; pindex = indexof( outstream ); forceIndex = unroll*(pindex.x + pindex.y*pstreamStrWidth); atomIndex = fmod( forceIndex, natoms ); outstream = float4( pindex.x, pindex.y, forceIndex, atomIndex ); } /* After forces above, we have the forces for even numbered particles * in one stream, odd numbered particles in another. * In each stream, the forces are in several parts depending on how * many times we replicated the input stream. * * To avoid an extra kernel to zero forces, this sets the forces * rather than adding to it. * */ kernel void kAddAndMergeFloat4( float repfac, float atomStrWidth, float pstreamStrWidth, float natoms, float iUnroll, float4 inStream<>, float4 pstream1[][], float4 pstream2[][], out float4 outstream<> ) { float linind; float2 pindex; float odd; float i; float floor_linind_iUnroll; linind = (indexof outstream).x + (indexof outstream).y * atomStrWidth; //If odd or even, we pick from diferent streams. //odd = linind - floor( linind / iUnroll ) * iUnroll; //Now linear index is the index into partial_streams //linind = floor( linind / iUnroll ); floor_linind_iUnroll = round( (linind - fmod(linind, iUnroll))/iUnroll ); odd = linind - floor_linind_iUnroll * iUnroll;//bixia modify linind = floor_linind_iUnroll; //bixia modify outstream = inStream; outstream.w = 0.0f; //If we have predicated conditionals, we should //keep the conditional inside the loop for ( i = 0.0f; i < repfac; i+= 1.0f ) { //pindex.y = floor( linind / pstreamStrWidth ); pindex.y = round( (linind - fmod( linind, pstreamStrWidth ))/pstreamStrWidth ); //bixia modify pindex.x = linind - pindex.y * pstreamStrWidth; if ( odd > 0.5f ) { //is odd outstream += pstream2[ pindex ]; } else { outstream += pstream1[ pindex ]; } linind += natoms/iUnroll; } } kernel void kAddAndMergeFloat4_4( float repfac, float atomStreamWidth, float pStreamWidth, float natoms, float roundNatoms, float iUnroll, float4 inStream<>, float4 pstream1[][], float4 pstream2[][], float4 pstream3[][], float4 pstream4[][], out float4 outstream<> ){ float atomIndex, forceIndex, qIndex, qOff; float2 pindex; float i; // given atom index find force indices and streams pindex = indexof( outstream ); atomIndex = pindex.x + pindex.y*atomStreamWidth; forceIndex = atomIndex; // add current forces in inStream to forces stored in pstreams // the .w entry is Born sum values; it will be used to calculate the // Born radii and obcChain term outstream = inStream; outstream.w = 0.0f; //outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // sum over j-loop 'duplications' by gathering from pstreams for( i = 0.0f; i < repfac; i += 1.0f ){ // qIndex = floor( forceIndex/iUnroll ); qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll ); qOff = forceIndex - iUnroll*qIndex; // pindex.y = floor( qIndex/ pStreamWidth ); pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth ); // pindex.x = qIndex - pindex.y*pStreamWidth + qOff; pindex.x = qIndex - pindex.y*pStreamWidth; if( qOff < 0.5f ){ outstream += pstream1[ pindex ]; } else if( qOff < 1.5f ){ outstream += pstream2[ pindex ]; } else if( qOff < 2.5f ){ outstream += pstream3[ pindex ]; } else { outstream += pstream4[ pindex ]; } forceIndex += roundNatoms; } } kernel void kPostObcLoop2( float repfac, float atomStreamWidth, float pStreamWidth, float natoms, float roundNatoms, float iUnroll, float conversion, float mergeNonObcForces, float4 inObcForces<>, float3 nonObcForces<>, float4 pstream1[][], float4 pstream2[][], float4 pstream3[][], float4 pstream4[][], float atomicRadii<>, out float bornRadii<>, out float obcChain<>, out float3 outForces<> ){ // --------------------------------------------------------------------------------------- float atomIndex, forceIndex, qIndex, qOff; float2 pindex; float i; float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate; float2 iAtom; float4 forces; float expPlus, expMinus; // --------------------------------------------------------------------------------------- // constants -- OBC Type II const float alphaObc = 1.0f; const float betaObc = 0.8f; const float gammaObc = 4.85f; const float dielectricOffset = 0.009f; // --------------------------------------------------------------------------------------- // given atom index find force indices and streams pindex = indexof( outForces ); atomIndex = pindex.x + pindex.y*atomStreamWidth; forceIndex = atomIndex; // add current forces in inStream to forces stored in pstreams // the .w entry is Born sum values; it will be used to calculate the // Born radii and obcChain term forces = inObcForces; forces.w = 0.0f; //forces = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // sum over j-loop 'duplications' by gathering from pstreams for( i = 0.0f; i < repfac; i += 1.0f ){ // qIndex = floor( forceIndex/iUnroll ); qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll ); qOff = forceIndex - iUnroll*qIndex; // pindex.y = floor( qIndex/ pStreamWidth ); pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth ); // pindex.x = qIndex - pindex.y*pStreamWidth + qOff; pindex.x = qIndex - pindex.y*pStreamWidth; if( qOff < 0.5f ){ forces += pstream1[ pindex ]; } else if( qOff < 1.5f ){ forces += pstream2[ pindex ]; } else if( qOff < 2.5f ){ forces += pstream3[ pindex ]; } else { forces += pstream4[ pindex ]; } forceIndex += roundNatoms; } // compute Born radii and ObcChain atomicRadiiOffset = atomicRadii - dielectricOffset; bornSum = forces.w; bornSum *= 0.5f*atomicRadiiOffset; sum2 = bornSum*bornSum; sum3 = bornSum*sum2; // Tanh does not exist? // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ] // tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 ); tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3; expPlus = exp( tanhSum ); expMinus = 1.0f/expPlus; tanhSum = ( expPlus - expMinus )/( expPlus + expMinus ); bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii ); obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 ); obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii; if( atomIndex >= natoms ){ bornRadii = 0.0f; obcChain = 0.0f; } // add converted new forces to non-Obc forces outForces = conversion*forces.xyz; if( mergeNonObcForces > 0.1f ){ outForces += nonObcForces; } } kernel void kPostObcLoop2_nobranch( float repfac, float atomStreamWidth, float pStreamWidth, float natoms, float roundNatoms, float iUnroll, float conversion, float mergeNonObcForces, float4 inObcForces<>, float3 nonObcForces<>, float4 pstream1[][], float4 pstream2[][], float4 pstream3[][], float4 pstream4[][], float atomicRadii<>, out float bornRadii<>, out float obcChain<>, out float3 outForces<> ){ // --------------------------------------------------------------------------------------- float atomIndex, forceIndex, qIndex, qOff; float2 pindex; float i; float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate; float4 o1,o2,o3,o4; float4 tmp; float2 iAtom; float4 forces; float expPlus, expMinus; // --------------------------------------------------------------------------------------- // constants -- OBC Type II const float alphaObc = 1.0f; const float betaObc = 0.8f; const float gammaObc = 4.85f; const float dielectricOffset = 0.009f; // --------------------------------------------------------------------------------------- // given atom index find force indices and streams pindex = indexof( outForces ); atomIndex = pindex.x + pindex.y*atomStreamWidth; forceIndex = atomIndex; // add current forces in inStream to forces stored in pstreams // the .w entry is Born sum values; it will be used to calculate the // Born radii and obcChain term forces = inObcForces; forces.w = 0.0f; //forces = float4( 0.0f, 0.0f, 0.0f, 0.0f ); // sum over j-loop 'duplications' by gathering from pstreams for( i = 0.0f; i < repfac; i += 1.0f ){ // qIndex = floor( forceIndex/iUnroll ); qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll ); qOff = forceIndex - iUnroll*qIndex; // pindex.y = floor( qIndex/ pStreamWidth ); pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth ); // pindex.x = qIndex - pindex.y*pStreamWidth + qOff; pindex.x = qIndex - pindex.y*pStreamWidth; o1 = pstream1[ pindex ]; o2 = pstream2[ pindex ]; o3 = pstream3[ pindex ]; o4 = pstream4[ pindex ]; tmp = qOff < 0.5f ? o1 : o2; tmp = qOff < 1.5f ? tmp : o3; tmp = qOff < 2.5f ? tmp : o4; forces += tmp; forceIndex += roundNatoms; } // compute Born radii and ObcChain atomicRadiiOffset = atomicRadii - dielectricOffset; bornSum = forces.w; bornSum *= 0.5f*atomicRadiiOffset; sum2 = bornSum*bornSum; sum3 = bornSum*sum2; // Tanh does not exist? // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ] // tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 ); tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3; expPlus = exp( tanhSum ); expMinus = 1.0f/expPlus; tanhSum = ( expPlus - expMinus )/( expPlus + expMinus ); bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii ); obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 ); obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii; if( atomIndex >= natoms ){ bornRadii = 0.0f; obcChain = 0.0f; } // add converted new forces to non-Obc forces outForces = conversion*forces.xyz; if( mergeNonObcForces > 0.1f ){ outForces += nonObcForces; } } kernel void kPostCalculateBornRadii_nobranch( float repfac, float atomStreamWidth, float pStreamWidth, float natoms, float roundNatoms, float iUnroll, float conversion, float mergeNonObcForces, float4 pstream1[][], float atomicRadii<>, out float bornRadii<>, out float obcChain<> ){ // --------------------------------------------------------------------------------------- float atomIndex, forceIndex, qIndex, qOff; float2 pindex; float i; float sum2, sum3, bornSum, tanhSum, atomicRadiiOffset, obcIntermediate; float4 o1; float tmp; float2 iAtom; float4 forces; float expPlus, expMinus; // --------------------------------------------------------------------------------------- // constants -- OBC Type II const float alphaObc = 1.0f; const float betaObc = 0.8f; const float gammaObc = 4.85f; const float dielectricOffset = 0.009f; // --------------------------------------------------------------------------------------- // given atom index find force indices and streams pindex = indexof( obcChain ); atomIndex = pindex.x + pindex.y*atomStreamWidth; forceIndex = atomIndex; // add current forces in inStream to forces stored in pstreams // the .w entry is Born sum values; it will be used to calculate the // Born radii and obcChain term bornSum = 0.0f; // sum over j-loop 'duplications' by gathering from pstreams for( i = 0.0f; i < repfac; i += 1.0f ){ qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll ); qOff = forceIndex - iUnroll*qIndex; pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth ); pindex.x = qIndex - pindex.y*pStreamWidth; o1 = pstream1[ pindex ]; tmp = qOff < 0.5f ? o1.x : o1.y; tmp = qOff < 1.5f ? tmp : o1.z; tmp = qOff < 2.5f ? tmp : o1.w; bornSum += tmp; forceIndex += roundNatoms; } // compute Born radii and ObcChain atomicRadiiOffset = atomicRadii - dielectricOffset; bornSum *= 0.5f*atomicRadiiOffset; sum2 = bornSum*bornSum; sum3 = bornSum*sum2; // Tanh does not exist? // calculate [ exp(x) - exp(-x) ]/[ exp(x) + exp(-x) ] // tanhSum = tanh( bornSum - betaObc*sum2 + gammaObc*sum3 ); tanhSum = bornSum - betaObc*sum2 + gammaObc*sum3; expPlus = exp( tanhSum ); expMinus = 1.0f/expPlus; tanhSum = ( expPlus - expMinus )/( expPlus + expMinus ); bornRadii = 1.0f/( (1.0f/(atomicRadiiOffset)) - tanhSum/atomicRadii ); obcIntermediate = atomicRadiiOffset*( alphaObc - 2.0f*betaObc*bornSum + 3.0f*gammaObc*sum2 ); obcChain = (1.0f - tanhSum*tanhSum)*obcIntermediate/atomicRadii; if( atomIndex >= natoms ){ bornRadii = 0.0f; obcChain = 0.0f; } } /* Add forces from two streams */ kernel void kAddForces3_4( float conversion, float3 force1<>, float4 force2<>, out float3 outForce<> ){ outForce.xyz = force1 + conversion*force2.xyz; } /* Copy one stream to another */ kernel void kCopyFloat4( float4 inForce<>, out float4 outForce<> ){ outForce = inForce; } /* Copy one stream to another * */ kernel void kCopyFloat3To4( float3 inForce<>, out float4 outForce<> ){ // --------------------------------------------------------------------------------------- outForce.xyz = inForce; outForce.w = 0.0f; }