/****************************************************************
//Linear index of i particle, divided by 2 because we unroll i by 2
* This file is part of the gpu acceleration library for gromacs.
* Author: Mark Friedrichs
*
* This kernel was developed in collaboration with
*
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/

/* After forces above, we have the forces for even numbered particles
 * in one stream, odd numbered particles in another.
 * In each stream, the forces are in several parts depending on how
 * many times we replicated the input stream.
 *
 * To avoid an extra kernel to zero forces, this sets the forces
 * rather than adding to it.
 *
 * kMergeFloat: scalar merge of two partial streams.
 *
 *   repfac           number of partial copies summed per output element
 *   atomStrWidth     multiplied by count.y to linearize the 2D position
 *   pstreamStrWidth  row width used to turn a linear index into the 2D
 *                    gather index for the partial streams
 *   natoms, iUnroll  natoms/iUnroll is the stride between successive
 *                    partial copies; the remainder of the linear index
 *                    modulo iUnroll selects the stream (remainder 0 ->
 *                    pstream1, otherwise pstream2)
 *   count            iterator stream supplying the 2D output position
 *   pstream1/2       partial-sum gather streams
 *   outstream        receives the sum of the repfac partial values
 *
 * NOTE(review): this kernel still splits indices with floor( a / b ),
 * while later kernels in this file use round( (a - fmod(a,b))/b ).
 * Confirm whether the same change is wanted here.
 * */
kernel void kMergeFloat(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      iter float2 count<>,
      float pstream1[][],
      float pstream2[][],
      out float outstream<> ) {

   float linind;
   float2 pindex;
   float odd;
   float i;
   float stride;

   //convert to linear atom index
   linind = count.x + count.y * atomStrWidth;

   //If odd or even, we pick from different streams.
   odd = linind - floor( linind / iUnroll ) * iUnroll;

   //Now linear index is the index into partial_streams
   linind = floor( linind / iUnroll );

   outstream = 0.0f;

   //distance between successive partial copies (loop invariant)
   stride = natoms / iUnroll;

   //If we have predicated conditionals, we should
   //keep the conditional inside the loop
   for ( i = 0.0f; i < repfac; i += 1.0f ) {
      pindex.y = floor( linind / pstreamStrWidth );
      pindex.x = linind - pindex.y * pstreamStrWidth;
      if ( odd > 0.5f ) { //is odd
         outstream += pstream2[ pindex ];
      } else {
         outstream += pstream1[ pindex ];
      }
      linind += stride;
   }
}

/* float4 variant of kMergeFloat.
 *
 * The output position comes from indexof( outstream ) instead of an
 * iterator stream; otherwise the parameters have the same roles:
 * repfac partial values, spaced natoms/iUnroll apart, are summed from
 * pstream1 (remainder 0) or pstream2 (non-zero remainder) and written
 * to outstream, overwriting whatever was there.
 *
 * NOTE(review): still uses floor( a / b ) for the index split; see the
 * round/fmod form used by later kernels in this file.
 * */
kernel void kMergeFloat4(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 pstream1[][],
      float4 pstream2[][],
      out float4 outstream<> ) {

   float linind;
   float2 pindex;
   float odd;
   float i;
   float stride;

   //convert to linear atom index
   linind = (indexof outstream).x + ( (indexof outstream).y * atomStrWidth );

   //If odd or even, we pick from different streams.
   odd = linind - floor( linind / iUnroll ) * iUnroll;

   //Now linear index is the index into partial_streams
   linind = floor( linind / iUnroll );

   outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );

   //distance between successive partial copies (loop invariant)
   stride = natoms / iUnroll;

   //If we have predicated conditionals, we should
   //keep the conditional inside the loop
   for ( i = 0.0f; i < repfac; i += 1.0f ) {
      pindex.y = floor( linind / pstreamStrWidth );
      pindex.x = linind - pindex.y * pstreamStrWidth;
      if ( odd > 0.5f ) { //is odd
         outstream += pstream2[ pindex ];
      } else {
         outstream += pstream1[ pindex ];
      }
      linind += stride;
   }
}

/* Diagnostic variant of the four-stream merge.
 *
 * As written, this kernel does NOT read pstream1..pstream4. For each
 * of the repfac iterations it accumulates the computed index values
 * ( linind, odd, pindex.x, pindex.y ) into outstream, so the output
 * holds sums of index values rather than forces. The gather it stands
 * in for is kept below, disabled.
 *
 *   repfac           number of loop iterations
 *   atomStrWidth     multiplied by the y index to linearize the position
 *   pstreamStrWidth  row width used for the 2D index split
 *   natoms, iUnroll  natoms/iUnroll is added to linind every iteration;
 *                    iUnroll is also the modulus that produces 'odd'
 *   pstream1..4      accepted but currently unused
 *   outstream        receives the accumulated index values
 * */
kernel void kMergeFloat4_4X(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      out float4 outstream<> ) {

   float linind;
   float2 pindex;
   float odd;
   float i;
   float stride;

   //convert to linear atom index
   linind = (indexof outstream).x + ( (indexof outstream).y * atomStrWidth );

   //Remainder modulo iUnroll; it would select the stream in the
   //disabled gather below.
   odd = linind - floor( linind / iUnroll ) * iUnroll;

   //Now linear index is the index into partial_streams
   linind = floor( linind / iUnroll );

   outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );

   //distance between successive partial copies (loop invariant)
   stride = natoms / iUnroll;

   for ( i = 0.0f; i < repfac; i += 1.0f ) {

      //Row/column split. This round/fmod form replaced the earlier
      //floor( linind / pstreamStrWidth ) (bixia modification).
      pindex.y = round( (linind - fmod( linind, pstreamStrWidth ))/pstreamStrWidth );
      pindex.x = linind - pindex.y * pstreamStrWidth;

      //diagnostic output: accumulate the indices themselves
      outstream += float4( linind, odd, pindex.x, pindex.y );

      /* disabled gather:
      if ( odd < 0.5f ) {
         outstream += pstream1[ pindex ];
      } else if( odd < 1.5f ){
         outstream += pstream2[ pindex ];
      } else if( odd < 2.5f ){
         outstream += pstream3[ pindex ];
      } else {
         outstream += pstream4[ pindex ];
      }
      */

      linind += stride;
   }
}

/* Merge four partial streams into one float4 stream.
 *
 * For the output element at linear index
 *    atomIndex = x + y*atomStreamWidth
 * the kernel visits repfac force indices, starting at atomIndex and
 * advancing by roundNatoms each time. Every force index is split into
 *    qIndex = forceIndex / iUnroll   (integer part)
 *    qOff   = forceIndex - iUnroll*qIndex
 * qIndex is turned into a 2D gather index using pStreamWidth, and qOff
 * (0, 1, 2, anything larger) selects pstream1..pstream4. The gathered
 * values are summed; outstream is overwritten, not accumulated into.
 *
 *   natoms is accepted but not used by this kernel.
 * */
kernel void kMergeFloat4_4(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      out float4 outstream<> ) {

   float atomIndex, forceIndex, qIndex, qOff;
   float2 pindex;
   float i;

   // given atom index find force indices and streams
   pindex = indexof( outstream );
   atomIndex = pindex.x + pindex.y*atomStreamWidth;
   forceIndex = atomIndex;

   outstream = float4( 0.0f, 0.0f, 0.0f, 0.0f );

   for( i = 0.0f; i < repfac; i += 1.0f ){

      // integer quotient and remainder of forceIndex by iUnroll
      // (round/fmod form used in place of floor( forceIndex/iUnroll ))
      qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
      qOff = forceIndex - iUnroll*qIndex;

      // 2D gather index for qIndex
      // (round/fmod form used in place of floor( qIndex/pStreamWidth ))
      pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
      pindex.x = qIndex - pindex.y*pStreamWidth;

      if ( qOff < 0.5f ){
         outstream += pstream1[ pindex ];
      } else if( qOff < 1.5f ){
         outstream += pstream2[ pindex ];
      } else if( qOff < 2.5f ){
         outstream += pstream3[ pindex ];
      } else {
         outstream += pstream4[ pindex ];
      }

      forceIndex += roundNatoms;
   }
}

/* Fill every element of a float4 stream with (value, value, value, value) */
kernel void kSetValue4( float value, out float4 outstream<> ){
   outstream = float4( value, value, value, value );
}

/* Fill every element of a float3 stream with (value, value, value) */
kernel void kSetValue3( float value, out float3 outstream<> ){
   outstream = float3( value, value, value );
}

/* Fill every element of a float2 stream with (value, value) */
kernel void kSetValue2( float value, out float2 outstream<> ){
   outstream = float2( value, value );
}

/* Fill every element of a float stream with value */
kernel void kSetValue1( float value, out float outstream<> ){
   outstream = value;
}

/* Index-mapping check.
 *
 * For each output element writes
 *    ( x, y, forceIndex, atomIndex )
 * where (x, y) = indexof( outstream ),
 *    forceIndex = unroll*( x + y*pstreamStrWidth ) and
 *    atomIndex  = fmod( forceIndex, natoms ).
 *
 *   atomStrWidth is accepted but not used by this kernel.
 * */
kernel void kCheck(
      float natoms,
      float atomStrWidth,
      float pstreamStrWidth,
      float unroll,
      out float4 outstream<> ) {

   float forceIndex, atomIndex;
   float2 pindex;

   pindex = indexof( outstream );
   forceIndex = unroll*(pindex.x + pindex.y*pstreamStrWidth);
   atomIndex = fmod( forceIndex, natoms );

   outstream = float4( pindex.x, pindex.y, forceIndex, atomIndex );
}

/* After forces above, we have the forces for even numbered particles
 * in one stream, odd numbered particles in another.
 * In each stream, the forces are in several parts depending on how
 * many times we replicated the input stream.
 *
 * Unlike kMergeFloat4, this kernel adds to existing values: outstream
 * starts as inStream with its .w component reset to 0, and the repfac
 * partial values (spaced natoms/iUnroll apart) gathered from pstream1
 * (remainder 0) or pstream2 (non-zero remainder) are added on top,
 * all four components included.
 *
 *   repfac           number of partial copies summed per output element
 *   atomStrWidth     multiplied by the y index to linearize the position
 *   pstreamStrWidth  row width used for the 2D gather index
 *   natoms, iUnroll  stride between copies and stream-selection modulus
 *   inStream         values the partial sums are added to (.w ignored)
 * */
kernel void kAddAndMergeFloat4(
      float repfac,
      float atomStrWidth,
      float pstreamStrWidth,
      float natoms,
      float iUnroll,
      float4 inStream<>,
      float4 pstream1[][],
      float4 pstream2[][],
      out float4 outstream<> ) {

   float linind;
   float2 pindex;
   float odd;
   float i;
   float floor_linind_iUnroll;
   float stride;

   //convert to linear atom index
   linind = (indexof outstream).x + (indexof outstream).y * atomStrWidth;

   //Integer quotient of linind by iUnroll. This round/fmod form
   //replaced floor( linind / iUnroll ) (bixia modification).
   floor_linind_iUnroll = round( (linind - fmod(linind, iUnroll))/iUnroll );

   //If odd or even, we pick from different streams.
   odd = linind - floor_linind_iUnroll * iUnroll;

   //Now linear index is the index into partial_streams
   linind = floor_linind_iUnroll;

   outstream = inStream;
   outstream.w = 0.0f;

   //distance between successive partial copies (loop invariant)
   stride = natoms / iUnroll;

   //If we have predicated conditionals, we should
   //keep the conditional inside the loop
   for ( i = 0.0f; i < repfac; i += 1.0f ) {

      //round/fmod form in place of floor( linind / pstreamStrWidth )
      //(bixia modification)
      pindex.y = round( (linind - fmod( linind, pstreamStrWidth ))/pstreamStrWidth );
      pindex.x = linind - pindex.y * pstreamStrWidth;

      if ( odd > 0.5f ) { //is odd
         outstream += pstream2[ pindex ];
      } else {
         outstream += pstream1[ pindex ];
      }
      linind += stride;
   }
}

/* Four-stream version of kAddAndMergeFloat4.
 *
 * Index handling is the same as in kMergeFloat4_4 (forceIndex starts
 * at the linear atom index and advances by roundNatoms; quotient and
 * remainder by iUnroll give the gather position and the stream), but
 * the sum starts from inStream with .w reset to 0 instead of zero.
 *
 *   natoms is accepted but not used by this kernel.
 * */
kernel void kAddAndMergeFloat4_4(
      float repfac,
      float atomStreamWidth,
      float pStreamWidth,
      float natoms,
      float roundNatoms,
      float iUnroll,
      float4 inStream<>,
      float4 pstream1[][],
      float4 pstream2[][],
      float4 pstream3[][],
      float4 pstream4[][],
      out float4 outstream<> ){

   float atomIndex, forceIndex, qIndex, qOff;
   float2 pindex;
   float i;

   // given atom index find force indices and streams
   pindex = indexof( outstream );
   atomIndex = pindex.x + pindex.y*atomStreamWidth;
   forceIndex = atomIndex;

   // add current forces in inStream to forces stored in pstreams
   // the .w entry is Born sum values; it will be used to calculate the
   // Born radii and obcChain term
   outstream = inStream;
   outstream.w = 0.0f;

   // sum over j-loop 'duplications' by gathering from pstreams
   for( i = 0.0f; i < repfac; i += 1.0f ){

      // integer quotient and remainder of forceIndex by iUnroll
      // (round/fmod form used in place of floor( forceIndex/iUnroll ))
      qIndex = round( (forceIndex - fmod( forceIndex, iUnroll))/iUnroll );
      qOff = forceIndex - iUnroll*qIndex;

      // 2D gather index for qIndex
      // (round/fmod form used in place of floor( qIndex/pStreamWidth ))
      pindex.y = round( (qIndex - fmod( qIndex, pStreamWidth ))/pStreamWidth );
      pindex.x = qIndex - pindex.y*pStreamWidth;

      if( qOff < 0.5f ){
         outstream += pstream1[ pindex ];
      } else if( qOff < 1.5f ){
         outstream += pstream2[ pindex ];
      } else if( qOff < 2.5f ){
         outstream += pstream3[ pindex ];
      } else {
         outstream += pstream4[ pindex ];
      }

      forceIndex += roundNatoms;
   }
}

/* Add forces from two streams:
 * outForce = force1 + conversion * force2.xyz
 * (the .w component of force2 is dropped) */
kernel void kAddForces3_4( float conversion, float3 force1<>, float4 force2<>, out float3 outForce<> ){
   outForce.xyz = force1 + conversion*force2.xyz;
}

/* Copy one stream to another */
kernel void kCopyFloat4( float4 inForce<>, out float4 outForce<> ){
   outForce = inForce;
}

/* Copy a float3 stream into a float4 stream; the .w component of the
 * output is set to 0 */
kernel void kCopyFloat3To4( float3 inForce<>, out float4 outForce<> ){
   outForce.xyz = inForce;
   outForce.w = 0.0f;
}