//kinvmap_gather.br


/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/

//Gather kernel for use with different angle, dihedral and improper functions
//
//For small systems, the overhead of calling the kernel is so high
//that we have to minimize the number of kernel calls. At the same
//time we don't want to waste reads, so I'm providing a bunch of kernels
//here that are unrolled to different extents. Use the appropriate one
//depending on how many inverse maps there are, rather than looping over
//the simple one.


//Helper shared by the unrolled gather kernels.
//
//Sums the entries of `forces` addressed by the four components of one
//inverse-map element. Each component is a linear index into the 2D
//`forces` array, whose rows are `strwidth` elements wide. A negative
//component is skipped and contributes nothing to the sum.
kernel float3 do_gather( float strwidth, float4 invmap<>, float3 forces[][] ) {
	float4 row, col;
	float2 pos;
	float3 total;

	//Split all four linear indices into (row, col) at once.
	//Intended as row = floor( invmap / strwidth ), spelled with
	//fmod/round instead of floor.
	row = round( ( invmap - fmod( invmap, strwidth ) ) / strwidth );
	col = invmap - row * strwidth;

	total = float3( 0.0f, 0.0f, 0.0f );

	//Only components that are >= 0 address a force
	if ( invmap.x >= 0.0f ) {
		pos = float2( col.x, row.x );
		total = forces[ pos ];
	}

	if ( invmap.y >= 0.0f ) {
		pos = float2( col.y, row.y );
		total += forces[ pos ];
	}

	if ( invmap.z >= 0.0f ) {
		pos = float2( col.z, row.z );
		total += forces[ pos ];
	}

	if ( invmap.w >= 0.0f ) {
		pos = float2( col.w, row.w );
		total += forces[ pos ];
	}

	return total;
}

//Simple version: takes a single inverse-map stream and adds the
//forces it selects to the incoming particle force.
kernel void kinvmap_gather(
		float strwidth,       //stream width of the dihedral forces
		float4 invmap<>,      //indices into the dihedral forces
		float3 forces[][],    //dihedral forces
		float3 inforce<>,     //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	float3 acc;

	acc = inforce;
	acc += do_gather( strwidth, invmap, forces );

	outforce = acc;
}

//Two-map version: adds the forces selected by both inverse maps
//to the incoming particle force in a single kernel call.
kernel void kinvmap_gather2(
		float strwidth,       //stream width of the dihedral forces
		float4 invmap1<>,     //indices into the dihedral forces
		float4 invmap2<>,     //indices into the dihedral forces
		float3 forces[][],    //dihedral forces
		float3 inforce<>,     //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	float3 acc;

	//Accumulate in map order, starting from the incoming force
	acc = inforce;
	acc += do_gather( strwidth, invmap1, forces );
	acc += do_gather( strwidth, invmap2, forces );

	outforce = acc;
}

//Three-map version: adds the forces selected by all three inverse
//maps to the incoming particle force in a single kernel call.
kernel void kinvmap_gather3(
		float strwidth,       //stream width of the dihedral forces
		float4 invmap1<>,     //indices into the dihedral forces
		float4 invmap2<>,     //indices into the dihedral forces
		float4 invmap3<>,     //indices into the dihedral forces
		float3 forces[][],    //dihedral forces
		float3 inforce<>,     //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	float3 acc;

	//Accumulate in map order, starting from the incoming force
	acc = inforce;
	acc += do_gather( strwidth, invmap1, forces );
	acc += do_gather( strwidth, invmap2, forces );
	acc += do_gather( strwidth, invmap3, forces );

	outforce = acc;
}

//Four-map version: adds the forces selected by all four inverse
//maps to the incoming particle force in a single kernel call.
kernel void kinvmap_gather4(
		float strwidth,       //stream width of the dihedral forces
		float4 invmap1<>,     //indices into the dihedral forces
		float4 invmap2<>,     //indices into the dihedral forces
		float4 invmap3<>,     //indices into the dihedral forces
		float4 invmap4<>,     //indices into the dihedral forces
		float3 forces[][],    //dihedral forces
		float3 inforce<>,     //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	float3 acc;

	//Accumulate in map order, starting from the incoming force
	acc = inforce;
	acc += do_gather( strwidth, invmap1, forces );
	acc += do_gather( strwidth, invmap2, forces );
	acc += do_gather( strwidth, invmap3, forces );
	acc += do_gather( strwidth, invmap4, forces );

	outforce = acc;
}

//Five-map version: adds the forces selected by all five inverse
//maps to the incoming particle force in a single kernel call.
kernel void kinvmap_gather5(
		float strwidth,       //stream width of the dihedral forces
		float4 invmap1<>,     //indices into the dihedral forces
		float4 invmap2<>,     //indices into the dihedral forces
		float4 invmap3<>,     //indices into the dihedral forces
		float4 invmap4<>,     //indices into the dihedral forces
		float4 invmap5<>,     //indices into the dihedral forces
		float3 forces[][],    //dihedral forces
		float3 inforce<>,     //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	float3 acc;

	//Accumulate in map order, starting from the incoming force
	acc = inforce;
	acc += do_gather( strwidth, invmap1, forces );
	acc += do_gather( strwidth, invmap2, forces );
	acc += do_gather( strwidth, invmap3, forces );
	acc += do_gather( strwidth, invmap4, forces );
	acc += do_gather( strwidth, invmap5, forces );

	outforce = acc;
}

//Six-map version - the most unrolled variant provided here. Adds the
//forces selected by all six inverse maps to the incoming particle
//force in a single kernel call.
kernel void kinvmap_gather6(
		float strwidth,       //stream width of the dihedral forces
		float4 invmap1<>,     //indices into the dihedral forces
		float4 invmap2<>,     //indices into the dihedral forces
		float4 invmap3<>,     //indices into the dihedral forces
		float4 invmap4<>,     //indices into the dihedral forces
		float4 invmap5<>,     //indices into the dihedral forces
		float4 invmap6<>,     //indices into the dihedral forces
		float3 forces[][],    //dihedral forces
		float3 inforce<>,     //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	float3 acc;

	//Accumulate in map order, starting from the incoming force
	acc = inforce;
	acc += do_gather( strwidth, invmap1, forces );
	acc += do_gather( strwidth, invmap2, forces );
	acc += do_gather( strwidth, invmap3, forces );
	acc += do_gather( strwidth, invmap4, forces );
	acc += do_gather( strwidth, invmap5, forces );
	acc += do_gather( strwidth, invmap6, forces );

	outforce = acc;
}


//Fetches a single force from one of four force arrays, selected by an
//encoded linear index.
//
//fpos encodes both the array and the position inside it:
//  fpos >= 300000            -> fl, position fpos - 300000
//  200000 <= fpos < 300000   -> fk, position fpos - 200000
//  100000 <= fpos < 200000   -> fj, position fpos - 100000
//  -0.5   <= fpos < 100000   -> fi, position fpos
//  fpos < -0.5               -> no fetch, a zero force is returned
//                               (presumably marks an unused slot)
//The position is converted to a 2D index using rows of width strwidth.
//
//The bank boundaries are inclusive so that position 0 of fj/fk/fl
//(an exact multiple of 100000) is routed to its own array. This agrees
//with the floor( invmap / 100000 ) decoding used by
//do_gather_merged_single.
kernel float3 etch_force( 
		float fpos, 
		float strwidth,
		float3 fi[][], 
		float3 fj[][],
		float3 fk[][],
		float3 fl[][]
		)
{
	float2 ind;
	float pos;
	float3 f;

	//Default result when fpos selects no array
	f = float3( 0.0f, 0.0f, 0.0f );
	pos = fpos;

	if ( pos >= 300000.0f ) {
		pos = pos - 300000.0f;
		//ind.y = floor( pos / strwidth );
		ind.y = round( ( pos - fmod( pos, strwidth ) ) / strwidth );
		ind.x = pos - ind.y * strwidth;

		f = fl[ ind ];
	}
	else if ( pos >= 200000.0f ) {
		pos = pos - 200000.0f;
		//ind.y = floor( pos / strwidth );
		ind.y = round( ( pos - fmod( pos, strwidth ) ) / strwidth );
		ind.x = pos - ind.y * strwidth;

		f = fk[ ind ];
	}
	else if ( pos >= 100000.0f ) {
		pos = pos - 100000.0f;
		//ind.y = floor( pos / strwidth );
		ind.y = round( ( pos - fmod( pos, strwidth ) ) / strwidth );
		ind.x = pos - ind.y * strwidth;

		f = fj[ ind ];
	}
	else if ( pos >= -0.5f ) {
		//ind.y = floor( pos / strwidth );
		ind.y = round( ( pos - fmod( pos, strwidth ) ) / strwidth );
		ind.x = pos - ind.y * strwidth;

		f = fi[ ind ];
	}

	return f;
}


//For-loop version doesn't work
//Using a merged version of the above

//Converts a linear index into a 2D index for an array whose rows are
//`width` elements wide. In the result, .y is the row and .x is the
//position within that row.
kernel float2 linear_to_2D( float linind, float width )
{
	float row;
	float col;

	//Intended as row = floor( linind / width ), spelled with fmod/round
	row = round( ( linind - fmod( linind, width ) ) / width );
	col = linind - row * width;

	return float2( col, row );
}

//Helper for the merged gather: fetches the single force addressed by
//one encoded inverse-map entry.
//
//invmap encodes the source array and the position inside it as
//  invmap = bank * 100000 + position
//where bank 0 selects fi, 1 selects fj, 2 selects fk and 3 (or more)
//selects fl. The position is turned into a 2D index using rows of
//width strwidth.
//
//A negative invmap yields a negative bank and selects no array; the
//result is then a zero force, so the entry adds nothing when summed.
kernel float3 do_gather_merged_single( float strwidth, float invmap, 
		float3 fi[][], float3 fj[][], float3 fk[][], float3 fl[][] ) {
	
	float3 f;
	float2 idx;
	float bank;
	float offset;

	//Must be initialised: no branch below assigns f when bank < 0
	f = float3( 0.0f, 0.0f, 0.0f );

	bank = floor( invmap / 100000.0f );
	offset = invmap - bank * 100000.0f;
	idx = linear_to_2D( offset, strwidth );

	//bank is a whole number; compare against half-way points
	if ( bank > 2.5f ) { 
		f = fl[ idx ];
	}
	else if ( bank > 1.5f ) {
		f = fk[ idx ];
	}
	else if ( bank > 0.5f ) {
		f = fj[ idx ];
	}
	else if ( bank > -0.5f ) {
		f = fi[ idx ];
	}

	return f;
}

//Sums the forces addressed by the four components of one encoded
//inverse-map element, fetching each one with do_gather_merged_single.
kernel float3 do_gather_merged( float strwidth, float4 invmap,
		float3 fi[][], float3 fj[][], float3 fk[][], float3 fl[][])
{
	float3 total;

	//Accumulate in component order: x, y, z, w
	total  = do_gather_merged_single( strwidth, invmap.x, fi, fj, fk, fl );
	total += do_gather_merged_single( strwidth, invmap.y, fi, fj, fk, fl );
	total += do_gather_merged_single( strwidth, invmap.z, fi, fj, fk, fl );
	total += do_gather_merged_single( strwidth, invmap.w, fi, fj, fk, fl );

	return total;
}

//Merged gather over five inverse-map streams: sums the forces they
//select from the i/j/k/l force arrays and adds that sum to the
//incoming particle force. natoms is not used by the kernel body.
kernel void kinvmap_gather_merged5(
		float natoms,      //number of atoms
		float strwidth,    //stream width of out-of-order forces
		float4 invmap0<>, 
		float4 invmap1<>, 
		float4 invmap2<>, 
		float4 invmap3<>, 
		float4 invmap4<>, 
		float3 fi[][],     //i-forces
		float3 fj[][],     //j-forces
		float3 fk[][],     //k-forces
		float3 fl[][],     //l-forces
		float3 inforce<>,  
		out float3 outforce<>
		)
{
	float3 gathered;

	//Sum the gathered contributions first, in map order ...
	gathered  = do_gather_merged( strwidth, invmap0, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap1, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap2, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap3, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap4, fi, fj, fk, fl );

	//... then add the total to the incoming force
	outforce = inforce + gathered;
}

//Merged gather over nine inverse-map streams: sums the forces they
//select from the i/j/k/l force arrays and adds that sum to the
//incoming particle force. natoms is not used by the kernel body.
kernel void kinvmap_gather_merged9(
		float natoms,      //number of atoms
		float strwidth,    //stream width of out-of-order forces
		float4 invmap0<>, 
		float4 invmap1<>, 
		float4 invmap2<>, 
		float4 invmap3<>, 
		float4 invmap4<>, 
		float4 invmap5<>, 
		float4 invmap6<>, 
		float4 invmap7<>, 
		float4 invmap8<>, 
		float3 fi[][],     //i-forces
		float3 fj[][],     //j-forces
		float3 fk[][],     //k-forces
		float3 fl[][],     //l-forces
		float3 inforce<>,  
		out float3 outforce<>
		)
{
	float3 gathered;

	//Sum the gathered contributions first, in map order ...
	gathered  = do_gather_merged( strwidth, invmap0, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap1, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap2, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap3, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap4, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap5, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap6, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap7, fi, fj, fk, fl );
	gathered += do_gather_merged( strwidth, invmap8, fi, fj, fk, fl );

	//... then add the total to the incoming force
	outforce = inforce + gathered;
}