/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2008-2009 Stanford University and the Authors.      *
 * Authors: Peter Eastman, Mark Friedrichs, Chris Bruns, Mike Houston         *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

//Gather kernel for use with diferent angle, dihedral and improper functions
//
//For small systems, the overhead of calling the kernel is so high
//That we have to minimize the number of kernel calls. At the same
//time we don't want to waste reads, so I'm providing a bunch of kernels
//here that are unrolled to diferent extents. Use the appropriate one
//depending on how many inverse maps there are, rather than looping over
//the simple one.


//helper function to make the unrolling look better
kernel float3 do_gather( float strwidth, float4 invmap<>, float3 forces[][] ) {
	float3 f;
	float4 quotient, remainder;
	float2 idx;

	f = float3( 0.0f, 0.0f, 0.0f );
	
	//Convert from linear to 2D index
	// quotient  = floor( invmap / strwidth );
	quotient  = round( ( invmap - fmod(invmap, strwidth))/strwidth ); 
	remainder = invmap - quotient * strwidth;
	
	//Add each force only if non-negative
	if ( invmap.x >= 0.0f ) {
		idx.y = quotient.x;
		idx.x = remainder.x;
		f = forces[ idx ];
	}

	if ( invmap.y >= 0.0f ) {
		idx.y = quotient.y;
		idx.x = remainder.y;
		f += forces[ idx ];
	}
	
	if ( invmap.z >= 0.0f ) {
		idx.y = quotient.z;
		idx.x = remainder.z;
		f += forces[ idx ];
	}

	if ( invmap.w >= 0.0f ) {
		idx.y = quotient.w;
		idx.x = remainder.w;
		f += forces[ idx ];
	}

	return f;
}

//helper function to make the unrolling look better

kernel float3 do_gather_nobranch( float strwidth, float4 invmap<>, float3 forces[][] ) {
    
    float3 f1, f2, f3, f4, f, z, t;    
    float4 quotient, remainder;    
    float2 idx;    
    float m1, m2, m3, m4;    
        
    z = float3( 0.0f, 0.0f, 0.0f );    
    f=z;    
        
    //Convert from linear to 2D index    
    // quotient  = floor( invmap / strwidth );    
    quotient  = round( ( invmap - fmod(invmap, strwidth))/strwidth );     
    remainder = invmap - quotient * strwidth;    
    
    m1 = invmap.x;    
    m2 = invmap.y;    
    m3 = invmap.z;    
    m4 = invmap.w;    
    
    //Add each force only if non-negative    
    idx.y = quotient.x;    
    idx.x = remainder.x;    
    f1 = forces[ idx ];    
    
    idx.y = quotient.y;    
    idx.x = remainder.y;    
    f2 = forces[ idx ];
    
    idx.y = quotient.z;    
    idx.x = remainder.z;    
    f3 = forces[ idx ];
    
    idx.y = quotient.w;    
    idx.x = remainder.w;    
    f4 = forces[ idx ];
    
    f = (m1 >= 0.0f) ? f1 : z;    
    t = (m2 >= 0.0f) ? f2 : z;    
    f = f+t;    
    t = (m3 >= 0.0f) ? f3 : z;    
    f = f+t;    
    t = (m4 >= 0.0f) ? f4 : z;    
    f = f+t;
    
    return f;
    
}


//Simple version, takes only one index stream
kernel void kinvmap_gather(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap<>,   //indices into the dihedral forces
		float3 forces[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap, forces );
}

//Takes two inverse maps
kernel void kinvmap_gather2(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap1<>,   //indices into the dihedral forces
		float4 invmap2<>,   //indices into the dihedral forces
		float3 forces[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap1, forces );
	outforce += do_gather_nobranch( strwidth, invmap2, forces );
}

//Takes three inverse maps
kernel void kinvmap_gather3(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap1<>,   //indices into the dihedral forces
		float4 invmap2<>,   //indices into the dihedral forces
		float4 invmap3<>,
		float3 forces[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap1, forces );
	outforce += do_gather_nobranch( strwidth, invmap2, forces );
	outforce += do_gather_nobranch( strwidth, invmap3, forces );
}

//Takes four inverse maps
kernel void kinvmap_gather4(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap1<>,   //indices into the dihedral forces
		float4 invmap2<>,   //indices into the dihedral forces
		float4 invmap3<>,
		float4 invmap4<>,
		float3 forces[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap1, forces );
	outforce += do_gather_nobranch( strwidth, invmap2, forces );
	outforce += do_gather_nobranch( strwidth, invmap3, forces );
	outforce += do_gather_nobranch( strwidth, invmap4, forces );
}

//Takes five inverse maps
kernel void kinvmap_gather5(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap1<>,   //indices into the dihedral forces
		float4 invmap2<>,   //indices into the dihedral forces
		float4 invmap3<>,
		float4 invmap4<>,
		float4 invmap5<>,
		float3 forces[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap1, forces );
	outforce += do_gather_nobranch( strwidth, invmap2, forces );
	outforce += do_gather_nobranch( strwidth, invmap3, forces );
	outforce += do_gather_nobranch( strwidth, invmap4, forces );
	outforce += do_gather_nobranch( strwidth, invmap5, forces );
}

//Takes six inverse maps - this is the last one!
kernel void kinvmap_gather6(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap1<>,   //indices into the dihedral forces
		float4 invmap2<>,   //indices into the dihedral forces
		float4 invmap3<>,
		float4 invmap4<>,
		float4 invmap5<>,
		float4 invmap6<>,
		float3 forces[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap1, forces );
	outforce += do_gather_nobranch( strwidth, invmap2, forces );
	outforce += do_gather_nobranch( strwidth, invmap3, forces );
	outforce += do_gather_nobranch( strwidth, invmap4, forces );
	outforce += do_gather_nobranch( strwidth, invmap5, forces );
	outforce += do_gather_nobranch( strwidth, invmap6, forces );
}


//Takes three + four inverse maps
kernel void kinvmap_gather2_2(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap3_1<>,   //indices into the dihedral forces
		float4 invmap3_2<>,   //indices into the dihedral forces
		float3 forces3[][], //dihedral forces
		float4 invmap4_1<>,   //indices into the dihedral forces
		float4 invmap4_2<>,   //indices into the dihedral forces
		float3 forces4[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap3_1, forces3 );
	outforce += do_gather_nobranch( strwidth, invmap3_2, forces3 );
    
	outforce += do_gather_nobranch( strwidth, invmap4_1, forces4 );
	outforce += do_gather_nobranch( strwidth, invmap4_2, forces4 );
}


//Takes three + four inverse maps
kernel void kinvmap_gather3_3(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap3_1<>,   //indices into the dihedral forces
		float4 invmap3_2<>,   //indices into the dihedral forces
		float4 invmap3_3<>,
		float3 forces3[][], //dihedral forces
		float4 invmap4_1<>,   //indices into the dihedral forces
		float4 invmap4_2<>,   //indices into the dihedral forces
		float4 invmap4_3<>,
		float3 forces4[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap3_1, forces3 );
	outforce += do_gather_nobranch( strwidth, invmap3_2, forces3 );
	outforce += do_gather_nobranch( strwidth, invmap3_3, forces3 );
    
	outforce += do_gather_nobranch( strwidth, invmap4_1, forces4 );
	outforce += do_gather_nobranch( strwidth, invmap4_2, forces4 );
	outforce += do_gather_nobranch( strwidth, invmap4_3, forces4 );
}


//Takes three + four inverse maps
kernel void kinvmap_gather3_4(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap3_1<>,   //indices into the dihedral forces
		float4 invmap3_2<>,   //indices into the dihedral forces
		float4 invmap3_3<>,
		float3 forces3[][], //dihedral forces
		float4 invmap4_1<>,   //indices into the dihedral forces
		float4 invmap4_2<>,   //indices into the dihedral forces
		float4 invmap4_3<>,
		float4 invmap4_4<>,
		float3 forces4[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap3_1, forces3 );
	outforce += do_gather_nobranch( strwidth, invmap3_2, forces3 );
	outforce += do_gather_nobranch( strwidth, invmap3_3, forces3 );
    
	outforce += do_gather_nobranch( strwidth, invmap4_1, forces4 );
	outforce += do_gather_nobranch( strwidth, invmap4_2, forces4 );
	outforce += do_gather_nobranch( strwidth, invmap4_3, forces4 );
	outforce += do_gather_nobranch( strwidth, invmap4_4, forces4 );
}

//Takes three + five inverse maps
kernel void kinvmap_gather3_5(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap3_1[][],   //indices into the dihedral forces
		float4 invmap3_2[][],   //indices into the dihedral forces
		float4 invmap3_3[][],
		float3 forces3[][], //dihedral forces
		float4 invmap5_1[][],   //indices into the dihedral forces
		float4 invmap5_2[][],   //indices into the dihedral forces
		float4 invmap5_3[][],
		float4 invmap5_4[][],
		float4 invmap5_5[][],
		float3 forces5[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	float2 idx = indexof(outforce);
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap3_1[idx], forces3 );
	outforce += do_gather_nobranch( strwidth, invmap3_2[idx], forces3 );
	outforce += do_gather_nobranch( strwidth, invmap3_3[idx], forces3 );
    
	outforce += do_gather_nobranch( strwidth, invmap5_1[idx], forces5 );
	outforce += do_gather_nobranch( strwidth, invmap5_2[idx], forces5 );
	outforce += do_gather_nobranch( strwidth, invmap5_3[idx], forces5 );
	outforce += do_gather_nobranch( strwidth, invmap5_4[idx], forces5 );
	outforce += do_gather_nobranch( strwidth, invmap5_5[idx], forces5 );
}


//Takes one + one inverse maps
kernel void kinvmap_gather1_1(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap3_1[][],   //indices into the dihedral forces
		float3 forces3[][], //dihedral forces
		float4 invmap5_1[][],   //indices into the dihedral forces
		float3 forces5[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	float2 idx = indexof(outforce);
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap3_1[idx], forces3 );
	outforce += do_gather_nobranch( strwidth, invmap5_1[idx], forces5 );
}


//Takes five + two inverse maps
kernel void kinvmap_gather5_2(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap5_1<>,   //indices into the dihedral forces
		float4 invmap5_2<>,   //indices into the dihedral forces
		float4 invmap5_3<>,
		float4 invmap5_4<>,
		float4 invmap5_5<>,
		float3 forces5[][], //dihedral forces
		float4 invmap2_1<>,   //indices into the dihedral forces
		float4 invmap2_2<>,   //indices into the dihedral forces
		float3 forces2[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap5_1, forces5 );
	outforce += do_gather_nobranch( strwidth, invmap5_2, forces5 );
	outforce += do_gather_nobranch( strwidth, invmap5_3, forces5 );
	outforce += do_gather_nobranch( strwidth, invmap5_4, forces5 );
	outforce += do_gather_nobranch( strwidth, invmap5_5, forces5 );
    
	outforce += do_gather_nobranch( strwidth, invmap2_1, forces2 );
	outforce += do_gather_nobranch( strwidth, invmap2_2, forces2 );
}


//Takes five + two inverse maps
kernel void kinvmap_gather4_2(
		float strwidth,    //stream width of the dihedral forces
		float4 invmap4_1<>,   //indices into the dihedral forces
		float4 invmap4_2<>,   //indices into the dihedral forces
		float4 invmap4_3<>,
		float4 invmap4_4<>,
		float3 forces4[][], //dihedral forces
		float4 invmap2_1<>,   //indices into the dihedral forces
		float4 invmap2_2<>,   //indices into the dihedral forces
		float3 forces2[][], //dihedral forces
		float3 inforce<>,  //particle forces before
		out float3 outforce<> //particle forces after
		)
{
	outforce = inforce;

	outforce += do_gather_nobranch( strwidth, invmap4_1, forces4 );
	outforce += do_gather_nobranch( strwidth, invmap4_2, forces4 );
	outforce += do_gather_nobranch( strwidth, invmap4_3, forces4 );
	outforce += do_gather_nobranch( strwidth, invmap4_4, forces4 );
    
	outforce += do_gather_nobranch( strwidth, invmap2_1, forces2 );
	outforce += do_gather_nobranch( strwidth, invmap2_2, forces2 );
}


kernel float3 etch_force( 
		float fpos, 
		float strwidth,
		float3 fi[][], 
		float3 fj[][],
		float3 fk[][],
		float3 fl[][]
		)
{
	float2 ind;
	float _fpos;

	_fpos = fpos;
	
	if ( _fpos > 300000.0f ) {
		_fpos = _fpos - 300000.0f;
		//ind.y = floor( _fpos / strwidth );
      ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
		ind.x = _fpos - ind.y * strwidth;

		return fl[ ind ];
	}
	else if ( _fpos > 200000.0f ) {
		_fpos = _fpos - 200000.0f;
		//ind.y = floor( _fpos / strwidth );
      ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
		ind.x = _fpos - ind.y * strwidth;

		return fk[ ind ];
	}
	else if ( _fpos > 100000.0f ) {
		_fpos = _fpos - 100000.0f;
		//ind.y = floor( _fpos / strwidth );
      ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
		ind.x = _fpos - ind.y * strwidth;

		return fj[ ind ];
	}
	else if ( _fpos >= -0.5f ) {
		//ind.y = floor( _fpos / strwidth );
      ind.y = round( ( _fpos - fmod( _fpos, strwidth))/strwidth );
		ind.x = _fpos - ind.y * strwidth;

		return fi[ ind ];
	}
	else 
		return 0.0f;
	
}


//For-loop version doesn't work
//Using a merged version of the above

kernel float2 linear_to_2D( float linind, float width )
{
	float2 ind;
	//ind.y = floor( linind / width );
   ind.y = round( ( linind - fmod( linind, width))/width );
	ind.x = linind - ind.y * width;

	return ind;
}

//helper function to make the unrolling look better
kernel float3 do_gather_merged_single( float strwidth, float invmap, 
		float3 fi[][], float3 fj[][], float3 fk[][], float3 fl[][] ) {
	
	float3 f;
	float2 idx;
	float _invmap;
	float n;

	_invmap = invmap;

	n = floor( _invmap / 100000.0f );
	_invmap -= n * 100000.0f;
	idx = linear_to_2D( _invmap, strwidth );

	if ( n > 2.5f ) { 
		f = fl[ idx ];
	}
	else if ( n > 1.5f ) {
		f = fk[ idx ];
	}
	else if ( n > 0.5f ) {
		f = fj[ idx ];
	}
	else if( n > -0.5f ) {
		f = fi[ idx ];
	}

	return f;
}

kernel float3 do_gather_merged( float strwidth, float4 invmap,
		float3 fi[][], float3 fj[][], float3 fk[][], float3 fl[][])
{
	float3 f;
	f = do_gather_merged_single( strwidth, invmap.x, fi, fj, fk, fl )
	    + do_gather_merged_single( strwidth, invmap.y, fi, fj, fk, fl )
	    + do_gather_merged_single( strwidth, invmap.z, fi, fj, fk, fl )
	    + do_gather_merged_single( strwidth, invmap.w, fi, fj, fk, fl );

	return f;
}

kernel void kinvmap_gather_merged5(
		float natoms,      //number of atoms
		float strwidth,    //stream width of out-of-order forces
		float4 invmap0<>, 
		float4 invmap1<>, 
		float4 invmap2<>, 
		float4 invmap3<>, 
		float4 invmap4<>, 
		float3 fi[][],     //i-forces
		float3 fj[][],     //j-forces
		float3 fk[][],     //k-forces
		float3 fl[][],     //l-forces
		float3 inforce<>,  
		out float3 outforce<>
		)
{
	outforce = inforce;
	outforce += do_gather_merged( strwidth, invmap0, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap1, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap2, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap3, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap4, fi, fj, fk, fl );
}

kernel void kinvmap_gather_merged9(
		float natoms,      //number of atoms
		float strwidth,    //stream width of out-of-order forces
		float4 invmap0<>, 
		float4 invmap1<>, 
		float4 invmap2<>, 
		float4 invmap3<>, 
		float4 invmap4<>, 
		float4 invmap5<>, 
		float4 invmap6<>, 
		float4 invmap7<>, 
		float4 invmap8<>, 
		float3 fi[][],     //i-forces
		float3 fj[][],     //j-forces
		float3 fk[][],     //k-forces
		float3 fl[][],     //l-forces
		float3 inforce<>,  
		out float3 outforce<>
		)
{
	outforce = inforce;
	outforce += do_gather_merged( strwidth, invmap0, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap1, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap2, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap3, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap4, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap5, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap6, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap7, fi, fj, fk, fl )
	            + do_gather_merged( strwidth, invmap8, fi, fj, fk, fl );
}