/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/

/* Order N^2 neighbor searching.
 *
 * This only works for force fields that don't have charge groups.
 * If you insist on charge groups, you'll have to pass in appropriate masks here.
 *
 * This is a simplified kernel, for testing the O(N) speeds.
 *
 * This does a complete N^2 search without considering groups of
 * atoms. Most likely this will prove to be inefficient for 
 * the O(N) kernel. Lets find out.
 * 
 *
 * Each component of the curpass textures is an atom index. The w component
 * of curpass3 is a count indicating how many j particles we have
 * scanned for this particular i atom.
 * 
 * */

kernel void knborsearch(
		float first,          //Positive means constructing the first 16.
		iter float2 wpos<>,   //pixel position of output
		float AtomStrHeight,
		float AtomStrWidth,
		float cutoff2,      //square of the cutoff
		float natoms,       //number of atoms
		float excl[][],    //exclusions in 1x1 format, 0 means not excluded, 1 means excluded.
		float4 posq[][],    //atom positions/charges
		float4 prevpass3<>, //Last output texture of previous pass
		out float4 curpass0<>,  //First output of current pass
		out float4 curpass1<>,  
		out float4 curpass2<>,  
		out float4 curpass3<>  //Last output of current pass, used in next pass
		){
	/*For this kernel, wpos == iatom*/
	float2 iind;
	float2 jind;
	float3 ipos, jpos, dr;
	float r2;
	float listptr; //Where in the 16-chunk are we now.
	float jlinind;
	float breakflag; //positive means keep looping, negative means stop
	float4 exclconst;
	float2 exclind;
	float exclusions;

	exclconst = float4( 2.0f, 3.0f, 5.0f, 7.0f );
	
	iind = wpos;

	exclind.x = iind.x + iind.y * AtomStrWidth;

	//etch i atom
	ipos = posq[ iind ].xyz;

	//Loop over j depending on prevpass
	jlinind = prevpass3.w + 1;
	jind.y = floor( jlinind / AtomStrWidth );
	jind.x = fmod( jlinind, AtomStrWidth );
	exclind.y = jlinind;

	//All outputs should be initialized to 

	listptr = 0.0f;
	breakflag = 1.0f;

	//if we already finished, do nothing
	if ( first < 0.0f && prevpass3.w < 0.0f )
		breakflag = -1.0f;

	//set to -1 to indicate no neighbor
	//just to save a separate set of init calls
	curpass0 = float4( -1.0f, -1.0f, -1.0f, -1.0f );
	curpass1 = curpass0;
	curpass2 = curpass0;
	curpass3 = curpass0;
	
	while ( jind.y < AtomStrHeight && breakflag > 0.0f ) {
		while ( jind.x < AtomStrWidth && breakflag > 0.0f ) {

			//First see if this pair is excluded
			exclusions = excl[ exclind ];

			if ( exclusions < 0.5f ) {
				jpos = posq[ jind ].xyz;

				dr = jpos - ipos;
				r2 = dot( dr, dr );

				//If it is inside the cutoff
				if ( r2 < cutoff2 ) {
					//Figure out where to put it
					//We are allowed 4 nested conditionals
					//We can play with the structuring of these
					if ( listptr < 0.5f )
						curpass0.x = jlinind;
					else if ( listptr < 1.5f )
						curpass0.y = jlinind;
					else if ( listptr < 2.5f )
						curpass0.z = jlinind;
					else if ( listptr < 3.5f )
						curpass0.w = jlinind;
					else if ( listptr < 4.5f )
						curpass1.x = jlinind;
					else if ( listptr < 5.5f )
						curpass1.y = jlinind;
					else if ( listptr < 6.5f )
						curpass1.z = jlinind;
					else if ( listptr < 7.5f )
						curpass1.w = jlinind;
					else if ( listptr < 8.5f )
						curpass2.x = jlinind;
					else if ( listptr < 9.5f )
						curpass2.y = jlinind;
					else if ( listptr < 10.5f )
						curpass2.z = jlinind;
					else if ( listptr < 11.5f )
						curpass2.w = jlinind;
					else if ( listptr < 12.5f )
						curpass3.x = jlinind;
					else if ( listptr < 13.5f )
						curpass3.y = jlinind;
					else if ( listptr < 14.5f ) {
						curpass3.z = jlinind;
					}
					else if ( listptr < 15.5f ) {
						//We're done for this pass
						curpass3.w = jlinind;
						breakflag = -1.0f;
					}
					listptr += 1.0f;
				}
			}

			jlinind += 1.0f;
			exclind.y += 1.0f;
			jind.x  += 1.0f;
		}
		jind.x = 0.0f;
		jind.y += 1.0f;
	}
	
}

//Precomputes lennard jones sig and eps 
//to save an indirect etch (and a ew flops) in the 
//force kernel. The charge product is not done this way
//because charges have to be etched anyway with the 
//positions
kernel void knl_precompute_sigeps(
		float AtomStrWidth,
		iter float2 wpos<>,
		float2 sigeps[][], //x=sigma, y=epsilon
		float4 nlist0<>,
		float4 nlist1<>,
		out float4 sig0<>,
		out float4 eps0<>,
		out float4 sig1<>,
		out float4 eps1<>
	   	)
{
	float2 jind;
	float4 ind_tmp1, ind_tmp2;
	float2 isigeps, jsigeps1, jsigeps2, jsigeps3, jsigeps4;
	
	isigeps = sigeps[ wpos ];

	ind_tmp1 = floor( nlist0 / AtomStrWidth );
	ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth;

	jind.y = ind_tmp1.x;
	jind.x = ind_tmp2.x;
	jsigeps1 = sigeps[ jind ];
	
	jind.y = ind_tmp1.y;
	jind.x = ind_tmp2.y;
	jsigeps2 = sigeps[ jind ];
	
	jind.y = ind_tmp1.z;
	jind.x = ind_tmp2.z;
	jsigeps3 = sigeps[ jind ];
	
	jind.y = ind_tmp1.w;
	jind.x = ind_tmp2.w;
	jsigeps4 = sigeps[ jind ];

	sig0.x = isigeps.x + jsigeps1.x;
	sig0.y = isigeps.x + jsigeps2.x;
	sig0.z = isigeps.x + jsigeps3.x;
	sig0.w = isigeps.x + jsigeps4.x;
	
	eps0.x = isigeps.y * jsigeps1.y;
	eps0.y = isigeps.y * jsigeps2.y;
	eps0.z = isigeps.y * jsigeps3.y;
	eps0.w = isigeps.y * jsigeps4.y;

	//2nd nlist set
	
	ind_tmp1 = floor( nlist1 / AtomStrWidth );
	ind_tmp2 = nlist1 - ind_tmp1 * AtomStrWidth;

	jind.y = ind_tmp1.x;
	jind.x = ind_tmp2.x;
	jsigeps1 = sigeps[ jind ];
	
	jind.y = ind_tmp1.y;
	jind.x = ind_tmp2.y;
	jsigeps2 = sigeps[ jind ];
	
	jind.y = ind_tmp1.z;
	jind.x = ind_tmp2.z;
	jsigeps3 = sigeps[ jind ];
	
	jind.y = ind_tmp1.w;
	jind.x = ind_tmp2.w;
	jsigeps4 = sigeps[ jind ];

	

	sig1.x = isigeps.x + jsigeps1.x;
	sig1.y = isigeps.x + jsigeps2.x;
	sig1.z = isigeps.x + jsigeps3.x;
	sig1.w = isigeps.x + jsigeps4.x;
	
	eps1.x = isigeps.y * jsigeps1.y;
	eps1.y = isigeps.y * jsigeps2.y;
	eps1.z = isigeps.y * jsigeps3.y;
	eps1.w = isigeps.y * jsigeps4.y;
	
	
}