/* -------------------------------------------------------------------------- *
 *                                   OpenMM                                   *
 * -------------------------------------------------------------------------- *
 * This is part of the OpenMM molecular simulation toolkit originating from   *
 * Simbios, the NIH National Center for Physics-Based Simulation of           *
 * Biological Structures at Stanford, funded under the NIH Roadmap for        *
 * Medical Research, grant U54 GM072970. See https://simtk.org.               *
 *                                                                            *
 * Portions copyright (c) 2008 Stanford University and the Authors.           *
 * Authors: Peter Eastman, Mark Friedrichs, Chris Bruns                       *
 * Contributors:                                                              *
 *                                                                            *
 * Permission is hereby granted, free of charge, to any person obtaining a    *
 * copy of this software and associated documentation files (the "Software"), *
 * to deal in the Software without restriction, including without limitation  *
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
 * and/or sell copies of the Software, and to permit persons to whom the      *
 * Software is furnished to do so, subject to the following conditions:       *
 *                                                                            *
 * The above copyright notice and this permission notice shall be included in *
 * all copies or substantial portions of the Software.                        *
 *                                                                            *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
 * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

/* 
 * 3rd attempt at writing a force kernel that uses neighborlists.
 * Keeping things very simple this time - no unrolling at all.
 *
 * Starting with simple Coul-LJ
 * */

//Constant dielectric, LJ (normal forces)
kernel float4 nl_scalar_force_CDLJ( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2 ) {
	float4 invr, invrsig2, invrsig6;
	float4 f;

	invr = rsqrt( r2 ); // 1 or 2 flops?
	invrsig2 = invr * sig; //1 flop
	invrsig2 = invrsig2 * invrsig2; //1 flop
	invrsig6 = invrsig2 * invrsig2 * invrsig2; //2flops
	f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6; //4flops
	f += epsfac*qq*invr; //2 flops
	f *= invr*invr; //2 flops 
	return f; //Total ? flops
}

kernel float4 nl_get_r2( float3 d1, float3 d2, float3 d3, float3 d4 ) {
	float4 r2;
	r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //5*4 flops
	return r2;
}

kernel void knbforce_nl(
		float AtomStrWidth,  //Width of atom position stream
		float epsfac,
		iter float2 wpos<>,  //pixel position of output (i-atom)
		float4 posq[][],     //coordinates and charges
		float3 inforce<>,
		float4 nlist0<>,     //nbor indices
		float4 sig0<>,        //LJ sigmas
		float4 eps0<>,        //LJ epsilons
		//We'll add more nlists here to reduce number of kernel calls
		out float3 force<>   //output forces
		)
{
	float2 jind;
	float4 ind_tmp1, ind_tmp2;
	float4 qj;
	float3 jpos1, jpos2, jpos3, jpos4;
	float4 r2, fs, qq;
	float3 d1, d2, d3, d4;
	float3 ipos;
	float qi;
	
	//Since neighbors will not be adjacent in memory
	//We may want to stagger compute and etch, but 
	//with fxc, we don't have control over scheduling anyway.
	//Will try hand scheduling the ps3 code later

	//etch i parameters
	ipos = posq[ wpos ].xyz;
	qi   = posq[ wpos ].w;

	//etch j parameters
	
	ind_tmp1 = floor( nlist0 / AtomStrWidth );
	ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth;

	force = inforce;
	
	//1st set of 4 neighbors
	jind.y = ind_tmp1.x;
	jind.x = ind_tmp2.x;
	jpos1  = posq[ jind ].xyz;
	qj.x   = posq[ jind ].w;
	
	jind.y = ind_tmp1.y;
	jind.x = ind_tmp2.y;
	jpos2  = posq[ jind ].xyz;
	qj.y   = posq[ jind ].w;
	
	jind.y = ind_tmp1.z;
	jind.x = ind_tmp2.z;
	jpos3  = posq[ jind ].xyz;
	qj.z   = posq[ jind ].w;

	
	jind.y = ind_tmp1.w;
	jind.x = ind_tmp2.w;
	jpos4  = posq[ jind ].xyz;
	qj.w   = posq[ jind ].w;


	d1 = ipos - jpos1;
	d2 = ipos - jpos2;
	d3 = ipos - jpos3;
	d4 = ipos - jpos4;
	
	r2 = nl_get_r2( d1, d2, d3, d4 );

	qq = qi * qj;

	fs = nl_scalar_force_CDLJ( qq, epsfac, sig0, eps0, r2 );
	
	force += fs.x * d1;
	force += fs.y * d2;
	force += fs.z * d3;
	force += fs.w * d4;	
}