/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/


//Ryckaert Bellman dihedrals, needed for Amber/OPLS ff's
//
//Input is a stream of quartets i, j, k, l and the output is
//four float3 streams fi, fj, fk, fl.
//If by any chance this kernel becomes the bottleneck, we will
//optimize, but for now, this is kept pretty simple.
//To keep things streaming, we have a stream of 6 parameters(a float4 and float2)
//for each dihedral.
kernel void kpdih( 
        float xstrwidth, //stream width for x
        float4 xq[][], //particle coordinates and charges
        float4 atoms<>, //ijkl quartets
        float4 parms<>, //parms = ( cp, phi0, mult, 0.0 )
        out float3 fi<>, //output forces for i, j, k, l
        out float3 fj<>,
        out float3 fk<>,
        out float3 fl<>
	  	) {
	
	float3 r_ij, r_kj, r_kl;
	float2 ai, aj, ak, al;
	float3 m, n;
	float sgnphi;
	float cosfac;
	float phi, ddphi, mdphi;
	float3 u, v, s;
	float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi;

	//Convert from linear indices to 2D indices into x
	//If this kernel is compute bound, we can do this 
	//conversion before-hand and feed in the 2D coordinates
	
	ai.y = floor( atoms.x / xstrwidth );
	ai.x = atoms.x - ai.y * xstrwidth;

	aj.y = floor( atoms.y / xstrwidth );
	aj.x = atoms.y - aj.y * xstrwidth;
	
	ak.y = floor( atoms.z / xstrwidth );
	ak.x = atoms.z - ak.y * xstrwidth;
	
	al.y = floor( atoms.w / xstrwidth );
	al.x = atoms.w - al.y * xstrwidth;
	
	
	r_ij = xq[ai].xyz - xq[aj].xyz; //3
	r_kj = xq[ak].xyz - xq[aj].xyz; //3
	r_kl = xq[ak].xyz - xq[al].xyz; //3
	
	m = cross( r_ij, r_kj ); //9
	n = cross( r_kj, r_kl ); //9

	msq = dot(m, m); //5
	nsq = dot(n, n); //5
	
	cos_phi = clamp( dot(m, n)/sqrt(msq*nsq), -1.0, 1.0 ); //8
	
	sgnphi = sign( dot( r_ij, n ) ); //5

	phi = sgnphi * acos( cos_phi ); //2

	mdphi = parms.z * phi - parms.y; //2
	ddphi = - parms.x * parms.z * sin( mdphi ); //3

	nrkj2 = dot( r_kj, r_kj ); //5
	nrkj  = sqrt( nrkj2 );     //1

	fi = -ddphi * nrkj / msq * m; //5
	fl =  ddphi * nrkj / nsq * n; //5
	
	u = dot( r_ij, r_kj ) / nrkj2 * fi; //9
	v = dot( r_kl, r_kj ) / nrkj2 * fl; //9
	s = u - v; //3
	
	fj = s - fi; //3
	fk = -(s + fl); //3

	//Total : 100 flops
}