knlforce.br


/****************************************************************
* This file is part of the gpu acceleration library for gromacs.
* Author: V. Vishal
* Copyright (C) Pande Group, Stanford, 2006
*****************************************************************/

/* 
 * 3rd attempt at writing a force kernel that uses neighborlists.
 * Keeping things very simple this time - no unrolling at all.
 *
 * Starting with simple Coul-LJ
 * */

//Constant dielectric, LJ (normal forces)
kernel float4 nl_scalar_force_CDLJ( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2 ) {
	float4 invr, invrsig2, invrsig6;
	float4 f;

	invr = rsqrt( r2 ); // 1 or 2 flops?
	invrsig2 = invr * sig; //1 flop
	invrsig2 = invrsig2 * invrsig2; //1 flop
	invrsig6 = invrsig2 * invrsig2 * invrsig2; //2flops
	f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6; //4flops
	f += epsfac*qq*invr; //2 flops
	f *= invr*invr; //2 flops 
	return f; //Total ? flops
}

kernel float4 nl_get_r2( float3 d1, float3 d2, float3 d3, float3 d4 ) {
	float4 r2;
	r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //5*4 flops
	return r2;
}

kernel void knbforce_nl(
		float AtomStrWidth,  //Width of atom position stream
		float epsfac,
		iter float2 wpos<>,  //pixel position of output (i-atom)
		float4 posq[][],     //coordinates and charges
		float3 inforce<>,
		float4 nlist0<>,     //nbor indices
		float4 sig0<>,        //LJ sigmas
		float4 eps0<>,        //LJ epsilons
		//We'll add more nlists here to reduce number of kernel calls
		out float3 force<>   //output forces
		)
{
	float2 jind;
	float4 ind_tmp1, ind_tmp2;
	float4 qj;
	float3 jpos1, jpos2, jpos3, jpos4;
	float4 r2, fs, qq;
	float3 d1, d2, d3, d4;
	float3 ipos;
	float qi;
	
	//Since neighbors will not be adjacent in memory
	//We may want to stagger compute and etch, but 
	//with fxc, we don't have control over scheduling anyway.
	//Will try hand scheduling the ps3 code later

	//etch i parameters
	ipos = posq[ wpos ].xyz;
	qi   = posq[ wpos ].w;

	//etch j parameters
	
	ind_tmp1 = floor( nlist0 / AtomStrWidth );
	ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth;

	force = inforce;
	
	//1st set of 4 neighbors
	jind.y = ind_tmp1.x;
	jind.x = ind_tmp2.x;
	jpos1  = posq[ jind ].xyz;
	qj.x   = posq[ jind ].w;
	
	jind.y = ind_tmp1.y;
	jind.x = ind_tmp2.y;
	jpos2  = posq[ jind ].xyz;
	qj.y   = posq[ jind ].w;
	
	jind.y = ind_tmp1.z;
	jind.x = ind_tmp2.z;
	jpos3  = posq[ jind ].xyz;
	qj.z   = posq[ jind ].w;

	
	jind.y = ind_tmp1.w;
	jind.x = ind_tmp2.w;
	jpos4  = posq[ jind ].xyz;
	qj.w   = posq[ jind ].w;


	d1 = ipos - jpos1;
	d2 = ipos - jpos2;
	d3 = ipos - jpos3;
	d4 = ipos - jpos4;
	
	r2 = nl_get_r2( d1, d2, d3, d4 );

	qq = qi * qj;

	fs = nl_scalar_force_CDLJ( qq, epsfac, sig0, eps0, r2 );
	
	force += fs.x * d1;
	force += fs.y * d2;
	force += fs.z * d3;
	force += fs.w * d4;	
}