/**************************************************************** * This file is part of the gpu acceleration library for gromacs. * Author: V. Vishal * Copyright (C) Pande Group, Stanford, 2006 *****************************************************************/ /* * 3rd attempt at writing a force kernel that uses neighborlists. * Keeping things very simple this time - no unrolling at all. * * Starting with simple Coul-LJ * */ //Constant dielectric, LJ (normal forces) kernel float4 nl_scalar_force_CDLJ( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2 ) { float4 invr, invrsig2, invrsig6; float4 f; invr = rsqrt( r2 ); // 1 or 2 flops? invrsig2 = invr * sig; //1 flop invrsig2 = invrsig2 * invrsig2; //1 flop invrsig6 = invrsig2 * invrsig2 * invrsig2; //2flops f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6; //4flops f += epsfac*qq*invr; //2 flops f *= invr*invr; //2 flops return f; //Total ? flops } kernel float4 nl_get_r2( float3 d1, float3 d2, float3 d3, float3 d4 ) { float4 r2; r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //5*4 flops return r2; } kernel void knbforce_nl( float AtomStrWidth, //Width of atom position stream float epsfac, iter float2 wpos<>, //pixel position of output (i-atom) float4 posq[][], //coordinates and charges float3 inforce<>, float4 nlist0<>, //nbor indices float4 sig0<>, //LJ sigmas float4 eps0<>, //LJ epsilons //We'll add more nlists here to reduce number of kernel calls out float3 force<> //output forces ) { float2 jind; float4 ind_tmp1, ind_tmp2; float4 qj; float3 jpos1, jpos2, jpos3, jpos4; float4 r2, fs, qq; float3 d1, d2, d3, d4; float3 ipos; float qi; //Since neighbors will not be adjacent in memory //We may want to stagger compute and etch, but //with fxc, we don't have control over scheduling anyway. //Will try hand scheduling the ps3 code later //etch i parameters ipos = posq[ wpos ].xyz; qi = posq[ wpos ].w; //etch j parameters ind_tmp1 = floor( nlist0 / AtomStrWidth ); ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth; force = inforce; //1st set of 4 neighbors jind.y = ind_tmp1.x; jind.x = ind_tmp2.x; jpos1 = posq[ jind ].xyz; qj.x = posq[ jind ].w; jind.y = ind_tmp1.y; jind.x = ind_tmp2.y; jpos2 = posq[ jind ].xyz; qj.y = posq[ jind ].w; jind.y = ind_tmp1.z; jind.x = ind_tmp2.z; jpos3 = posq[ jind ].xyz; qj.z = posq[ jind ].w; jind.y = ind_tmp1.w; jind.x = ind_tmp2.w; jpos4 = posq[ jind ].xyz; qj.w = posq[ jind ].w; d1 = ipos - jpos1; d2 = ipos - jpos2; d3 = ipos - jpos3; d4 = ipos - jpos4; r2 = nl_get_r2( d1, d2, d3, d4 ); qq = qi * qj; fs = nl_scalar_force_CDLJ( qq, epsfac, sig0, eps0, r2 ); force += fs.x * d1; force += fs.y * d2; force += fs.z * d3; force += fs.w * d4; }