/**************************************************************** * This file is part of the gpu acceleration library for gromacs. * Author: V. Vishal * Copyright (C) Pande Group, Stanford, 2006 *****************************************************************/ //Ryckaert Bellman dihedrals, needed for Amber/OPLS ff's // //Input is a stream of quartets i, j, k, l and the output is //four float3 streams fi, fj, fk, fl. //If by any chance this kernel becomes the bottleneck, we will //optimize, but for now, this is kept pretty simple. //To keep things streaming, we have a stream of 6 parameters(a float4 and float2) //for each dihedral. kernel void krbdih( float xstrwidth, //stream width for x float4 xq[][], //particle coordinates and charges float4 atoms<>, //ijkl quartets float4 parm03<>, //params 0-3 float2 parm45<>, //params 4 and 5 out float3 fi<>, //output forces for i, j, k, l out float3 fj<>, out float3 fk<>, out float3 fl<> ) { float3 r_ij, r_kj, r_kl; float2 ai, aj, ak, al; float3 m, n; float sgnphi; float cosfac; float ddphi; float3 u, v, s; float nrkj, nrkj2, msq, nsq, cos_phi, sin_phi; //Convert from linear indices to 2D indices into x //If this kernel is compute bound, we can do this //conversion before-hand and feed in the 2D coordinates ai.y = floor( atoms.x / xstrwidth ); ai.x = atoms.x - ai.y * xstrwidth; aj.y = floor( atoms.y / xstrwidth ); aj.x = atoms.y - aj.y * xstrwidth; ak.y = floor( atoms.z / xstrwidth ); ak.x = atoms.z - ak.y * xstrwidth; al.y = floor( atoms.w / xstrwidth ); al.x = atoms.w - al.y * xstrwidth; r_ij = xq[ai].xyz - xq[aj].xyz; //3 r_kj = xq[ak].xyz - xq[aj].xyz; //3 r_kl = xq[ak].xyz - xq[al].xyz; //3 m = cross( r_ij, r_kj ); //9 n = cross( r_kj, r_kl ); //9 msq = dot(m, m); //5 nsq = dot(n, n); //5 cos_phi = dot(m, n)/sqrt(msq*nsq); //8 (sqrt=1) //Switching to "polymer convention" //See gromacs code cos_phi = -cos_phi; sgnphi = sign( dot(r_ij, n) ); //5 sin_phi = -sgnphi*sqrt( clamp( 1.0 - cos_phi * cos_phi, 0.0, 1.0) ); //3 //ddphi is basically sum_{i=1}^5 i parm_i cosphi^{i-1} //This might not be the best way to use the //4-way mads, but for now we'll let fxc figure it //out. //If we precompute some ratios of the parameters //we can use the 4-way mads better ddphi = 5.0 * parm45.y; ddphi = 4.0 * parm45.x + ddphi * cos_phi; ddphi = 3.0 * parm03.w + ddphi * cos_phi; ddphi = 2.0 * parm03.z + ddphi * cos_phi; ddphi = parm03.y + ddphi * cos_phi; ddphi = -ddphi * sin_phi; //13 flops total for ddphi nrkj2 = dot( r_kj, r_kj ); //5 nrkj = sqrt( nrkj2 ); //1 fi = -ddphi * nrkj / msq * m; //5 fl = ddphi * nrkj / nsq * n; //5 u = dot( r_ij, r_kj ) / nrkj2 * fi; //9 v = dot( r_kl, r_kj ) / nrkj2 * fl; //9 s = u - v; //3 fj = s - fi; //3 fk = -(s + fl); //3 //Total flops: 109 per rb torsion. }