/* -------------------------------------------------------------------------- * * OpenMM * * -------------------------------------------------------------------------- * * This is part of the OpenMM molecular simulation toolkit originating from * * Simbios, the NIH National Center for Physics-Based Simulation of * * Biological Structures at Stanford, funded under the NIH Roadmap for * * Medical Research, grant U54 GM072970. See https://simtk.org. * * * * Portions copyright (c) 2008 Stanford University and the Authors. * * Authors: Peter Eastman, Mark Friedrichs, Chris Bruns * * Contributors: * * * * Permission is hereby granted, free of charge, to any person obtaining a * * copy of this software and associated documentation files (the "Software"), * * to deal in the Software without restriction, including without limitation * * the rights to use, copy, modify, merge, publish, distribute, sublicense, * * and/or sell copies of the Software, and to permit persons to whom the * * Software is furnished to do so, subject to the following conditions: * * * * The above copyright notice and this permission notice shall be included in * * all copies or substantial portions of the Software. * * * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, * * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * * USE OR OTHER DEALINGS IN THE SOFTWARE. * * -------------------------------------------------------------------------- */ /* * 3rd attempt at writing a force kernel that uses neighborlists. * Keeping things very simple this time - no unrolling at all. * * Starting with simple Coul-LJ * */ //Constant dielectric, LJ (normal forces) kernel float4 nl_scalar_force_CDLJ( float4 qq, float epsfac, float4 sig, float4 eps, float4 r2 ) { float4 invr, invrsig2, invrsig6; float4 f; invr = rsqrt( r2 ); // 1 or 2 flops? invrsig2 = invr * sig; //1 flop invrsig2 = invrsig2 * invrsig2; //1 flop invrsig6 = invrsig2 * invrsig2 * invrsig2; //2flops f = eps * ( 12.0f*invrsig6 - 6.0f ) * invrsig6; //4flops f += epsfac*qq*invr; //2 flops f *= invr*invr; //2 flops return f; //Total ? flops } kernel float4 nl_get_r2( float3 d1, float3 d2, float3 d3, float3 d4 ) { float4 r2; r2 = float4( dot(d1, d1), dot( d2, d2 ), dot( d3, d3 ), dot( d4, d4 ) ); //5*4 flops return r2; } kernel void knbforce_nl( float AtomStrWidth, //Width of atom position stream float epsfac, iter float2 wpos<>, //pixel position of output (i-atom) float4 posq[][], //coordinates and charges float3 inforce<>, float4 nlist0<>, //nbor indices float4 sig0<>, //LJ sigmas float4 eps0<>, //LJ epsilons //We'll add more nlists here to reduce number of kernel calls out float3 force<> //output forces ) { float2 jind; float4 ind_tmp1, ind_tmp2; float4 qj; float3 jpos1, jpos2, jpos3, jpos4; float4 r2, fs, qq; float3 d1, d2, d3, d4; float3 ipos; float qi; //Since neighbors will not be adjacent in memory //We may want to stagger compute and etch, but //with fxc, we don't have control over scheduling anyway. //Will try hand scheduling the ps3 code later //etch i parameters ipos = posq[ wpos ].xyz; qi = posq[ wpos ].w; //etch j parameters ind_tmp1 = floor( nlist0 / AtomStrWidth ); ind_tmp2 = nlist0 - ind_tmp1 * AtomStrWidth; force = inforce; //1st set of 4 neighbors jind.y = ind_tmp1.x; jind.x = ind_tmp2.x; jpos1 = posq[ jind ].xyz; qj.x = posq[ jind ].w; jind.y = ind_tmp1.y; jind.x = ind_tmp2.y; jpos2 = posq[ jind ].xyz; qj.y = posq[ jind ].w; jind.y = ind_tmp1.z; jind.x = ind_tmp2.z; jpos3 = posq[ jind ].xyz; qj.z = posq[ jind ].w; jind.y = ind_tmp1.w; jind.x = ind_tmp2.w; jpos4 = posq[ jind ].xyz; qj.w = posq[ jind ].w; d1 = ipos - jpos1; d2 = ipos - jpos2; d3 = ipos - jpos3; d4 = ipos - jpos4; r2 = nl_get_r2( d1, d2, d3, d4 ); qq = qi * qj; fs = nl_scalar_force_CDLJ( qq, epsfac, sig0, eps0, r2 ); force += fs.x * d1; force += fs.y * d2; force += fs.z * d3; force += fs.w * d4; }